Skip to content

Commit

Permalink
Destination-S3: Migrate to Bulk Load CDK (#50857)
Browse files Browse the repository at this point in the history
  • Loading branch information
johnny-schmidt authored Jan 3, 2025
1 parent 9b717fc commit 346c23f
Show file tree
Hide file tree
Showing 52 changed files with 191 additions and 2,043 deletions.
2 changes: 0 additions & 2 deletions airbyte-integrations/connectors/destination-s3-v2/README.md

This file was deleted.

48 changes: 0 additions & 48 deletions airbyte-integrations/connectors/destination-s3-v2/build.gradle

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion airbyte-integrations/connectors/destination-s3-v2/icon.svg

This file was deleted.

124 changes: 0 additions & 124 deletions airbyte-integrations/connectors/destination-s3-v2/metadata.yaml

This file was deleted.

28 changes: 1 addition & 27 deletions airbyte-integrations/connectors/destination-s3/README.md
Original file line number Diff line number Diff line change
@@ -1,28 +1,2 @@
# S3 Test Configuration
# S3 V2 (Bulk CDK) Destination

In order to test the S3 destination, you need an AWS account (or alternative S3 account).

## Community Contributor

As a community contributor, you will need access to AWS to run the integration tests.

- Create an S3 bucket for testing.
- Get your `access_key_id` and `secret_access_key` that can read and write to the above bucket.
- if you leave `access_key_id` and `secret_access_key` blank, the authentication will rely on the instance profile authentication
- Paste the bucket and key information into the config files under [`./sample_secrets`](./sample_secrets).
- Rename the directory from `sample_secrets` to `secrets`.
- Feel free to modify the config files with different settings in the acceptance test file (e.g. `S3CsvDestinationAcceptanceTest.java`, method `getFormatConfig`), as long as they follow the schema defined in [spec.json](src/main/resources/spec.json).

## Airbyte Employee

- Access the `destination s3 creds` secrets on Last Pass, and put it in `sample_secrets/config.json`.
- Rename the directory from `sample_secrets` to `secrets`.

## Add New Output Format

- Add a new enum in `S3Format`.
- Modify `spec.json` to specify the configuration of this new format.
- Update `S3FormatConfigs` to be able to construct a config for this new format.
- Create a new package under `io.airbyte.integrations.destination.s3`.
- Implement a new `DestinationFileWriter`. The implementation can extend `BaseS3Writer`.
- Write an acceptance test for the new output format. The test can extend `S3DestinationAcceptanceTest`.
60 changes: 20 additions & 40 deletions airbyte-integrations/connectors/destination-s3/build.gradle
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
plugins {
id 'application'
id 'airbyte-java-connector'
id 'airbyte-bulk-connector'
}

airbyteJavaConnector {
cdkVersionRequired = '0.48.1'
features = ['db-destinations', 's3-destinations']
useLocalCdk = false
airbyteBulkConnector {
core = 'load'
toolkits = ['load-s3']
cdk = 'local'
}

airbyteJavaConnector.addCdkDependencies()

application {
mainClass = 'io.airbyte.integrations.destination.s3.S3DestinationRunner'
applicationDefaultJvmArgs = ['-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0']
mainClass = 'io.airbyte.integrations.destination.s3_v2.S3V2Destination'

// uncomment and replace to run locally
//applicationDefaultJvmArgs = ['-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0', '--add-opens', 'java.base/java.lang=ALL-UNNAMED']
applicationDefaultJvmArgs = [
'-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0'
'-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0',
// Uncomment to run locally:
// '--add-opens', 'java.base/java.lang=ALL-UNNAMED'
// Uncomment to enable remote profiling:
// '-XX:NativeMemoryTracking=detail',
// '-Djava.rmi.server.hostname=localhost',
Expand All @@ -27,42 +24,25 @@ application {
// '-Dcom.sun.management.jmxremote.rmi.port=6000',
// '-Dcom.sun.management.jmxremote.local.only=false',
// '-Dcom.sun.management.jmxremote.authenticate=false',
// '-Dcom.sun.management.jmxremote.ssl=false',
// '-Dcom.sun.management.jmxremote.ssl=false'
]

// Uncomment and replace to run locally
//applicationDefaultJvmArgs = ['-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0', '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', '--add-opens', 'java.base/sun.security.action=ALL-UNNAMED', '--add-opens', 'java.base/java.lang=ALL-UNNAMED']
}

// uncomment to run locally
// Uncomment to run locally
//run {
// standardInput = System.in
//}

dependencies {
// temporary dependencies so that we can continue running the legacy test suite.
// eventually we should remove those tests + rely solely on the bulk CDK tests.
integrationTestLegacyImplementation testFixtures(project(":airbyte-cdk:java:airbyte-cdk:airbyte-cdk-s3-destinations"))

// csv
implementation 'com.amazonaws:aws-java-sdk-s3:1.12.772'
implementation 'com.amazonaws:aws-java-sdk-iam:1.12.772'
implementation 'org.apache.commons:commons-csv:1.11.0'
implementation 'com.github.alexmojaki:s3-stream-upload:2.2.4'
// TODO this should come from the cdk plugin + respect the cdk version
integrationTestImplementation testFixtures(project(":airbyte-cdk:bulk:toolkits:bulk-cdk-toolkit-load-avro"))
}

// parquet
implementation ('org.apache.hadoop:hadoop-common:3.4.0') {
exclude group: 'org.slf4j', module: 'slf4j-log4j12'
exclude group: 'org.slf4j', module: 'slf4j-reload4j'
exclude group: 'org.apache.zookeeper'
}
implementation ('org.apache.hadoop:hadoop-aws:3.4.0') { exclude group: 'org.slf4j', module: 'slf4j-log4j12'}
implementation ('org.apache.hadoop:hadoop-mapreduce-client-core:3.4.0') {
exclude group: 'org.slf4j', module: 'slf4j-log4j12'
exclude group: 'org.slf4j', module: 'slf4j-reload4j'
exclude group: 'org.apache.zookeeper'
}
implementation ('org.apache.parquet:parquet-avro:1.14.2') { exclude group: 'org.slf4j', module: 'slf4j-log4j12'}
implementation ('com.github.airbytehq:json-avro-converter:1.1.3') { exclude group: 'ch.qos.logback', module: 'logback-classic'}
implementation group: 'com.hadoop.gplcompression', name: 'hadoop-lzo', version: '0.4.20'
testImplementation 'org.apache.commons:commons-lang3:3.17.0'
testImplementation 'org.xerial.snappy:snappy-java:1.1.10.7'
testImplementation "org.mockito:mockito-inline:5.2.0"
// Exclude conflicting log4j-over-slf4j dependency
configurations.all {
exclude group: "org.slf4j", module: "slf4j-reload4j"
}
Loading

0 comments on commit 346c23f

Please sign in to comment.