diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml index 395bea36..430b8641 100644 --- a/.github/workflows/python_pytest.yml +++ b/.github/workflows/python_pytest.yml @@ -70,9 +70,11 @@ jobs: # Job-specific step(s): - name: Run Pytest (No-Creds) env: - # Force this to an invalid value to ensure tests that no creds are required are run. - GCP_GSM_CREDENTIALS: "no-creds" - run: poetry run pytest -m "not requires_creds" + # Force this to a blank value. + GCP_GSM_CREDENTIALS: "" + run: > + poetry run pytest -m + "not requires_creds and not linting and not super_slow" pytest: name: Pytest (All, Python ${{ matrix.python-version }}, ${{ matrix.os }}) @@ -114,4 +116,5 @@ jobs: - name: Run Pytest env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: poetry run pytest -m "not linting" + run: > + poetry run pytest -m "not linting and not super_slow" diff --git a/.github/workflows/test-pr-command.yml b/.github/workflows/test-pr-command.yml index 7937311b..7a71a9e4 100644 --- a/.github/workflows/test-pr-command.yml +++ b/.github/workflows/test-pr-command.yml @@ -82,7 +82,7 @@ jobs: - name: Run Pytest env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: poetry run pytest + run: poetry run pytest -m "not super_slow" log-success-comment: name: Append 'Success' Comment diff --git a/.gitignore b/.gitignore index e15da207..aac1cc1e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,12 @@ -# Directories and subdirectories called 'secrets' or '.secrets' +# Packaged docs +docs/*.zip + +# Misc +.DS_Store + +# Directories and subdirectories called '.secrets' and the top-level '/secrets' directory .secrets -secrets +/secrets # Virtual Environments .venv diff --git a/README.md b/README.md index 7ed76cbf..001623d7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,16 @@ # PyAirbyte -PyAirbyte brings the power of Airbyte to every Python developer. PyAirbyte provides a set of utilities to use Airbyte connectors in Python. It is meant to be used in situations where setting up an Airbyte server or cloud account is not possible or desirable. +PyAirbyte brings the power of Airbyte to every Python developer. PyAirbyte provides a set of utilities to use Airbyte connectors in Python. + +[![PyPI version](https://badge.fury.io/py/airbyte.svg)](https://badge.fury.io/py/airbyte) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/airbyte)](https://pypi.org/project/airbyte/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/airbyte)](https://pypi.org/project/airbyte/) + +[![PyPI - Wheel](https://img.shields.io/pypi/wheel/airbyte)](https://pypi.org/project/airbyte/) + +[![PyPI - Implementation](https://img.shields.io/pypi/implementation/airbyte)](https://pypi.org/project/airbyte/) +[![PyPI - Format](https://img.shields.io/pypi/format/airbyte)](https://pypi.org/project/airbyte/) +[![Star on GitHub](https://img.shields.io/github/stars/airbytehq/pyairbyte.svg?style=social&label=★%20on%20GitHub)](https://github.com/airbytehq/pyairbyte) - [Getting Started](#getting-started) - [Secrets Management](#secrets-management) @@ -29,24 +39,34 @@ PyAirbyte can auto-import secrets from the following sources: 3. [Google Colab secrets](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75). 4. Manual entry via [`getpass`](https://docs.python.org/3.9/library/getpass.html). -_Note: Additional secret store options may be supported in the future. 
[More info here.](https://github.com/airbytehq/airbyte-lib-private-beta/discussions/5)_ +_Note: You can also build your own secret manager by subclassing the `CustomSecretManager` implementation. For more information, see the `airbyte.secrets.CustomSecretManager` class definiton._ ### Retrieving Secrets ```python -from airbyte import get_secret, SecretSource +import airbyte as ab -source = get_source("source-github") +source = ab.get_source("source-github") source.set_config( "credentials": { - "personal_access_token": get_secret("GITHUB_PERSONAL_ACCESS_TOKEN"), + "personal_access_token": ab.get_secret("GITHUB_PERSONAL_ACCESS_TOKEN"), } ) ``` -The `get_secret()` function accepts an optional `source` argument of enum type `SecretSource`. If omitted or set to `SecretSource.ANY`, PyAirbyte will search all available secrets sources. If `source` is set to a specific source, then only that source will be checked. If a list of `SecretSource` entries is passed, then the sources will be checked using the provided ordering. +By default, PyAirbyte will search all available secrets sources. The `get_secret()` function also accepts an optional `sources` argument of specific source names (`SecretSourceEnum`) and/or secret manager objects to check. -By default, PyAirbyte will prompt the user for any requested secrets that are not provided via other secret managers. You can disable this prompt by passing `prompt=False` to `get_secret()`. +By default, PyAirbyte will prompt the user for any requested secrets that are not provided via other secret managers. You can disable this prompt by passing `allow_prompt=False` to `get_secret()`. + +For more information, see the `airbyte.secrets` module. + +### Secrets Auto-Discovery + +If you have a secret matching an expected name, PyAirbyte will automatically use it. For example, if you have a secret named `GITHUB_PERSONAL_ACCESS_TOKEN`, PyAirbyte will automatically use it when configuring the GitHub source. + +The naming convention for secrets is as `{CONNECTOR_NAME}_{PROPERTY_NAME}`, for instance `SNOWFLAKE_PASSWORD` and `BIGQUERY_CREDENTIALS_PATH`. + +PyAirbyte will also auto-discover secrets for interop with hosted Airbyte: `AIRBYTE_CLOUD_API_URL`, `AIRBYTE_CLOUD_API_KEY`, etc. ## Connector compatibility @@ -120,7 +140,6 @@ Yes. Just pick the cache type matching the destination - like SnowflakeCache for **6. Can PyAirbyte import a connector from a local directory that has python project files, or does it have to be pip install** Yes, PyAirbyte can use any local install that has a CLI - and will automatically find connectors by name if they are on PATH. - ## Changelog and Release Notes For a version history and list of all changes, please see our [GitHub Releases](https://github.com/airbytehq/PyAirbyte/releases) page. diff --git a/airbyte/__init__.py b/airbyte/__init__.py index 7c65a199..d4155c17 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. """PyAirbyte brings Airbyte ELT to every Python developer. .. 
include:: ../README.md @@ -7,14 +8,14 @@ """ from __future__ import annotations -from airbyte import caches, datasets, documents, exceptions, results, secrets, sources +from airbyte import caches, cloud, datasets, documents, exceptions, results, secrets, sources from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache from airbyte.caches.util import get_default_cache, new_local_cache from airbyte.datasets import CachedDataset from airbyte.records import StreamRecord from airbyte.results import ReadResult -from airbyte.secrets import SecretSource, get_secret +from airbyte.secrets import SecretSourceEnum, get_secret from airbyte.sources import registry from airbyte.sources.base import Source from airbyte.sources.registry import get_available_connectors @@ -23,6 +24,7 @@ __all__ = [ # Modules + "cloud", "caches", "datasets", "documents", @@ -43,7 +45,7 @@ "CachedDataset", "DuckDBCache", "ReadResult", - "SecretSource", + "SecretSourceEnum", "Source", "StreamRecord", ] diff --git a/airbyte/_executor.py b/airbyte/_executor.py index 139d3ef5..3792fdb7 100644 --- a/airbyte/_executor.py +++ b/airbyte/_executor.py @@ -47,7 +47,7 @@ def __init__( The 'name' param is required if 'metadata' is None. """ if not name and not metadata: - raise exc.AirbyteLibInternalError(message="Either name or metadata must be provided.") + raise exc.PyAirbyteInternalError(message="Either name or metadata must be provided.") self.name: str = name or cast(ConnectorMetadata, metadata).name # metadata is not None here self.metadata: ConnectorMetadata | None = metadata @@ -270,7 +270,7 @@ def _get_installed_version( if not self.interpreter_path.exists(): # No point in trying to detect the version if the interpreter does not exist if raise_on_error: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector's virtual environment interpreter could not be found.", context={ "interpreter_path": self.interpreter_path, diff --git a/airbyte/_processors/base.py b/airbyte/_processors/base.py index 6c1d35a7..84234dcf 100644 --- a/airbyte/_processors/base.py +++ b/airbyte/_processors/base.py @@ -60,7 +60,7 @@ def __init__( self._expected_streams: set[str] | None = None self.cache: CacheBase = cache if not isinstance(self.cache, CacheBase): - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message=( f"Expected config class of type 'CacheBase'. " f"Instead received type '{type(self.cache).__name__}'." 
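The exception rename running through these hunks (`AirbyteLibInputError` → `PyAirbyteInputError`, `AirbyteLibInternalError` → `PyAirbyteInternalError`) also changes what downstream code catches. A minimal sketch of caller code against the renamed classes, using the cache-name validation added later in this diff (`new_local_cache` rejects names containing spaces):

```python
import airbyte as ab
from airbyte import exceptions as exc

try:
    # new_local_cache() validates the name and raises the renamed input error on spaces.
    cache = ab.new_local_cache("my cache")
except exc.PyAirbyteInputError as err:  # was exc.AirbyteLibInputError before this change
    print(f"Invalid cache name: {err}")
```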
@@ -92,7 +92,7 @@ def register_source( ) -> None: """Register the source name and catalog.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) self._catalog_manager.register_source( @@ -226,7 +226,7 @@ def _finalize_state_messages( ) -> None: """Handle state messages by passing them to the catalog manager.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) if state_messages and self._source_name: @@ -251,7 +251,7 @@ def _get_stream_config( ) -> ConfiguredAirbyteStream: """Return the definition of the given stream.""" if not self._catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) diff --git a/airbyte/_processors/file/base.py b/airbyte/_processors/file/base.py index 52267bc1..5c644371 100644 --- a/airbyte/_processors/file/base.py +++ b/airbyte/_processors/file/base.py @@ -162,7 +162,7 @@ def process_record_message( batch_handle = self._new_batch(stream_name=stream_name) if batch_handle.open_file_writer is None: - raise exc.AirbyteLibInternalError(message="Expected open file writer.") + raise exc.PyAirbyteInternalError(message="Expected open file writer.") self._write_record_dict( record_dict=StreamRecord.from_record_message( diff --git a/airbyte/_processors/sql/base.py b/airbyte/_processors/sql/base.py index bb67a696..db2bb55c 100644 --- a/airbyte/_processors/sql/base.py +++ b/airbyte/_processors/sql/base.py @@ -296,7 +296,7 @@ def _get_table_by_name( query. To ignore the cache and force a refresh, set 'force_refresh' to True. """ if force_refresh and shallow_okay: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Cannot force refresh and use shallow query at the same time." ) @@ -453,7 +453,7 @@ def _ensure_compatible_table_schema( ] if missing_columns: if raise_on_error: - raise exc.AirbyteLibCacheTableValidationError( + raise exc.PyAirbyteCacheTableValidationError( violation="Cache table is missing expected columns.", context={ "stream_column_names": stream_column_names, @@ -666,7 +666,7 @@ def _write_files_to_new_table( # Pandas will auto-create the table if it doesn't exist, which we don't want. if not self._table_exists(temp_table_name): - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Table does not exist after creation.", context={ "temp_table_name": temp_table_name, @@ -727,7 +727,7 @@ def _write_temp_table_to_final_table( has_pks: bool = bool(self._get_primary_keys(stream_name)) has_incremental_key: bool = bool(self._get_incremental_key(stream_name)) if write_strategy == WriteStrategy.MERGE and not has_pks: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cannot use merge strategy on a stream with no primary keys.", context={ "stream_name": stream_name, @@ -783,7 +783,7 @@ def _write_temp_table_to_final_table( ) return - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Write strategy is not supported.", context={ "write_strategy": write_strategy, @@ -843,9 +843,9 @@ def _swap_temp_table_with_final_table( Databases that do not support this syntax can override this method. 
""" if final_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'final_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'final_table_name' cannot be None.") if temp_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'temp_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'temp_table_name' cannot be None.") _ = stream_name deletion_name = f"{final_table_name}_deleteme" @@ -909,7 +909,7 @@ def _get_column_by_name(self, table: str | Table, column_name: str) -> Column: # Try to get the column in a case-insensitive manner return next(col for col in table.c if col.name.lower() == column_name.lower()) except StopIteration: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Could not find matching column.", context={ "table": table, diff --git a/airbyte/_processors/sql/bigquery.py b/airbyte/_processors/sql/bigquery.py index 6c4ec4ad..7c821ce0 100644 --- a/airbyte/_processors/sql/bigquery.py +++ b/airbyte/_processors/sql/bigquery.py @@ -175,7 +175,7 @@ def _table_exists( return False except ValueError as ex: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid project name or dataset name.", context={ "table_id": table_id, @@ -225,9 +225,9 @@ def _swap_temp_table_with_final_table( ALTER TABLE my_schema.my_old_table_name RENAME TO my_new_table_name; """ if final_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'final_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'final_table_name' cannot be None.") if temp_table_name is None: - raise exc.AirbyteLibInternalError(message="Arg 'temp_table_name' cannot be None.") + raise exc.PyAirbyteInternalError(message="Arg 'temp_table_name' cannot be None.") _ = stream_name deletion_name = f"{final_table_name}_deleteme" diff --git a/airbyte/_util/api_duck_types.py b/airbyte/_util/api_duck_types.py new file mode 100644 index 00000000..fe0a8c20 --- /dev/null +++ b/airbyte/_util/api_duck_types.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""A set of duck-typed classes for working with the Airbyte API.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol + + +if TYPE_CHECKING: + import requests + + +class AirbyteApiResponseDuckType(Protocol): + """Used for duck-typing various Airbyte API responses.""" + + content_type: str + r"""HTTP response content type for this operation""" + status_code: int + r"""HTTP response status code for this operation""" + raw_response: requests.Response + r"""Raw HTTP response; suitable for custom response parsing""" diff --git a/airbyte/_util/api_util.py b/airbyte/_util/api_util.py new file mode 100644 index 00000000..7b0edd30 --- /dev/null +++ b/airbyte/_util/api_util.py @@ -0,0 +1,551 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""These internal functions are used to interact with the Airbyte API (module named `airbyte`). + +In order to insulate users from breaking changes and to avoid general confusion around naming +and design inconsistencies, we do not expose these functions or other Airbyte API classes within +PyAirbyte. Classes and functions from the Airbyte API external library should always be wrapped in +PyAirbyte classes - unless there's a very compelling reason to surface these models intentionally. + +Similarly, modules outside of this file should try to avoid interfacing with `airbyte_api` library +directly. 
This will ensure a single source of truth when mapping between the `airbyte` and +`airbyte_api` libraries. +""" + +from __future__ import annotations + +import json +from typing import Any + +import airbyte_api +from airbyte_api.models import operations as api_operations +from airbyte_api.models import shared as api_models +from airbyte_api.models.shared.jobcreaterequest import JobCreateRequest, JobTypeEnum + +from airbyte.exceptions import ( + AirbyteConnectionSyncError, + AirbyteError, + AirbyteMissingResourceError, + AirbyteMultipleResourcesError, +) + + +JOB_WAIT_INTERVAL_SECS = 2.0 +JOB_WAIT_TIMEOUT_SECS_DEFAULT = 60 * 60 # 1 hour +CLOUD_API_ROOT = "https://api.airbyte.com/v1" + +# Helper functions + + +def status_ok(status_code: int) -> bool: + """Check if a status code is OK.""" + return status_code >= 200 and status_code < 300 # noqa: PLR2004 # allow inline magic numbers + + +def get_airbyte_server_instance( + *, + api_key: str, + api_root: str, +) -> airbyte_api.Airbyte: + """Get an Airbyte instance.""" + return airbyte_api.Airbyte( + security=api_models.Security( + bearer_auth=api_key, + ), + server_url=api_root, + ) + + +# Get workspace + + +def get_workspace( + workspace_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.WorkspaceResponse: + """Get a connection.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.workspaces.get_workspace( + api_operations.GetWorkspaceRequest( + workspace_id=workspace_id, + ), + ) + if status_ok(response.status_code) and response.workspace_response: + return response.workspace_response + + raise AirbyteMissingResourceError( + resource_type="workspace", + context={ + "workspace_id": workspace_id, + "response": response, + }, + ) + + +# List, get, and run connections + + +def list_connections( + workspace_id: str, + *, + api_root: str, + api_key: str, +) -> list[api_models.ConnectionResponse]: + """Get a connection.""" + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.list_connections( + api_operations.ListConnectionsRequest()( + workspace_ids=[workspace_id], + ), + ) + + if status_ok(response.status_code) and response.connections_response: + return response.connections_response.data + + raise AirbyteError( + context={ + "workspace_id": workspace_id, + "response": response, + } + ) + + +def get_connection( + workspace_id: str, + connection_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.ConnectionResponse: + """Get a connection.""" + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.get_connection( + api_operations.GetConnectionRequest( + connection_id=connection_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise AirbyteMissingResourceError(connection_id, "connection", response.text) + + +def run_connection( + workspace_id: str, + connection_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.ConnectionResponse: + """Get a connection. + + If block is True, this will block until the connection is finished running. + + If raise_on_failure is True, this will raise an exception if the connection fails. 
+ """ + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.jobs.create_job( + JobCreateRequest( + connection_id=connection_id, + job_type=JobTypeEnum.SYNC, + ), + ) + if status_ok(response.status_code) and response.job_response: + return response.job_response + + raise AirbyteConnectionSyncError( + connection_id=connection_id, + context={ + "workspace_id": workspace_id, + }, + response=response, + ) + + +# Get job info (logs) + + +def get_job_logs( + workspace_id: str, + connection_id: str, + limit: int = 20, + *, + api_root: str, + api_key: str, +) -> list[api_models.JobResponse]: + """Get a job's logs.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.ListJobsResponse = airbyte_instance.jobs.list_jobs( + api_operations.ListJobsRequest( + workspace_ids=[workspace_id], + connection_id=connection_id, + limit=limit, + ), + ) + if status_ok(response.status_code) and response.jobs_response: + return response.jobs_response.data + + raise AirbyteMissingResourceError( + response=response, + resource_type="job", + context={ + "workspace_id": workspace_id, + "connection_id": connection_id, + }, + ) + + +def get_job_info( + job_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.JobResponse: + """Get a job.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.jobs.get_job( + api_operations.GetJobRequest( + job_id=job_id, + ), + ) + if status_ok(response.status_code) and response.job_response: + return response.job_response + + raise AirbyteMissingResourceError(job_id, "job", response.text) + + +# Create, get, and delete sources + + +def create_source( + name: str, + *, + workspace_id: str, + config: dict[str, Any], + api_root: str, + api_key: str, +) -> api_models.SourceResponse: + """Get a connection.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.CreateSourceResponse = airbyte_instance.sources.create_source( + api_models.SourceCreateRequest( + name=name, + workspace_id=workspace_id, + configuration=config, # TODO: wrap in a proper configuration object + definition_id=None, # Not used alternative to config.sourceType. 
+ secret_id=None, # For OAuth, not yet supported + ), + ) + if status_ok(response.status_code) and response.source_response: + return response.source_response + + raise AirbyteError( + message="Could not create source.", + response=response, + ) + + +def get_source( + source_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.SourceResponse: + """Get a connection.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.get_source( + api_operations.GetSourceRequest( + source_id=source_id, + ), + ) + if status_ok(response.status_code) and response.connection_response: + return response.connection_response + + raise AirbyteMissingResourceError(source_id, "source", response.text) + + +def delete_source( + source_id: str, + *, + api_root: str, + api_key: str, + workspace_id: str | None = None, +) -> None: + """Delete a source.""" + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.sources.delete_source( + api_operations.DeleteSourceRequest( + source_id=source_id, + ), + ) + if not status_ok(response.status_code): + raise AirbyteError( + context={ + "source_id": source_id, + "response": response, + }, + ) + + +# Create, get, and delete destinations + + +def create_destination( + name: str, + *, + workspace_id: str, + config: dict[str, Any], + api_root: str, + api_key: str, +) -> api_models.DestinationResponse: + """Get a connection.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response: api_operations.CreateDestinationResponse = ( + airbyte_instance.destinations.create_destination( + api_models.DestinationCreateRequest( + name=name, + workspace_id=workspace_id, + configuration=config, # TODO: wrap in a proper configuration object + ), + ) + ) + if status_ok(response.status_code) and response.destination_response: + return response.destination_response + + raise AirbyteError( + message="Could not create destination.", + response=response, + ) + + +def get_destination( + destination_id: str, + *, + api_root: str, + api_key: str, +) -> api_models.DestinationResponse: + """Get a connection.""" + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.destinations.get_destination( + api_operations.GetDestinationRequest( + destination_id=destination_id, + ), + ) + if status_ok(response.status_code): + # TODO: This is a temporary workaround to resolve an issue where + # the destination API response is of the wrong type. 
+ raw_response: dict[str, Any] = json.loads(response.raw_response.text) + raw_configuration: dict[str, Any] = raw_response["configuration"] + destination_type = raw_response.get("destinationType") + if destination_type == "snowflake": + response.destination_response.configuration = api_models.DestinationSnowflake.from_dict( + raw_configuration, + ) + if destination_type == "bigquery": + response.destination_response.configuration = api_models.DestinationBigquery.from_dict( + raw_configuration, + ) + if destination_type == "postgres": + response.destination_response.configuration = api_models.DestinationPostgres.from_dict( + raw_configuration, + ) + if destination_type == "duckdb": + response.destination_response.configuration = api_models.DestinationDuckdb.from_dict( + raw_configuration, + ) + + return response.destination_response + + raise AirbyteMissingResourceError(destination_id, "destination", response.text) + + +def delete_destination( + destination_id: str, + *, + api_root: str, + api_key: str, + workspace_id: str | None = None, +) -> None: + """Delete a destination.""" + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.destinations.delete_destination( + api_operations.DeleteDestinationRequest( + destination_id=destination_id, + ), + ) + if not status_ok(response.status_code): + raise AirbyteError( + context={ + "destination_id": destination_id, + "response": response, + }, + ) + + +# Create and delete connections + + +def create_connection( + name: str, + *, + source_id: str, + destination_id: str, + api_root: str, + api_key: str, + workspace_id: str | None = None, + prefix: str, + selected_stream_names: list[str], +) -> api_models.ConnectionResponse: + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + stream_configurations: list[api_models.StreamConfiguration] = [] + if selected_stream_names: + for stream_name in selected_stream_names: + stream_configuration = api_models.StreamConfiguration( + name=stream_name, + ) + stream_configurations.append(stream_configuration) + + stream_configurations = api_models.StreamConfigurations(stream_configurations) + response = airbyte_instance.connections.create_connection( + api_models.ConnectionCreateRequest( + name=name, + source_id=source_id, + destination_id=destination_id, + configurations=stream_configurations, + prefix=prefix, + ), + ) + if not status_ok(response.status_code): + raise AirbyteError( + context={ + "source_id": source_id, + "destination_id": destination_id, + "response": response, + }, + ) + + return response.connection_response + + +def get_connection_by_name( + workspace_id: str, + connection_name: str, + *, + api_root: str, + api_key: str, +) -> api_models.ConnectionResponse: + """Get a connection.""" + connections = list_connections( + workspace_id=workspace_id, + api_key=api_key, + api_root=api_root, + ) + found: list[api_models.ConnectionResponse] = [ + connection for connection in connections if connection.name == connection_name + ] + if len(found) == 0: + raise AirbyteMissingResourceError( + connection_name, "connection", f"Workspace: {workspace_id}" + ) + + if len(found) > 1: + raise AirbyteMultipleResourcesError( + resource_type="connection", + resource_name_or_id=connection_name, + context={ + "workspace_id": workspace_id, + "multiples": found, + }, + ) + + return found[0] + + +def delete_connection( + connection_id: str, + 
api_root: str, + workspace_id: str, + api_key: str, +) -> None: + _ = workspace_id # Not used (yet) + airbyte_instance = get_airbyte_server_instance( + api_key=api_key, + api_root=api_root, + ) + response = airbyte_instance.connections.delete_connection( + api_operations.DeleteConnectionRequest( + connection_id=connection_id, + ), + ) + if not status_ok(response.status_code): + raise AirbyteError( + context={ + "connection_id": connection_id, + "response": response, + }, + ) + + +# Not yet implemented + + +def check_source( + source_id: str, + *, + api_root: str, + api_key: str, + workspace_id: str | None = None, +) -> api_models.SourceCheckResponse: + """Check a source. + + # TODO: Need to use legacy Configuration API for this: + # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/sources/check_connection + """ + _ = source_id, workspace_id, api_root, api_key + raise NotImplementedError diff --git a/airbyte/_util/google_secrets.py b/airbyte/_util/google_secrets.py deleted file mode 100644 index 7ff426dc..00000000 --- a/airbyte/_util/google_secrets.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Helpers for accessing Google secrets.""" - -from __future__ import annotations - -import json -import os - -from google.cloud import secretmanager - - -def get_gcp_secret( - project_name: str, - secret_name: str, -) -> str: - """Try to get a GCP secret from the environment, or raise an error. - - We assume that the Google service account credentials file contents are stored in the - environment variable GCP_GSM_CREDENTIALS. If this environment variable is not set, we raise an - error. Otherwise, we use the Google Secret Manager API to fetch the secret with the given name. - """ - if "GCP_GSM_CREDENTIALS" not in os.environ: - raise EnvironmentError( # noqa: TRY003, UP024 - "GCP_GSM_CREDENTIALS env variable not set, can't fetch secrets. Make sure they are set " - "up as described: " - "https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/ci_credentials/" - "README.md#get-gsm-access" - ) - - # load secrets from GSM using the GCP_GSM_CREDENTIALS env variable - secret_client = secretmanager.SecretManagerServiceClient.from_service_account_info( - json.loads(os.environ["GCP_GSM_CREDENTIALS"]) - ) - return secret_client.access_secret_version( - name=f"projects/{project_name}/secrets/{secret_name}/versions/latest" - ).payload.data.decode("UTF-8") - - -def get_gcp_secret_json( - project_name: str, - secret_name: str, -) -> dict: - """Get a JSON GCP secret and return as a dict. - - We assume that the Google service account credentials file contents are stored in the - environment variable GCP_GSM_CREDENTIALS. If this environment variable is not set, we raise an - error. Otherwise, we use the Google Secret Manager API to fetch the secret with the given name. - """ - return json.loads(get_gcp_secret(secret_name, project_name)) diff --git a/airbyte/_util/meta.py b/airbyte/_util/meta.py index 40283697..190634ce 100644 --- a/airbyte/_util/meta.py +++ b/airbyte/_util/meta.py @@ -53,6 +53,20 @@ def is_colab() -> bool: return bool(get_colab_release_version()) +@lru_cache +def is_interactive() -> bool: + if is_colab() or is_jupyter(): + return True + + if is_ci(): + return False + + if sys.__stdin__.isatty() and sys.__stdout__.isatty(): + return True + + return False + + @lru_cache def is_jupyter() -> bool: """Return True if running in a Jupyter notebook or qtconsole. 
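The new `is_interactive()` helper resolves in a fixed order: notebook environments (Colab/Jupyter) count as interactive, CI never does, and otherwise a TTY check on stdin/stdout decides. A small illustrative sketch of how a caller might gate prompting on it — `prompt_for_value()` is a hypothetical helper, not part of this diff:

```python
from airbyte._util import meta


def prompt_for_value(name: str) -> str | None:
    """Ask the user for a value, but only when a human is likely present."""
    if meta.is_interactive():
        return input(f"Enter a value for {name}: ")
    return None  # e.g. in CI, fall back to env vars or fail fast
```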
diff --git a/airbyte/_util/temp_files.py b/airbyte/_util/temp_files.py new file mode 100644 index 00000000..a1a56532 --- /dev/null +++ b/airbyte/_util/temp_files.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import json +import tempfile +from contextlib import contextmanager, suppress +from pathlib import Path +from typing import TYPE_CHECKING, Any + + +if TYPE_CHECKING: + from collections.abc import Generator + + +@contextmanager +def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any, None]: + """Write the given contents to temporary files and yield the file paths as strings.""" + temp_files: list[Any] = [] + try: + for content in files_contents: + temp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) + temp_file.write( + json.dumps(content) if isinstance(content, dict) else content, + ) + temp_file.flush() + temp_files.append(temp_file) + yield [file.name for file in temp_files] + finally: + for temp_file in temp_files: + with suppress(Exception): + Path(temp_file.name).unlink() diff --git a/airbyte/caches/__init__.py b/airbyte/caches/__init__.py index c565a976..1b1df33f 100644 --- a/airbyte/caches/__init__.py +++ b/airbyte/caches/__init__.py @@ -4,6 +4,7 @@ from airbyte.caches import bigquery, duckdb, motherduck, postgres, snowflake, util from airbyte.caches.base import CacheBase +from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.postgres import PostgresCache @@ -17,6 +18,7 @@ "get_default_cache", "new_local_cache", # Classes + "BigQueryCache", "CacheBase", "DuckDBCache", "MotherDuckCache", diff --git a/airbyte/caches/_catalog_manager.py b/airbyte/caches/_catalog_manager.py index 6306b26d..ea26d521 100644 --- a/airbyte/caches/_catalog_manager.py +++ b/airbyte/caches/_catalog_manager.py @@ -85,10 +85,10 @@ def source_catalog(self) -> ConfiguredAirbyteCatalog: """Return the source catalog. Raises: - AirbyteLibInternalError: If the source catalog is not set. + PyAirbyteInternalError: If the source catalog is not set. 
""" if not self._source_catalog: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Source catalog should be initialized but is not.", ) @@ -231,7 +231,7 @@ def get_stream_config( ) -> ConfiguredAirbyteStream: """Return the column definitions for the given stream.""" if not self.source_catalog: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Cannot get stream JSON schema without a catalog.", ) @@ -249,7 +249,7 @@ def get_stream_config( ) if len(matching_streams) > 1: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Multiple streams found with same name.", context={ "stream_name": stream_name, diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 7f5f34f6..4af67266 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -44,6 +44,10 @@ class CacheBase(BaseModel): table_suffix: str = "" """A suffix to add to all table names.""" + _deployed_api_root: Optional[str] = PrivateAttr(default=None) + _deployed_workspace_id: Optional[str] = PrivateAttr(default=None) + _deployed_destination_id: Optional[str] = PrivateAttr(default=None) + _sql_processor_class: type[SqlProcessorBase] = PrivateAttr() _sql_processor: Optional[SqlProcessorBase] = PrivateAttr(default=None) @@ -108,7 +112,7 @@ def _catalog_manager( self, ) -> CatalogManager: if not self._has_catalog_manager: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Catalog manager should exist but does not.", ) diff --git a/airbyte/caches/duckdb.py b/airbyte/caches/duckdb.py index 1bbaf550..d5514b3b 100644 --- a/airbyte/caches/duckdb.py +++ b/airbyte/caches/duckdb.py @@ -8,9 +8,10 @@ from airbyte.caches import DuckDBCache cache = DuckDBCache( - db_path="/path/to/my/database.duckdb", + db_path="/path/to/my/duckdb-file", schema_name="myschema", ) +``` """ from __future__ import annotations @@ -41,7 +42,7 @@ class DuckDBCache(CacheBase): """Normally db_path is a Path object. The database name will be inferred from the file name. For example, given a `db_path` of - `/path/to/my/my_db.duckdb`, the database name is `my_db`. + `/path/to/my/duckdb-file`, the database name is `my_db`. 
""" schema_name: str = "main" diff --git a/airbyte/caches/motherduck.py b/airbyte/caches/motherduck.py index 99b599cc..4f538334 100644 --- a/airbyte/caches/motherduck.py +++ b/airbyte/caches/motherduck.py @@ -20,6 +20,7 @@ from airbyte._processors.sql.motherduck import MotherDuckSqlProcessor from airbyte.caches.duckdb import DuckDBCache +from airbyte.secrets import SecretString class MotherDuckCache(DuckDBCache): @@ -27,14 +28,14 @@ class MotherDuckCache(DuckDBCache): db_path: str = Field(default="md:") database: str - api_key: str + api_key: SecretString _sql_processor_class = MotherDuckSqlProcessor @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return ( + return SecretString( f"duckdb:///md:{self.database}?motherduck_token={self.api_key}" # f"&schema={self.schema_name}" # TODO: Debug why this doesn't work ) diff --git a/airbyte/caches/postgres.py b/airbyte/caches/postgres.py index 5d4c33e2..c82869ba 100644 --- a/airbyte/caches/postgres.py +++ b/airbyte/caches/postgres.py @@ -23,6 +23,7 @@ from airbyte._processors.sql.postgres import PostgresSqlProcessor from airbyte.caches.base import CacheBase +from airbyte.secrets import SecretString class PostgresCache(CacheBase): @@ -34,15 +35,17 @@ class PostgresCache(CacheBase): host: str port: int username: str - password: str + password: SecretString database: str _sql_processor_class = PostgresSqlProcessor @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" + return SecretString( + f"postgresql+psycopg2://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" + ) @overrides def get_database_name(self) -> str: diff --git a/airbyte/caches/snowflake.py b/airbyte/caches/snowflake.py index f0e55f3b..4819b919 100644 --- a/airbyte/caches/snowflake.py +++ b/airbyte/caches/snowflake.py @@ -27,6 +27,7 @@ from airbyte._processors.sql.base import RecordDedupeMode from airbyte._processors.sql.snowflake import SnowflakeSqlProcessor from airbyte.caches.base import CacheBase +from airbyte.secrets import SecretString class SnowflakeCache(CacheBase): @@ -34,7 +35,7 @@ class SnowflakeCache(CacheBase): account: str username: str - password: str + password: SecretString warehouse: str database: str role: str @@ -47,9 +48,9 @@ class SnowflakeCache(CacheBase): # schema_name: str @overrides - def get_sql_alchemy_url(self) -> str: + def get_sql_alchemy_url(self) -> SecretString: """Return the SQLAlchemy URL to use.""" - return str( + return SecretString( URL( account=self.account, user=self.username, diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index 8e3a6e81..d1cf2128 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -42,13 +42,13 @@ def new_local_cache( """ if cache_name: if " " in cache_name: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cache name cannot contain spaces.", input_value=cache_name, ) if not cache_name.replace("_", "").isalnum(): - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Cache name can only contain alphanumeric characters and underscores.", input_value=cache_name, ) diff --git a/airbyte/cloud/__init__.py b/airbyte/cloud/__init__.py new file mode 100644 index 00000000..cd58e5de --- /dev/null +++ b/airbyte/cloud/__init__.py @@ -0,0 +1,47 @@ +# Copyright (c) 
2024 Airbyte, Inc., all rights reserved. +"""PyAirbyte classes and methods for interacting with the Airbyte Cloud API. + +You can use this module to interact with Airbyte Cloud, OSS, and Enterprise. + +Usage example: + +```python +import airbyte as ab +from airbyte import cloud + +workspace = cloud.CloudWorkspace( + workspace_id="123", + api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"), +) + +sync_result = workspace.run_sync( + connection_id="456", +) +print(sync_result.get_job_status()) +``` + + +ℹ️ **Experimental Features** + +You can use the `airbyte.cloud.experimental` module to access experimental features. +These additional features are subject to change and may not be available in all environments. +""" # noqa: RUF002 # Allow emoji + +from __future__ import annotations + +from airbyte.cloud import connections, sync_results, workspaces +from airbyte.cloud.connections import CloudConnection +from airbyte.cloud.sync_results import SyncResult +from airbyte.cloud.workspaces import CloudWorkspace + + +__all__ = [ + # Submodules + "workspaces", + "connections", + "sync_results", + # Classes + "CloudWorkspace", + "CloudConnection", + "SyncResult", +] diff --git a/airbyte/cloud/_destination_util.py b/airbyte/cloud/_destination_util.py new file mode 100644 index 00000000..08366398 --- /dev/null +++ b/airbyte/cloud/_destination_util.py @@ -0,0 +1,214 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Cloud destinations for Airbyte.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from airbyte_api.models.shared import ( + DestinationBigquery, + DestinationDuckdb, + DestinationPostgres, + DestinationSnowflake, + StandardInserts, + UsernameAndPassword, +) + +from airbyte.caches import ( + BigQueryCache, + DuckDBCache, + MotherDuckCache, + PostgresCache, + SnowflakeCache, +) +from airbyte.secrets import get_secret + + +if TYPE_CHECKING: + from collections.abc import Callable + + from airbyte.caches.base import CacheBase + + +SNOWFLAKE_PASSWORD_SECRET_NAME = "SNOWFLAKE_PASSWORD" + + +def get_destination_config_from_cache( + cache: CacheBase, +) -> dict[str, str]: + """Get the destination configuration from the cache.""" + conversion_fn_map: dict[str, Callable[[Any], dict[str, str]]] = { + "BigQueryCache": get_bigquery_destination_config, + "DuckDBCache": get_duckdb_destination_config, + "MotherDuckCache": get_motherduck_destination_config, + "PostgresCache": get_postgres_destination_config, + "SnowflakeCache": get_snowflake_destination_config, + } + cache_class_name = cache.__class__.__name__ + if cache_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert cache type to destination configuration. Cache type not supported. 
", + f"Supported cache types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[cache_class_name] + return conversion_fn(cache) + + +def get_duckdb_destination_config( + cache: DuckDBCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + ).to_dict() + + +def get_motherduck_destination_config( + cache: MotherDuckCache, +) -> dict[str, str]: + """Get the destination configuration from the DuckDB cache.""" + return DestinationDuckdb( + destination_path=cache.db_path, + schema=cache.schema_name, + motherduck_api_key=cache.api_key, + ).to_dict() + + +def get_postgres_destination_config( + cache: PostgresCache, +) -> dict[str, str]: + """Get the destination configuration from the Postgres cache.""" + return DestinationPostgres( + database=cache.database, + host=cache.host, + password=cache.password, + port=cache.port, + schema=cache.schema_name, + username=cache.username, + ).to_dict() + + +def get_snowflake_destination_config( + cache: SnowflakeCache, +) -> dict[str, str]: + """Get the destination configuration from the Snowflake cache.""" + return DestinationSnowflake( + host=f"{cache.account}.snowflakecomputing.com", + database=cache.get_database_name().upper(), + schema=cache.schema_name.upper(), + warehouse=cache.warehouse, + role=cache.role, + username=cache.username, + credentials=UsernameAndPassword( + password=cache.password, + ), + ).to_dict() + + +def get_bigquery_destination_config( + cache: BigQueryCache, +) -> dict[str, str]: + """Get the destination configuration from the BigQuery cache.""" + credentials_json: str | None = ( + Path(cache.credentials_path).read_text() if cache.credentials_path else None + ) + destination = DestinationBigquery( + project_id=cache.project_name, + dataset_id=cache.dataset_name, + dataset_location="US", + credentials_json=credentials_json, + loading_method=StandardInserts, + ) + return destination.to_dict() + + +def create_bigquery_cache( + destination_configuration: DestinationBigquery, +) -> BigQueryCache: + """Create a new BigQuery cache from the destination configuration.""" + credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") + return BigQueryCache( + project_name=destination_configuration.project_id, + dataset_name=destination_configuration.dataset_id, + schema_name=destination_configuration.schema, + credentials_path=credentials_path, + ) + + +def create_duckdb_cache( + destination_configuration: DestinationDuckdb, +) -> DuckDBCache: + """Create a new DuckDB cache from the destination configuration.""" + return DuckDBCache( + db_path=destination_configuration.destination_path, + schema_name=destination_configuration.schema, + ) + + +def create_motherduck_cache( + destination_configuration: DestinationDuckdb, +) -> MotherDuckCache: + """Create a new DuckDB cache from the destination configuration.""" + return MotherDuckCache( + database=destination_configuration.destination_path, + schema_name=destination_configuration.schema, + api_key=destination_configuration.motherduck_api_key, + ) + + +def create_postgres_cache( + destination_configuration: DestinationPostgres, +) -> PostgresCache: + """Create a new Postgres cache from the destination configuration.""" + port: int = int(destination_configuration.port) if "port" in destination_configuration else 5432 + return PostgresCache( + database=destination_configuration.database, + host=destination_configuration.host, + 
password=destination_configuration.password, + port=port, + schema_name=destination_configuration.schema, + username=destination_configuration.username, + ) + + +def create_snowflake_cache( + destination_configuration: DestinationSnowflake, + password_secret_name: str = SNOWFLAKE_PASSWORD_SECRET_NAME, +) -> SnowflakeCache: + """Create a new Snowflake cache from the destination configuration.""" + return SnowflakeCache( + account=destination_configuration.host.split(".snowflakecomputing")[0], + database=destination_configuration.database, + schema_name=destination_configuration.schema, + warehouse=destination_configuration.warehouse, + role=destination_configuration.role, + username=destination_configuration.username, + password=get_secret(password_secret_name), + ) + + +def create_cache_from_destination_config( + destination_configuration: DestinationBigquery + | DestinationDuckdb + | DestinationPostgres + | DestinationSnowflake, +) -> CacheBase: + """Create a new cache from the destination.""" + conversion_fn_map: dict[str, Callable[[dict[str, str]], CacheBase]] = { + "DestinationBigquery": create_bigquery_cache, + "DestinationDuckdb": create_duckdb_cache, + "DestinationPostgres": create_postgres_cache, + "DestinationSnowflake": create_snowflake_cache, + } + destination_class_name = type(destination_configuration).__name__ + if destination_class_name not in conversion_fn_map: + raise ValueError( # noqa: TRY003 + "Cannot convert destination configuration to cache. Destination type not supported. ", + f"Supported destination types: {list(conversion_fn_map.keys())}", + ) + + conversion_fn = conversion_fn_map[destination_class_name] + return conversion_fn(destination_configuration) diff --git a/airbyte/cloud/connections.py b/airbyte/cloud/connections.py new file mode 100644 index 00000000..52003264 --- /dev/null +++ b/airbyte/cloud/connections.py @@ -0,0 +1,208 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""Cloud Connections.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from airbyte._util import api_util +from airbyte.cloud.sync_results import SyncResult + + +if TYPE_CHECKING: + from airbyte_api.models.shared.connectionresponse import ConnectionResponse + from airbyte_api.models.shared.jobresponse import JobResponse + + from airbyte.cloud.workspaces import CloudWorkspace + + +class CloudConnection: + """A connection is an extract-load (EL) pairing of a source and destination.""" + + def __init__( + self, + workspace: CloudWorkspace, + connection_id: str, + source: str | None = None, + destination: str | None = None, + ) -> None: + self.connection_id = connection_id + """The ID of the connection.""" + + self.workspace = workspace + """The workspace that the connection belongs to.""" + + self._source_id = source + """The ID of the source.""" + + self._destination_id = destination + """The ID of the destination.""" + + self._connection_info: ConnectionResponse | None = None + + def _fetch_connection_info(self) -> ConnectionResponse: + """Populate the connection with data from the API.""" + return api_util.get_connection( + workspace_id=self.workspace.workspace_id, + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + ) + + # Properties + + @property + def source_id(self) -> str: + """The ID of the source.""" + if not self._source_id: + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + self._source_id = self._connection_info.source_id + + return cast(str, self._source_id) + + @property + def destination_id(self) -> str: + """The ID of the destination.""" + if not self._destination_id: + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + self._destination_id = self._connection_info.source_id + + return cast(str, self._destination_id) + + @property + def stream_names(self) -> list[str]: + """The stream names.""" + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + return [stream.name for stream in self._connection_info.configurations.streams] + + @property + def table_prefix(self) -> str: + """The table prefix.""" + if not self._connection_info: + self._connection_info = self._fetch_connection_info() + + return self._connection_info.prefix + + @property + def connection_url(self) -> str | None: + return f"{self.workspace.workspace_url}/connections/{self.connection_id}" + + @property + def job_history_url(self) -> str | None: + return f"{self.connection_url}/job-history" + + # Run Sync + + def run_sync( + self, + *, + wait: bool = True, + wait_timeout: int = 300, + ) -> SyncResult: + """Run a sync.""" + connection_response = api_util.run_connection( + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + workspace_id=self.workspace.workspace_id, + ) + sync_result = SyncResult( + workspace=self.workspace, + connection=self, + job_id=connection_response.job_id, + ) + + if wait: + sync_result.wait_for_completion( + wait_timeout=wait_timeout, + raise_failure=True, + raise_timeout=True, + ) + + return sync_result + + # Logs + + def get_previous_sync_logs( + self, + *, + limit: int = 10, + ) -> list[SyncResult]: + """Get the previous sync logs for a connection.""" + sync_logs: list[JobResponse] = api_util.get_job_logs( + connection_id=self.connection_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + 
workspace_id=self.workspace.workspace_id, + limit=limit, + ) + return [ + SyncResult( + workspace=self.workspace, + connection=self, + job_id=sync_log.job_id, + _latest_job_info=sync_log, + ) + for sync_log in sync_logs + ] + + def get_sync_result( + self, + job_id: str | None = None, + ) -> SyncResult | None: + """Get the sync result for the connection. + + If `job_id` is not provided, the most recent sync job will be used. + + Returns `None` if job_id is omitted and no previous jobs are found. + """ + if job_id is None: + # Get the most recent sync job + results = self.get_previous_sync_logs( + limit=1, + ) + if results: + return results[0] + + return None + + # Get the sync job by ID (lazy loaded) + return SyncResult( + workspace=self.workspace, + connection=self, + job_id=job_id, + ) + + # Deletions + + def _permanently_delete( + self, + *, + delete_source: bool = False, + delete_destination: bool = False, + ) -> None: + """Delete the connection. + + Args: + delete_source: Whether to also delete the source. + delete_destination: Whether to also delete the destination. + """ + self.workspace._permanently_delete_connection( # noqa: SLF001 # Non-public API (for now) + connection=self + ) + + if delete_source: + self.workspace._permanently_delete_source( # noqa: SLF001 # Non-public API (for now) + source=self.source_id + ) + + if delete_destination: + self.workspace._permanently_delete_destination( # noqa: SLF001 # Non-public API + destination=self.destination_id, + ) diff --git a/airbyte/cloud/experimental.py b/airbyte/cloud/experimental.py new file mode 100644 index 00000000..fbc3ace4 --- /dev/null +++ b/airbyte/cloud/experimental.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Experimental features for interacting with the Airbyte Cloud API. + +You can use this module to access experimental features in Airbyte Cloud, OSS, and Enterprise. These +features are subject to change and may not be available in all environments. **Future versions of +PyAirbyte may remove or change these features without notice.** + +To use this module, replace an import like this: + +```python +from airbyte.cloud import CloudConnection, CloudWorkspace +``` + +with an import like this: + +```python +from airbyte.cloud.experimental import CloudConnection, CloudWorkspace +``` + +You can toggle between the stable and experimental versions of these classes by changing the import +path. This allows you to test new features without requiring substantial changes to your codebase. + +""" +# ruff: noqa: SLF001 # This file accesses private members of other classes. + +from __future__ import annotations + +import warnings + +from airbyte.cloud.connections import CloudConnection as Stable_CloudConnection +from airbyte.cloud.workspaces import CloudWorkspace as Stable_CloudWorkspace + + +# This module is not imported anywhere by default, so this warning should only print if the user +# explicitly imports it. 
+warnings.warn( + message="The `airbyte.cloud.experimental` module is experimental and may change in the future.", + category=FutureWarning, + stacklevel=2, +) + + +class CloudWorkspace(Stable_CloudWorkspace): + __doc__ = ( + f"Experimental implementation of `.CloudWorkspace`.\n\n{Stable_CloudConnection.__doc__}" + ) + deploy_connection = Stable_CloudWorkspace._deploy_connection + deploy_source = Stable_CloudWorkspace._deploy_source + deploy_cache_as_destination = Stable_CloudWorkspace._deploy_cache_as_destination + permanently_delete_connection = Stable_CloudWorkspace._permanently_delete_connection + permanently_delete_source = Stable_CloudWorkspace._permanently_delete_source + permanently_delete_destination = Stable_CloudWorkspace._permanently_delete_destination + + +class CloudConnection(Stable_CloudConnection): + __doc__ = ( + f"Experimental implementation of `.CloudConnection`.\n\n{Stable_CloudConnection.__doc__}" + ) + permanently_delete = Stable_CloudConnection._permanently_delete diff --git a/airbyte/cloud/sync_results.py b/airbyte/cloud/sync_results.py new file mode 100644 index 00000000..72ba7547 --- /dev/null +++ b/airbyte/cloud/sync_results.py @@ -0,0 +1,248 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Sync results for Airbyte Cloud workspaces.""" + +from __future__ import annotations + +import time +from collections.abc import Iterator, Mapping +from dataclasses import dataclass +from datetime import datetime +from typing import TYPE_CHECKING, Any, final + +from airbyte_api.models.shared import ConnectionResponse, JobResponse, JobStatusEnum + +from airbyte._util import api_util +from airbyte.cloud._destination_util import create_cache_from_destination_config +from airbyte.datasets import CachedDataset +from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError + + +DEFAULT_SYNC_TIMEOUT_SECONDS = 30 * 60 # 30 minutes + + +if TYPE_CHECKING: + import sqlalchemy + + from airbyte.caches.base import CacheBase + from airbyte.cloud.connections import CloudConnection + from airbyte.cloud.workspaces import CloudWorkspace + + +FINAL_STATUSES = { + JobStatusEnum.SUCCEEDED, + JobStatusEnum.FAILED, + JobStatusEnum.CANCELLED, +} +FAILED_STATUSES = { + JobStatusEnum.FAILED, + JobStatusEnum.CANCELLED, +} + + +@dataclass +class SyncResult: + """The result of a sync operation.""" + + workspace: CloudWorkspace + connection: CloudConnection + job_id: str + table_name_prefix: str = "" + table_name_suffix: str = "" + _latest_job_info: JobResponse | None = None + _connection_response: ConnectionResponse | None = None + _cache: CacheBase | None = None + + @property + def job_url(self) -> str: + """Return the URL of the sync job.""" + return f"{self.connection.job_history_url}/{self.job_id}" + + def _get_connection_info(self, *, force_refresh: bool = False) -> ConnectionResponse: + """Return connection info for the sync job.""" + if self._connection_response and not force_refresh: + return self._connection_response + + self._connection_response = api_util.get_connection( + workspace_id=self.workspace.workspace_id, + api_root=self.workspace.api_root, + api_key=self.workspace.api_key, + connection_id=self.connection.connection_id, + ) + return self._connection_response + + def _get_destination_configuration(self, *, force_refresh: bool = False) -> dict[str, Any]: + """Return the destination configuration for the sync job.""" + connection_info: ConnectionResponse = self._get_connection_info(force_refresh=force_refresh) + destination_response = 
api_util.get_destination(
+            destination_id=connection_info.destination_id,
+            api_root=self.workspace.api_root,
+            api_key=self.workspace.api_key,
+        )
+        return destination_response.configuration
+
+    def is_job_complete(self) -> bool:
+        """Check if the sync job is complete."""
+        return self.get_job_status() in FINAL_STATUSES
+
+    def get_job_status(self) -> JobStatusEnum:
+        """Return the latest status of the sync job."""
+        return self._fetch_latest_job_info().status
+
+    def _fetch_latest_job_info(self) -> JobResponse:
+        """Return the job info for the sync job."""
+        if self._latest_job_info and self._latest_job_info.status in FINAL_STATUSES:
+            return self._latest_job_info
+
+        self._latest_job_info = api_util.get_job_info(
+            job_id=self.job_id,
+            api_root=self.workspace.api_root,
+            api_key=self.workspace.api_key,
+        )
+        return self._latest_job_info
+
+    @property
+    def bytes_synced(self) -> int:
+        """Return the number of bytes synced."""
+        return self._fetch_latest_job_info().bytes_synced
+
+    @property
+    def records_synced(self) -> int:
+        """Return the number of records synced."""
+        return self._fetch_latest_job_info().rows_synced
+
+    @property
+    def start_time(self) -> datetime:
+        """Return the start time of the sync job in UTC."""
+        # Parse from ISO 8601 format:
+        return datetime.fromisoformat(self._fetch_latest_job_info().start_time)
+
+    def raise_failure_status(
+        self,
+        *,
+        refresh_status: bool = False,
+    ) -> None:
+        """Raise an exception if the sync job failed.
+
+        By default, this method will use the latest status available. If you want to refresh the
+        status before checking for failure, set `refresh_status=True`. If the job has failed, this
+        method will raise an `AirbyteConnectionSyncError`.
+
+        Otherwise, do nothing.
+        """
+        if not refresh_status and self._latest_job_info:
+            latest_status = self._latest_job_info.status
+        else:
+            latest_status = self.get_job_status()
+
+        if latest_status in FAILED_STATUSES:
+            raise AirbyteConnectionSyncError(
+                workspace=self.workspace,
+                connection_id=self.connection.connection_id,
+                job_id=self.job_id,
+                job_status=self.get_job_status(),
+            )
+
+    def wait_for_completion(
+        self,
+        *,
+        wait_timeout: int = DEFAULT_SYNC_TIMEOUT_SECONDS,
+        raise_timeout: bool = True,
+        raise_failure: bool = False,
+    ) -> JobStatusEnum:
+        """Wait for a job to finish running."""
+        start_time = time.time()
+        while True:
+            latest_status = self.get_job_status()
+            if latest_status in FINAL_STATUSES:
+                if raise_failure:
+                    # No-op if the job succeeded or is still running:
+                    self.raise_failure_status()
+
+                return latest_status
+
+            if time.time() - start_time > wait_timeout:
+                if raise_timeout:
+                    raise AirbyteConnectionSyncTimeoutError(
+                        workspace=self.workspace,
+                        connection_id=self.connection.connection_id,
+                        job_id=self.job_id,
+                        job_status=latest_status,
+                        timeout=wait_timeout,
+                    )
+
+                return latest_status  # This will be a non-final status
+
+            time.sleep(api_util.JOB_WAIT_INTERVAL_SECS)
+
+    def get_sql_cache(self) -> CacheBase:
+        """Return a SQL Cache object for working with the data in a SQL-based destination."""
+        if self._cache:
+            return self._cache
+
+        destination_configuration: dict[str, Any] = self._get_destination_configuration()
+        self._cache = create_cache_from_destination_config(
+            destination_configuration=destination_configuration
+        )
+        return self._cache
+
+    def get_sql_engine(self) -> sqlalchemy.engine.Engine:
+        """Return a SQL Engine for querying a SQL-based destination."""
+        return self.get_sql_cache().get_sql_engine()
+
+    def get_sql_table_name(self, stream_name: str) -> str:
+        """Return the SQL table name of the named stream."""
+        return self.get_sql_cache().processor.get_sql_table_name(stream_name=stream_name)
+
+    def get_sql_table(
+        self,
+        stream_name: str,
+    ) -> sqlalchemy.Table:
+        """Return a SQLAlchemy table object for the named stream."""
+        return self.get_sql_cache().processor.get_sql_table(stream_name)
+
+    def get_dataset(self, stream_name: str) -> CachedDataset:
+        """Return the cached dataset for the named stream."""
+        return CachedDataset(self.get_sql_cache(), stream_name=stream_name)
+
+    def get_sql_database_name(self) -> str:
+        """Return the SQL database name."""
+        cache = self.get_sql_cache()
+        return cache.get_database_name()
+
+    def get_sql_schema_name(self) -> str:
+        """Return the SQL schema name."""
+        cache = self.get_sql_cache()
+        return cache.schema_name
+
+    @property
+    def stream_names(self) -> list[str]:
+        """Return the list of stream names."""
+        return self.connection.stream_names
+
+    @final
+    @property
+    def streams(
+        self,
+    ) -> SyncResultStreams:
+        """Return a mapping of stream names to cached datasets."""
+        return self.SyncResultStreams(self)
+
+    class SyncResultStreams(Mapping[str, CachedDataset]):
+        """A mapping of stream names to cached datasets."""
+
+        def __init__(
+            self,
+            parent: SyncResult,
+            /,
+        ) -> None:
+            self.parent: SyncResult = parent
+
+        def __getitem__(self, key: str) -> CachedDataset:
+            return self.parent.get_dataset(stream_name=key)
+
+        def __iter__(self) -> Iterator[str]:
+            """Iterate over the stream names."""
+            return iter(self.parent.stream_names)
+
+        def __len__(self) -> int:
+            return len(self.parent.stream_names)
diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py
new file mode 100644
index 00000000..d9c556c2
--- /dev/null
+++ b/airbyte/cloud/workspaces.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+"""PyAirbyte classes and methods for interacting with the Airbyte Cloud API.
+
+By overriding `api_root`, you can use this module to interact with self-managed Airbyte instances,
+both OSS and Enterprise.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from airbyte import exceptions as exc
+from airbyte._util.api_util import (
+    CLOUD_API_ROOT,
+    create_connection,
+    create_destination,
+    create_source,
+    delete_connection,
+    delete_destination,
+    delete_source,
+    get_workspace,
+)
+from airbyte.cloud._destination_util import get_destination_config_from_cache
+from airbyte.cloud.connections import CloudConnection
+from airbyte.cloud.sync_results import SyncResult
+from airbyte.sources.base import Source
+
+
+if TYPE_CHECKING:
+    from airbyte_api.models.shared.destinationresponse import DestinationResponse
+
+    from airbyte.caches.base import CacheBase
+
+
+@dataclass
+class CloudWorkspace:
+    """A remote workspace on the Airbyte Cloud.
+
+    By overriding `api_root`, you can use this class to interact with self-managed Airbyte
+    instances, both OSS and Enterprise.
+    """
+
+    workspace_id: str
+    api_key: str
+    api_root: str = CLOUD_API_ROOT
+
+    @property
+    def workspace_url(self) -> str | None:
+        return f"{self.api_root}/workspaces/{self.workspace_id}"
+
+    # Test connection and creds
+
+    def connect(self) -> None:
+        """Check that the workspace is reachable and raise an exception otherwise.
+
+        Note: It is not necessary to call this method before calling other operations. It
+        serves primarily as a simple check to ensure that the workspace is reachable
+        and credentials are correct.
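+
+        Usage sketch (the workspace ID below is a placeholder, and the secret name
+        `AIRBYTE_CLOUD_API_KEY` is assumed to be resolvable by `ab.get_secret()`):
+
+            import airbyte as ab
+            from airbyte.cloud import CloudWorkspace
+
+            workspace = CloudWorkspace(
+                workspace_id="...",
+                api_key=ab.get_secret("AIRBYTE_CLOUD_API_KEY"),
+            )
+            workspace.connect()  # Raises if unreachable or credentials are invalid.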
+ """ + _ = get_workspace( + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + ) + print(f"Successfully connected to workspace: {self.workspace_url}") + + # Deploy and delete sources + + # TODO: Make this a public API + def _deploy_source( + self, + source: Source, + ) -> str: + """Deploy a source to the workspace. + + Returns the newly deployed source ID. + """ + source_configuration = source.get_config().copy() + source_configuration["sourceType"] = source.name.replace("source-", "") + + deployed_source = create_source( + name=f"{source.name.replace('-', ' ').title()} (Deployed by PyAirbyte)", + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + config=source_configuration, + ) + + # Set the deployment Ids on the source object + source._deployed_api_root = self.api_root # noqa: SLF001 # Accessing nn-public API + source._deployed_workspace_id = self.workspace_id # noqa: SLF001 # Accessing nn-public API + source._deployed_source_id = deployed_source.source_id # noqa: SLF001 # Accessing nn-public API + + return deployed_source.source_id + + def _permanently_delete_source( + self, + source: str | Source, + ) -> None: + """Delete a source from the workspace. + + You can pass either the source ID `str` or a deployed `Source` object. + """ + if not isinstance(source, (str, Source)): + raise ValueError(f"Invalid source type: {type(source)}") # noqa: TRY004, TRY003 + + if isinstance(source, Source): + if not source._deployed_source_id: # noqa: SLF001 + raise ValueError("Source has not been deployed.") # noqa: TRY003 + + source_id = source._deployed_source_id # noqa: SLF001 + + elif isinstance(source, str): + source_id = source + + delete_source( + source_id=source_id, + api_root=self.api_root, + api_key=self.api_key, + ) + + # Deploy and delete destinations + + # TODO: Make this a public API + def _deploy_cache_as_destination( + self, + cache: CacheBase, + ) -> str: + """Deploy a cache to the workspace as a new destination. + + Returns the newly deployed destination ID. + """ + cache_type_name = cache.__class__.__name__.replace("Cache", "") + + deployed_destination: DestinationResponse = create_destination( + name=f"Destination {cache_type_name} (Deployed by PyAirbyte)", + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + config=get_destination_config_from_cache(cache), + ) + + # Set the deployment Ids on the source object + cache._deployed_api_root = self.api_root # noqa: SLF001 # Accessing nn-public API + cache._deployed_workspace_id = self.workspace_id # noqa: SLF001 # Accessing nn-public API + cache._deployed_destination_id = deployed_destination.destination_id # noqa: SLF001 # Accessing nn-public API + + return deployed_destination.destination_id + + def _permanently_delete_destination( + self, + *, + destination: str | None = None, + cache: CacheBase | None = None, + ) -> None: + """Delete a deployed destination from the workspace. + + You can pass either the `Cache` class or the deployed destination ID as a `str`. + """ + if destination is None and cache is None: + raise ValueError("You must provide either a destination ID or a cache object.") # noqa: TRY003 + if destination is not None and cache is not None: + raise ValueError( # noqa: TRY003 + "You must provide either a destination ID or a cache object, not both." 
+ ) + + if cache: + if not cache._deployed_destination_id: # noqa: SLF001 + raise ValueError("Cache has not been deployed.") # noqa: TRY003 + + destination = cache._deployed_destination_id # noqa: SLF001 + + if destination is None: + raise ValueError("No destination ID provided.") # noqa: TRY003 + + delete_destination( + destination_id=destination, + api_root=self.api_root, + api_key=self.api_key, + ) + + # Deploy and delete connections + + # TODO: Make this a public API + def _deploy_connection( + self, + source: Source | str, + cache: CacheBase | None = None, + destination: str | None = None, + table_prefix: str | None = None, + selected_streams: list[str] | None = None, + ) -> CloudConnection: + """Deploy a source and cache to the workspace as a new connection. + + Returns the newly deployed connection ID as a `str`. + + Args: + source (Source | str): The source to deploy. You can pass either an already deployed + source ID `str` or a PyAirbyte `Source` object. If you pass a `Source` object, + it will be deployed automatically. + cache (CacheBase, optional): The cache to deploy as a new destination. You can provide + `cache` or `destination`, but not both. + destination (str, optional): The destination ID to use. You can provide + `cache` or `destination`, but not both. + """ + # Resolve source ID + source_id: str + if isinstance(source, Source): + selected_streams = selected_streams or source.get_selected_streams() + if source._deployed_source_id: # noqa: SLF001 + source_id = source._deployed_source_id # noqa: SLF001 + else: + source_id = self._deploy_source(source) + else: + source_id = source + if not selected_streams: + raise exc.PyAirbyteInputError( + guidance="You must provide `selected_streams` when deploying a source ID." + ) + + # Resolve destination ID + destination_id: str + if destination: + destination_id = destination + elif cache: + table_prefix = table_prefix if table_prefix is not None else (cache.table_prefix or "") + if not cache._deployed_destination_id: # noqa: SLF001 + destination_id = self._deploy_cache_as_destination(cache) + else: + destination_id = cache._deployed_destination_id # noqa: SLF001 + else: + raise exc.PyAirbyteInputError( + guidance="You must provide either a destination ID or a cache object." + ) + + assert source_id is not None + assert destination_id is not None + + deployed_connection = create_connection( + name="Connection (Deployed by PyAirbyte)", + source_id=source_id, + destination_id=destination_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + selected_stream_names=selected_streams, + prefix=table_prefix or "", + ) + + if isinstance(source, Source): + source._deployed_api_root = self.api_root # noqa: SLF001 + source._deployed_workspace_id = self.workspace_id # noqa: SLF001 + source._deployed_source_id = source_id # noqa: SLF001 + if cache: + cache._deployed_api_root = self.api_root # noqa: SLF001 + cache._deployed_workspace_id = self.workspace_id # noqa: SLF001 + cache._deployed_destination_id = deployed_connection.destination_id # noqa: SLF001 + + return CloudConnection( + workspace=self, + connection_id=deployed_connection.connection_id, + source=deployed_connection.source_id, + destination=deployed_connection.destination_id, + ) + + def get_connection( + self, + connection_id: str, + ) -> CloudConnection: + """Get a connection by ID. + + This method does not fetch data from the API. It returns a `CloudConnection` object, + which will be loaded lazily as needed. 
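+
+        Usage sketch (the connection ID and stream name are placeholders; see
+        `CloudConnection.run_sync()` for sync options):
+
+            connection = workspace.get_connection(connection_id="...")
+            sync_result = connection.run_sync()
+            sync_result.raise_failure_status()
+            dataset = sync_result.get_dataset("users")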
+ """ + return CloudConnection( + workspace=self, + connection_id=connection_id, + ) + + def _permanently_delete_connection( + self, + connection: str | CloudConnection, + *, + delete_source: bool = False, + delete_destination: bool = False, + ) -> None: + """Delete a deployed connection from the workspace.""" + if connection is None: + raise ValueError("No connection ID provided.") # noqa: TRY003 + + if isinstance(connection, str): + connection = CloudConnection( + workspace=self, + connection_id=connection, + ) + + delete_connection( + connection_id=connection.connection_id, + api_root=self.api_root, + api_key=self.api_key, + workspace_id=self.workspace_id, + ) + if delete_source: + self._permanently_delete_source(source=connection.source_id) + + if delete_destination: + self._permanently_delete_destination(destination=connection.destination_id) + + # Run syncs + + def run_sync( + self, + connection_id: str, + *, + wait: bool = True, + wait_timeout: int = 300, + ) -> SyncResult: + """Run a sync on a deployed connection.""" + connection = CloudConnection( + workspace=self, + connection_id=connection_id, + ) + return connection.run_sync(wait=wait, wait_timeout=wait_timeout) + + # Get sync results and previous sync logs + + def get_sync_result( + self, + connection_id: str, + job_id: str | None = None, + ) -> SyncResult | None: + """Get the sync result for a connection job. + + If `job_id` is not provided, the most recent sync job will be used. + + Returns `None` if job_id is omitted and no previous jobs are found. + """ + connection = CloudConnection( + workspace=self, + connection_id=connection_id, + ) + if job_id is None: + results = self.get_previous_sync_logs( + connection_id=connection_id, + limit=1, + ) + if results: + return results[0] + + return None + connection = CloudConnection( + workspace=self, + connection_id=connection_id, + ) + return SyncResult( + workspace=self, + connection=connection, + job_id=job_id, + ) + + def get_previous_sync_logs( + self, + connection_id: str, + *, + limit: int = 10, + ) -> list[SyncResult]: + """Get the previous sync logs for a connection.""" + connection = CloudConnection( + workspace=self, + connection_id=connection_id, + ) + return connection.get_previous_sync_logs( + limit=limit, + ) diff --git a/airbyte/datasets/_sql.py b/airbyte/datasets/_sql.py index b23cccad..4dba4f39 100644 --- a/airbyte/datasets/_sql.py +++ b/airbyte/datasets/_sql.py @@ -7,6 +7,9 @@ from overrides import overrides from sqlalchemy import and_, func, select, text +from typing_extensions import Literal + +from airbyte_protocol.models.airbyte_protocol import ConfiguredAirbyteStream from airbyte.datasets._base import DatasetBase @@ -15,8 +18,11 @@ from collections.abc import Iterator from pandas import DataFrame - from sqlalchemy import Selectable, Table + from sqlalchemy import Table from sqlalchemy.sql import ClauseElement + from sqlalchemy.sql.selectable import Selectable + + from airbyte_protocol.models import ConfiguredAirbyteStream from airbyte.caches.base import CacheBase @@ -33,16 +39,38 @@ def __init__( cache: CacheBase, stream_name: str, query_statement: Selectable, + stream_configuration: ConfiguredAirbyteStream | None | Literal[False] = None, ) -> None: + """Initialize the dataset with a cache, stream name, and query statement. + + This class is not intended to be created directly. Instead, you can retrieve + datasets from caches or Cloud connection objects, etc. 
+ + The query statement should be a SQLAlchemy Selectable object that can be executed to + retrieve records from the dataset. + + If stream_configuration is not provided, we attempt to retrieve the stream configuration + from the cache processor. This is useful when constructing a dataset from a CachedDataset + object, which already has the stream configuration. + + If stream_configuration is set to False, we skip the stream configuration retrieval. + """ self._length: int | None = None self._cache: CacheBase = cache self._stream_name: str = stream_name self._query_statement: Selectable = query_statement - super().__init__( - stream_metadata=cache.processor._get_stream_config( # noqa: SLF001 # Member is private until we have a public API for it. - stream_name=stream_name - ), - ) + if stream_configuration is None: + try: + stream_configuration = cache.processor._get_stream_config( # noqa: SLF001 # Member is private until we have a public API for it. + stream_name=stream_name + ) + except Exception as ex: + Warning(f"Failed to get stream configuration for {stream_name}: {ex}") + + # Coalesce False to None + stream_configuration = stream_configuration or None + + super().__init__(stream_metadata=stream_configuration) @property def stream_name(self) -> str: diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py index 38464c69..32280c8b 100644 --- a/airbyte/exceptions.py +++ b/airbyte/exceptions.py @@ -39,7 +39,12 @@ from dataclasses import dataclass from textwrap import indent -from typing import Any +from typing import TYPE_CHECKING, Any + + +if TYPE_CHECKING: + from airbyte._util.api_duck_types import AirbyteApiResponseDuckType + from airbyte.cloud.workspaces import CloudWorkspace NEW_ISSUE_URL = "https://github.com/airbytehq/airbyte/issues/new/choose" @@ -50,7 +55,7 @@ @dataclass -class AirbyteError(Exception): +class PyAirbyteError(Exception): """Base class for exceptions in Airbyte.""" guidance: str | None = None @@ -130,7 +135,7 @@ def safe_logging_dict(self) -> dict[str, Any]: @dataclass -class AirbyteLibInternalError(AirbyteError): +class PyAirbyteInternalError(PyAirbyteError): """An internal error occurred in PyAirbyte.""" guidance = "Please consider reporting this error to the Airbyte team." @@ -141,7 +146,7 @@ class AirbyteLibInternalError(AirbyteError): @dataclass -class AirbyteLibInputError(AirbyteError, ValueError): +class PyAirbyteInputError(PyAirbyteError, ValueError): """The input provided to PyAirbyte did not match expected validation rules. 
This inherits from ValueError so that it can be used as a drop-in replacement for @@ -155,7 +160,7 @@ class AirbyteLibInputError(AirbyteError, ValueError): @dataclass -class AirbyteLibNoStreamsSelectedError(AirbyteLibInputError): +class PyAirbyteNoStreamsSelectedError(PyAirbyteInputError): """No streams were selected for the source.""" guidance = ( @@ -169,19 +174,19 @@ class AirbyteLibNoStreamsSelectedError(AirbyteLibInputError): # PyAirbyte Cache Errors -class AirbyteLibCacheError(AirbyteError): +class PyAirbyteCacheError(PyAirbyteError): """Error occurred while accessing the cache.""" @dataclass -class AirbyteLibCacheTableValidationError(AirbyteLibCacheError): +class PyAirbyteCacheTableValidationError(PyAirbyteCacheError): """Cache table validation failed.""" violation: str | None = None @dataclass -class AirbyteConnectorConfigurationMissingError(AirbyteLibCacheError): +class AirbyteConnectorConfigurationMissingError(PyAirbyteCacheError): """Connector is missing configuration.""" connector_name: str | None = None @@ -191,7 +196,7 @@ class AirbyteConnectorConfigurationMissingError(AirbyteLibCacheError): @dataclass -class AirbyteSubprocessError(AirbyteError): +class AirbyteSubprocessError(PyAirbyteError): """Error when running subprocess.""" run_args: list[str] | None = None @@ -207,7 +212,7 @@ class AirbyteSubprocessFailedError(AirbyteSubprocessError): # Connector Registry Errors -class AirbyteConnectorRegistryError(AirbyteError): +class AirbyteConnectorRegistryError(PyAirbyteError): """Error when accessing the connector registry.""" @@ -231,7 +236,7 @@ class AirbyteConnectorNotPyPiPublishedError(AirbyteConnectorRegistryError): @dataclass -class AirbyteConnectorError(AirbyteError): +class AirbyteConnectorError(PyAirbyteError): """Error when running the connector.""" connector_name: str | None = None @@ -293,7 +298,7 @@ class AirbyteStreamNotFoundError(AirbyteConnectorError): @dataclass -class AirbyteLibSecretNotFoundError(AirbyteError): +class PyAirbyteSecretNotFoundError(PyAirbyteError): """Secret not found.""" guidance = "Please ensure that the secret is set." 
@@ -303,3 +308,91 @@ class AirbyteLibSecretNotFoundError(AirbyteError): secret_name: str | None = None sources: list[str] | None = None + + +# Airbyte API Errors + + +@dataclass +class AirbyteError(PyAirbyteError): + """An error occurred while communicating with the hosted Airbyte instance.""" + + response: AirbyteApiResponseDuckType | None = None + """The API response from the failed request.""" + + workspace: CloudWorkspace | None = None + """The workspace where the error occurred.""" + + @property + def workspace_url(self) -> str | None: + if self.workspace: + return self.workspace.workspace_url + + return None + + +@dataclass +class AirbyteConnectionError(AirbyteError): + """An connection error occurred while communicating with the hosted Airbyte instance.""" + + connection_id: str | None = None + """The connection ID where the error occurred.""" + + job_id: str | None = None + """The job ID where the error occurred (if applicable).""" + + job_status: str | None = None + """The latest status of the job where the error occurred (if applicable).""" + + @property + def connection_url(self) -> str | None: + if self.workspace_url and self.connection_id: + return f"{self.workspace_url}/connections/{self.connection_id}" + + return None + + @property + def job_history_url(self) -> str | None: + if self.connection_url: + return f"{self.connection_url}/job-history" + + return None + + @property + def job_url(self) -> str | None: + if self.job_history_url and self.job_id: + return f"{self.job_history_url}#{self.job_id}::0" + + return None + + +@dataclass +class AirbyteConnectionSyncError(AirbyteConnectionError): + """An error occurred while executing the remote Airbyte job.""" + + +@dataclass +class AirbyteConnectionSyncTimeoutError(AirbyteConnectionSyncError): + """An timeout occurred while waiting for the remote Airbyte job to complete.""" + + timeout: int | None = None + """The timeout in seconds that was reached.""" + + +# Airbyte Resource Errors (General) + + +@dataclass +class AirbyteMissingResourceError(AirbyteError): + """Remote Airbyte resources does not exist.""" + + resource_type: str | None = None + resource_name_or_id: str | None = None + + +@dataclass +class AirbyteMultipleResourcesError(AirbyteError): + """Could not locate the resource because multiple matching resources were found.""" + + resource_type: str | None = None + resource_name_or_id: str | None = None diff --git a/airbyte/records.py b/airbyte/records.py index c3261b35..9533b976 100644 --- a/airbyte/records.py +++ b/airbyte/records.py @@ -168,10 +168,13 @@ def __init__( """Initialize the dictionary with the given data. Args: - - normalize_keys: If `True`, the keys will be normalized using the given normalizer. - - expected_keys: If provided, the dictionary will be initialized with these given keys. - - expected_keys: If provided and `prune_extra_fields` is True, then unexpected fields - will be removed. This option is ignored if `expected_keys` is not provided. + from_dict: The dictionary to initialize the StreamRecord with. + prune_extra_fields: If `True`, unexpected fields will be removed. + normalize_keys: If `True`, the keys will be normalized using the given normalizer. + normalizer: The normalizer to use when normalizing keys. If not provided, the + LowerCaseNormalizer will be used. + expected_keys: If provided and `prune_extra_fields` is True, then unexpected fields + will be removed. This option is ignored if `expected_keys` is not provided. """ # If no normalizer is provided, use LowerCaseNormalizer. 
self._normalize_keys = normalize_keys diff --git a/airbyte/secrets.py b/airbyte/secrets.py deleted file mode 100644 index f0a4d11c..00000000 --- a/airbyte/secrets.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -"""Secrets management for PyAirbyte.""" -from __future__ import annotations - -import contextlib -import os -from enum import Enum, auto -from getpass import getpass -from typing import TYPE_CHECKING - -from dotenv import dotenv_values - -from airbyte import exceptions as exc - - -if TYPE_CHECKING: - from collections.abc import Callable - - -try: - from google.colab import userdata as colab_userdata -except ImportError: - colab_userdata = None - - -class SecretSource(Enum): - ENV = auto() - DOTENV = auto() - GOOGLE_COLAB = auto() - ANY = auto() - - PROMPT = auto() - - -def _get_secret_from_env( - secret_name: str, -) -> str | None: - if secret_name not in os.environ: - return None - - return os.environ[secret_name] - - -def _get_secret_from_dotenv( - secret_name: str, -) -> str | None: - try: - dotenv_vars: dict[str, str | None] = dotenv_values() - except Exception: - # Can't locate or parse a .env file - return None - - if secret_name not in dotenv_vars: - # Secret not found - return None - - return dotenv_vars[secret_name] - - -def _get_secret_from_colab( - secret_name: str, -) -> str | None: - if colab_userdata is None: - # The module doesn't exist. We probably aren't in Colab. - return None - - try: - return colab_userdata.get(secret_name) - except Exception: - # Secret name not found. Continue. - return None - - -def _get_secret_from_prompt( - secret_name: str, -) -> str | None: - with contextlib.suppress(Exception): - return getpass(f"Enter the value for secret '{secret_name}': ") - - return None - - -_SOURCE_FUNCTIONS: dict[SecretSource, Callable] = { - SecretSource.ENV: _get_secret_from_env, - SecretSource.DOTENV: _get_secret_from_dotenv, - SecretSource.GOOGLE_COLAB: _get_secret_from_colab, - SecretSource.PROMPT: _get_secret_from_prompt, -} - - -def get_secret( - secret_name: str, - source: SecretSource | list[SecretSource] = SecretSource.ANY, - *, - prompt: bool = True, -) -> str: - """Get a secret from the environment. - - The optional `source` argument of enum type `SecretSource` or list of `SecretSource` options. - If left blank, the `source` arg will be `SecretSource.ANY`. If `source` is set to a specific - source, then only that source will be checked. If a list of `SecretSource` entries is passed, - then the sources will be checked using the provided ordering. - - If `prompt` to `True` or if SecretSource.PROMPT is declared in the `source` arg, then the - user will be prompted to enter the secret if it is not found in any of the other sources. 
- """ - sources = [source] if not isinstance(source, list) else source - all_sources = set(_SOURCE_FUNCTIONS.keys()) - {SecretSource.PROMPT} - if SecretSource.ANY in sources: - sources += [s for s in all_sources if s not in sources] - sources.remove(SecretSource.ANY) - - if prompt or SecretSource.PROMPT in sources: - if SecretSource.PROMPT in sources: - sources.remove(SecretSource.PROMPT) - - sources.append(SecretSource.PROMPT) # Always check prompt last - - for source in sources: - fn = _SOURCE_FUNCTIONS[source] # Get the matching function for this source - val = fn(secret_name) - if val: - return val - - raise exc.AirbyteLibSecretNotFoundError( - secret_name=secret_name, - sources=[str(s) for s in sources], - ) - - -__all__ = [ - "get_secret", - "SecretSource", -] diff --git a/airbyte/secrets/__init__.py b/airbyte/secrets/__init__.py new file mode 100644 index 00000000..156772df --- /dev/null +++ b/airbyte/secrets/__init__.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Secrets management for PyAirbyte.""" + +from __future__ import annotations + +from airbyte.secrets import ( + base, + config, + custom, + env_vars, + google_colab, + google_gsm, + prompt, + util, +) +from airbyte.secrets.base import SecretHandle, SecretManager, SecretSourceEnum, SecretString +from airbyte.secrets.config import disable_secret_source, register_secret_manager +from airbyte.secrets.custom import CustomSecretManager +from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager +from airbyte.secrets.google_colab import ColabSecretManager +from airbyte.secrets.google_gsm import GoogleGSMSecretManager +from airbyte.secrets.prompt import SecretsPrompt +from airbyte.secrets.util import get_secret + + +__all__ = [ + # Submodules + "base", + "config", + "custom", + "env_vars", + "google_colab", + "google_gsm", + "prompt", + "util", + # Secret Access + "get_secret", + # Secret Classes + "SecretSourceEnum", + "SecretString", + "SecretHandle", + # Secret Managers + "SecretManager", + "EnvVarSecretManager", + "DotenvSecretManager", + "ColabSecretManager", + "SecretsPrompt", + "CustomSecretManager", + "GoogleGSMSecretManager", + # Registration Functions` + "register_secret_manager", + "disable_secret_source", +] diff --git a/airbyte/secrets/base.py b/airbyte/secrets/base.py new file mode 100644 index 00000000..d77ac633 --- /dev/null +++ b/airbyte/secrets/base.py @@ -0,0 +1,146 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import json +from abc import ABC, abstractmethod +from enum import Enum +from typing import cast + +from airbyte import exceptions as exc + + +class SecretSourceEnum(str, Enum): + ENV = "env" + DOTENV = "dotenv" + GOOGLE_COLAB = "google_colab" + GOOGLE_GSM = "google_gsm" # Not enabled by default + + PROMPT = "prompt" + + +class SecretString(str): + """A string that represents a secret. + + This class is used to mark a string as a secret. When a secret is printed, it + will be masked to prevent accidental exposure of sensitive information. + """ + + __slots__ = () + + def __repr__(self) -> str: + return "" + + def is_empty(self) -> bool: + """Check if the secret is an empty string.""" + return len(self) == 0 + + def is_json(self) -> bool: + """Check if the secret string is a valid JSON string.""" + try: + json.loads(self) + except (json.JSONDecodeError, Exception): + return False + + return True + + def __bool__(self) -> bool: + """Override the boolean value of the secret string. 
+ + Always returns `True` without inspecting contents.""" + return True + + def parse_json(self) -> dict: + """Parse the secret string as JSON.""" + try: + return json.loads(self) + except json.JSONDecodeError as ex: + raise exc.PyAirbyteInputError( + message="Failed to parse secret as JSON.", + context={ + "Message": ex.msg, + "Position": ex.pos, + "SecretString_Length": len(self), # Debug secret blank or an unexpected format. + }, + ) from None + + +class SecretManager(ABC): + """Abstract base class for secret managers. + + Secret managers are used to retrieve secrets from a secret store. + + By registering a secret manager, PyAirbyte can automatically locate and + retrieve secrets from the secret store when needed. This allows you to + securely store and access sensitive information such as API keys, passwords, + and other credentials without hardcoding them in your code. + + To create a custom secret manager, subclass this class and implement the + `get_secret` method. By default, the secret manager will be automatically + registered as a global secret source, but will not replace any existing + secret sources. To customize this behavior, override the `auto_register` and + `replace_existing` attributes in your subclass as needed. + + Note: Registered secrets managers always have priority over the default + secret sources such as environment variables, dotenv files, and Google Colab + secrets. If multiple secret managers are registered, the last one registered + will take priority. + """ + + replace_existing = False + as_backup = False + + def __init__(self) -> None: + """Instantiate the new secret manager.""" + if not hasattr(self, "name"): + # Default to the class name if no name is provided + self.name: str = self.__class__.__name__ + + @abstractmethod + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the secret manager. + + This method should be implemented by subclasses to retrieve secrets from + the secret store. If the secret is not found, the method should return `None`. + """ + ... + + def __str__(self) -> str: + return self.name + + def __eq__(self, value: object) -> bool: + if isinstance(value, SecretManager): + return self.name == value.name + + if isinstance(value, str): + return self.name == value + + if isinstance(value, SecretSourceEnum): + return self.name == str(value) + + return super().__eq__(value) + + +class SecretHandle: + """A handle for a secret in a secret manager. + + This class is used to store a reference to a secret in a secret manager. + The secret is not retrieved until the `get_value()` method is called on the handle. + """ + + def __init__( + self, + parent: SecretManager, + secret_name: str, + ) -> None: + """Instantiate a new secret handle.""" + self.parent = parent + self.secret_name = secret_name + + def get_value(self) -> SecretString: + """Get the secret from the secret manager. + + Subclasses can optionally override this method to provide a more optimized code path. + """ + return cast(SecretString, self.parent.get_secret(self.secret_name)) diff --git a/airbyte/secrets/config.py b/airbyte/secrets/config.py new file mode 100644 index 00000000..dc31a65e --- /dev/null +++ b/airbyte/secrets/config.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
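+#
+# A minimal sketch (names are illustrative, not part of this module) of a custom secret
+# source: subclass `CustomSecretManager`, implement `get_secret`, and the instance will
+# register itself as a global source on creation (see `airbyte.secrets.custom`).
+#
+#     from airbyte.secrets.base import SecretString
+#     from airbyte.secrets.custom import CustomSecretManager
+#
+#     class InMemorySecretManager(CustomSecretManager):
+#         name = "in_memory"
+#
+#         def __init__(self, secrets: dict[str, str]) -> None:
+#             self._secrets = secrets
+#             super().__init__()  # Auto-registers unless `auto_register` is overridden.
+#
+#         def get_secret(self, secret_name: str) -> SecretString | None:
+#             value = self._secrets.get(secret_name)
+#             return SecretString(value) if value is not None else None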
+"""___""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from airbyte._util import meta +from airbyte.secrets.base import SecretManager +from airbyte.secrets.env_vars import DotenvSecretManager, EnvVarSecretManager +from airbyte.secrets.google_colab import ColabSecretManager +from airbyte.secrets.prompt import SecretsPrompt + + +if TYPE_CHECKING: + from airbyte.secrets.base import SecretSourceEnum + from airbyte.secrets.custom import CustomSecretManager + + +_SECRETS_SOURCES: list[SecretManager] = [] + + +def _get_secret_sources() -> list[SecretManager]: + """Initialize the default secret sources.""" + if len(_SECRETS_SOURCES) == 0: + # Initialize the default secret sources + _SECRETS_SOURCES.extend( + [ + EnvVarSecretManager(), + DotenvSecretManager(), + ] + ) + if meta.is_colab(): + _SECRETS_SOURCES.append(ColabSecretManager()) + + if meta.is_interactive(): + _SECRETS_SOURCES.append(SecretsPrompt()) + + return _SECRETS_SOURCES.copy() + + +# Ensure the default secret sources are initialized +_ = _get_secret_sources() + + +def register_secret_manager( + secret_manager: CustomSecretManager, + *, + as_backup: bool = False, + replace_existing: bool = False, +) -> None: + """Register a custom secret manager.""" + if replace_existing: + clear_secret_sources() + + if as_backup: + # Add to end of list + _SECRETS_SOURCES.append(secret_manager) + else: + # Add to beginning of list + _SECRETS_SOURCES.insert(0, secret_manager) + + +def clear_secret_sources() -> None: + """Clear all secret sources.""" + _SECRETS_SOURCES.clear() + + +def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None: + """Disable one of the default secrets sources. + + This function can accept either a `SecretManager` instance, a `SecretSourceEnum` enum value, or + a string representing the name of the source to disable. + """ + if isinstance(source, SecretManager) and source in _SECRETS_SOURCES: + _SECRETS_SOURCES.remove(source) + return + + # Else, remove by name + for s in _SECRETS_SOURCES: + if s.name == str(source): + _SECRETS_SOURCES.remove(s) diff --git a/airbyte/secrets/custom.py b/airbyte/secrets/custom.py new file mode 100644 index 00000000..1cee062c --- /dev/null +++ b/airbyte/secrets/custom.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +from abc import ABC + +from airbyte.secrets.base import SecretManager +from airbyte.secrets.config import clear_secret_sources, register_secret_manager + + +class CustomSecretManager(SecretManager, ABC): + """Custom secret manager that retrieves secrets from a custom source. + + This class is a convenience class that can be used to create custom secret + managers. By default, custom secrets managers are auto-registered during + creation. + """ + + auto_register = True + replace_existing = False + as_backup = False + + def __init__(self) -> None: + super().__init__() + if self.auto_register: + self.register() + + def register( + self, + *, + replace_existing: bool | None = None, + as_backup: bool | None = None, + ) -> None: + """Register the secret manager as global secret source. + + This makes the secret manager available to the `get_secret` function and + allows it to be used automatically as a source for secrets. + + If `replace_existing` is `True`, the secret manager will replace all existing + secrets sources, including the default secret managers such as environment + variables, dotenv files, and Google Colab secrets. 
If `replace_existing` is + None or not provided, the default behavior will be used from the `replace_existing` + of the class (`False` unless overridden by the subclass). + """ + if replace_existing is None: + replace_existing = self.replace_existing + + if as_backup is None: + as_backup = self.as_backup + + if replace_existing: + clear_secret_sources() + + register_secret_manager( + self, + as_backup=as_backup, + replace_existing=replace_existing, + ) diff --git a/airbyte/secrets/env_vars.py b/airbyte/secrets/env_vars.py new file mode 100644 index 00000000..5a12135b --- /dev/null +++ b/airbyte/secrets/env_vars.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import os + +from dotenv import dotenv_values + +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString + + +class EnvVarSecretManager(SecretManager): + """Secret manager that retrieves secrets from environment variables.""" + + name = SecretSourceEnum.ENV.value + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the environment.""" + if secret_name not in os.environ: + return None + + return SecretString(os.environ[secret_name]) + + +class DotenvSecretManager(SecretManager): + """Secret manager that retrieves secrets from a `.env` file.""" + + name = SecretSourceEnum.DOTENV.value + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from the `.env` file.""" + try: + dotenv_vars: dict[str, str | None] = dotenv_values() + except Exception: + # Can't locate or parse a .env file + return None + + if secret_name not in dotenv_vars: + # Secret not found + return None + + return SecretString(dotenv_vars[secret_name]) diff --git a/airbyte/secrets/google_colab.py b/airbyte/secrets/google_colab.py new file mode 100644 index 00000000..49b46097 --- /dev/null +++ b/airbyte/secrets/google_colab.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Secrets manager for Google Colab user secrets.""" + +from __future__ import annotations + +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString + + +class ColabSecretManager(SecretManager): + """Secret manager that retrieves secrets from Google Colab user secrets.""" + + name = SecretSourceEnum.GOOGLE_COLAB.value + + def __init__(self) -> None: + try: + from google.colab import ( # pyright: ignore[reportMissingImports] + userdata as colab_userdata, + ) + + self.colab_userdata = colab_userdata + except ImportError: + self.colab_userdata = None + + super().__init__() + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from Google Colab user secrets.""" + if self.colab_userdata is None: + # The module doesn't exist. We probably aren't in Colab. + return None + + try: + return SecretString(self.colab_userdata.get(secret_name)) + except Exception: + # Secret name not found. Continue. + return None diff --git a/airbyte/secrets/google_gsm.py b/airbyte/secrets/google_gsm.py new file mode 100644 index 00000000..c23c19e3 --- /dev/null +++ b/airbyte/secrets/google_gsm.py @@ -0,0 +1,217 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Secret manager that retrieves secrets from Google Secrets Manager (GSM). 
+ +Usage Example: + +```python +gsm_secrets_manager = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) +first_secret: SecretHandle = next( + gsm_secrets_manager.fetch_connector_secrets( + connector_name=connector_name, + ), + None, +) + +print(f"Found '{connector_name}' credential secret '${first_secret.secret_name}'.") +return first_secret.get_value().parse_json() +``` + +More compact example: + +```python +gsm_secrets_manager = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) +connector_config: dict = ( + next( + gsm_secrets_manager.fetch_connector_secrets( + connector_name=connector_name, + ), + None, + ) + .get_value() + .parse_json() +) +``` +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import TYPE_CHECKING + +from google.cloud import secretmanager_v1 as secretmanager + +from airbyte import exceptions as exc +from airbyte.secrets.base import SecretHandle, SecretSourceEnum, SecretString +from airbyte.secrets.custom import CustomSecretManager + + +if TYPE_CHECKING: + from collections.abc import Iterable + + from google.cloud.secretmanager_v1.services.secret_manager_service.pagers import ( + ListSecretsPager, + ) + + +class GoogleGSMSecretManager(CustomSecretManager): + """Secret manager that retrieves secrets from Google Secrets Manager (GSM). + + This class inherits from `CustomSecretManager` and also adds methods + that are specific to this implementation: `fetch_secrets()`, + `fetch_secrets_by_label()` and `fetch_connector_secrets()`. + + This secret manager is not enabled by default. To use it, you must provide the project ID and + the credentials for a service account with the necessary permissions to access the secrets. + + The `fetch_connector_secret()` method assumes a label name of `connector` + matches the name of the connector (`source-github`, `destination-snowflake`, etc.) + """ + + name = SecretSourceEnum.GOOGLE_GSM.value + auto_register = False + as_backup = False + replace_existing = False + + CONNECTOR_LABEL = "connector" + """The label key used to filter secrets by connector name.""" + + def __init__( + self, + project: str, + *, + credentials_path: str | None = None, + credentials_json: str | SecretString | None = None, + auto_register: bool = False, + as_backup: bool = False, + ) -> None: + """Instantiate a new Google GSM secret manager instance. + + You can provide either the path to the credentials file or the JSON contents of the + credentials file. If both are provided, a `PyAirbyteInputError` will be raised. + """ + if credentials_path and credentials_json: + raise exc.PyAirbyteInputError( + guidance=("You can provide `credentials_path` or `credentials_json` but not both."), + ) + + self.project = project + + if credentials_json is not None and not isinstance(credentials_json, SecretString): + credentials_json = SecretString(credentials_json) + + if not credentials_json and not credentials_path: + if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ: + credentials_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"] + + elif "GCP_GSM_CREDENTIALS" in os.environ: + credentials_json = SecretString(os.environ["GCP_GSM_CREDENTIALS"]) + + if credentials_path: + credentials_json = SecretString(Path(credentials_path).read_text()) + + if not credentials_json: + raise exc.PyAirbyteInputError( + guidance=( + "No Google Cloud credentials found. 
You can provide the path to the " + "credentials file using the `credentials_path` argument, or provide the JSON " + "contents of the credentials file using the `credentials_json` argument." + ), + ) + + self.secret_client = secretmanager.SecretManagerServiceClient.from_service_account_info( + json.loads(credentials_json) + ) + + if auto_register: + self.auto_register = auto_register + + if as_backup: + self.as_backup = as_backup + + super().__init__() # Handles the registration if needed + + def get_secret(self, secret_name: str) -> SecretString | None: + """Get a named secret from Google Colab user secrets.""" + return SecretString( + self.secret_client.access_secret_version( + name=f"projects/{self.project}/secrets/{secret_name}/versions/latest" + ).payload.data.decode("UTF-8") + ) + + def fetch_secrets( + self, + *, + filter_string: str, + ) -> Iterable[SecretHandle]: + """List all available secrets in the secret manager. + + Example filter strings: + - `labels.connector=source-bigquery`: Filter for secrets with the labe 'source-bigquery'. + + Args: + filter_string (str): A filter string to apply to the list of secrets, following the + format described in the Google Secret Manager documentation: + https://cloud.google.com/secret-manager/docs/filtering + + Returns: + Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets. + """ + gsm_secrets: ListSecretsPager = self.secret_client.list_secrets( + secretmanager.ListSecretsRequest( + request={ + "filter": filter_string, + } + ) + ) + + return [ + SecretHandle( + parent=self, + secret_name=secret.name, + ) + for secret in gsm_secrets + ] + + def fetch_secrets_by_label( + self, + label_key: str, + label_value: str, + ) -> Iterable[SecretHandle]: + """List all available secrets in the secret manager. + + Args: + label_key (str): The key of the label to filter by. + label_value (str): The value of the label to filter by. + + Returns: + Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets. + """ + return self.fetch_secrets(filter_string=f"labels.{label_key}={label_value}") + + def fetch_connector_secrets( + self, + connector_name: str, + ) -> Iterable[SecretHandle]: + """Fetch secrets in the secret manager, using the connector name as a filter for the label. + + The label key used to filter the secrets is defined by the `CONNECTOR_LABEL` attribute, + which defaults to 'connector'. + + Args: + connector_name (str): The name of the connector to filter by. + + Returns: + Iterable[SecretHandle]: An iterable of `SecretHandle` objects for the matching secrets. + """ + return self.fetch_secrets_by_label( + label_key=self.CONNECTOR_LABEL, + label_value=connector_name, + ) diff --git a/airbyte/secrets/prompt.py b/airbyte/secrets/prompt.py new file mode 100644 index 00000000..01a0da9d --- /dev/null +++ b/airbyte/secrets/prompt.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""___""" + +from __future__ import annotations + +import contextlib +from getpass import getpass + +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString + + +class SecretsPrompt(SecretManager): + """Secret manager that prompts the user to enter a secret.""" + + name = SecretSourceEnum.PROMPT.value + + def get_secret( + self, + secret_name: str, + ) -> SecretString | None: + with contextlib.suppress(Exception): + return SecretString(getpass(f"Enter the value for secret '{secret_name}': ")) + + return None diff --git a/airbyte/secrets/util.py b/airbyte/secrets/util.py new file mode 100644 index 00000000..028f3156 --- /dev/null +++ b/airbyte/secrets/util.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""___""" + +from __future__ import annotations + +import warnings +from typing import Any, cast + +from airbyte import exceptions as exc +from airbyte.secrets.base import SecretManager, SecretSourceEnum, SecretString +from airbyte.secrets.config import _get_secret_sources + + +def get_secret( + secret_name: str, + /, + *, + sources: list[SecretManager | SecretSourceEnum] | None = None, + allow_prompt: bool = True, + **kwargs: dict[str, Any], +) -> SecretString: + """Get a secret from the environment. + + The optional `sources` argument of enum type `SecretSourceEnum` or list of `SecretSourceEnum` + options. If left blank, all available sources will be checked. If a list of `SecretSourceEnum` + entries is passed, then the sources will be checked using the provided ordering. + + If `allow_prompt` is `True` or if SecretSourceEnum.PROMPT is declared in the `source` arg, then + the user will be prompted to enter the secret if it is not found in any of the other sources. + """ + if "source" in kwargs: + warnings.warn( + message="The `source` argument is deprecated. Use the `sources` argument instead.", + category=DeprecationWarning, + stacklevel=2, + ) + sources = kwargs.pop("source") # type: ignore [assignment] + + available_sources: dict[str, SecretManager] = {} + for available_source in _get_secret_sources(): + # Add available sources to the dict. Order matters. + available_sources[available_source.name] = available_source + + if sources is None: + # If ANY is in the list, then we don't need to check any other sources. + # This is the default behavior. + sources = list(available_sources.values()) + + elif not isinstance(sources, list): + sources = [sources] # type: ignore [unreachable] # This is a 'just in case' catch. + + # Replace any SecretSourceEnum strings with the matching SecretManager object + for source in sources: + if isinstance(source, SecretSourceEnum): + if source not in available_sources: + raise exc.PyAirbyteInputError( + guidance="Invalid secret source name.", + input_value=source, + context={ + "Available Sources": list(available_sources.keys()), + }, + ) + + sources[sources.index(source)] = available_sources[source] + + secret_managers = cast(list[SecretManager], sources) + + if SecretSourceEnum.PROMPT in secret_managers: + prompt_source = secret_managers.pop( + # Mis-typed, but okay here since we have equality logic for the enum comparison: + secret_managers.index(SecretSourceEnum.PROMPT), # type: ignore [arg-type] + ) + + if allow_prompt: + # Always check prompt last. Add it to the end of the list. 
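+            # (When `allow_prompt=False` is passed, the prompt manager is simply dropped
+            # and the lookup raises `PyAirbyteSecretNotFoundError` below if no other
+            # source holds the secret.)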
+ secret_managers.append(prompt_source) + + for secret_mgr in secret_managers: + val = secret_mgr.get_secret(secret_name) + if val: + return SecretString(val) + + raise exc.PyAirbyteSecretNotFoundError( + secret_name=secret_name, + sources=[str(s) for s in available_sources], + ) diff --git a/airbyte/sources/__init__.py b/airbyte/sources/__init__.py index aff3b5ad..bd3938bb 100644 --- a/airbyte/sources/__init__.py +++ b/airbyte/sources/__init__.py @@ -2,6 +2,7 @@ from __future__ import annotations from airbyte.sources import base, util +from airbyte.sources.base import Source from airbyte.sources.registry import ( ConnectorMetadata, get_available_connectors, diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py index 6cad3f16..bb2c99c8 100644 --- a/airbyte/sources/base.py +++ b/airbyte/sources/base.py @@ -2,9 +2,7 @@ from __future__ import annotations import json -import tempfile import warnings -from contextlib import contextmanager, suppress from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -37,6 +35,7 @@ log_source_check_result, send_telemetry, ) +from airbyte._util.temp_files import as_temp_files from airbyte.caches.util import get_default_cache from airbyte.datasets._lazy import LazyDataset from airbyte.progress import progress @@ -56,25 +55,6 @@ from airbyte.documents import Document -@contextmanager -def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any, None]: - """Write the given contents to temporary files and yield the file paths as strings.""" - temp_files: list[Any] = [] - try: - for content in files_contents: - temp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) - temp_file.write( - json.dumps(content) if isinstance(content, dict) else content, - ) - temp_file.flush() - temp_files.append(temp_file) - yield [file.name for file in temp_files] - finally: - for temp_file in temp_files: - with suppress(Exception): - Path(temp_file.name).unlink() - - class Source: """A class representing a source that can be called.""" @@ -104,6 +84,10 @@ def __init__( if streams is not None: self.select_streams(streams) + self._deployed_api_root: str | None = None + self._deployed_workspace_id: str | None = None + self._deployed_source_id: str | None = None + def set_streams(self, streams: list[str]) -> None: """Deprecated. See select_streams().""" warnings.warn( @@ -283,7 +267,7 @@ def print_config_spec( it will be printed to the console. """ if format not in ["yaml", "json"]: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid format. 
Expected 'yaml' or 'json'", input_value=format, ) @@ -377,13 +361,13 @@ def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]: ] if len(found) == 0: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Stream name does not exist in catalog.", input_value=stream_name, ) if len(found) > 1: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Duplicate streams found with the same name.", context={ "found_streams": found, @@ -416,7 +400,7 @@ def get_records(self, stream: str) -> LazyDataset: ], ) if len(configured_catalog.streams) == 0: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Requested stream does not exist.", context={ "stream": stream, @@ -701,7 +685,7 @@ def read( try: write_strategy = WriteStrategy(write_strategy) except ValueError: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Invalid strategy", context={ "write_strategy": write_strategy, @@ -713,7 +697,7 @@ def read( self.select_streams(streams) if not self._selected_stream_names: - raise exc.AirbyteLibNoStreamsSelectedError( + raise exc.PyAirbyteNoStreamsSelectedError( connector_name=self.name, available_streams=self.get_available_streams(), ) diff --git a/airbyte/sources/registry.py b/airbyte/sources/registry.py index 4d825937..8b05566a 100644 --- a/airbyte/sources/registry.py +++ b/airbyte/sources/registry.py @@ -79,7 +79,7 @@ def _get_registry_cache(*, force_refresh: bool = False) -> dict[str, ConnectorMe new_cache[connector_metadata.name] = connector_metadata if len(new_cache) == 0: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector registry is empty.", context={ "registry_url": _get_registry_url(), @@ -97,7 +97,7 @@ def get_connector_metadata(name: str) -> ConnectorMetadata: """ cache = copy(_get_registry_cache()) if not cache: - raise exc.AirbyteLibInternalError( + raise exc.PyAirbyteInternalError( message="Connector registry could not be loaded.", context={ "registry_url": _get_registry_url(), diff --git a/airbyte/sources/util.py b/airbyte/sources/util.py index 518ea132..51ee952c 100644 --- a/airbyte/sources/util.py +++ b/airbyte/sources/util.py @@ -74,11 +74,11 @@ def get_source( """ if local_executable: if pip_url: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Param 'pip_url' is not supported when 'local_executable' is set." ) if version: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( message="Param 'version' is not supported when 'local_executable' is set." ) diff --git a/airbyte/validate.py b/airbyte/validate.py index 89789a80..9a4650d2 100644 --- a/airbyte/validate.py +++ b/airbyte/validate.py @@ -154,7 +154,7 @@ def validate(connector_dir: str, sample_config: str, *, validate_install_only: b install_only_test(connector_name) else: if not sample_config: - raise exc.AirbyteLibInputError( + raise exc.PyAirbyteInputError( input_value="--sample-config is required without --validate-install-only set" ) full_tests(connector_name, sample_config) diff --git a/docs/generate.py b/docs/generate.py index dbcea00d..c0b0fb45 100755 --- a/docs/generate.py +++ b/docs/generate.py @@ -1,5 +1,5 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. -"""Generate docs for all public modules in AirbyteLib and save them to docs/generated. +"""Generate docs for all public modules in PyAirbyte and save them to docs/generated. 
Usage: poetry run python docs/generate.py @@ -19,26 +19,13 @@ def run() -> None: - """Generate docs for all public modules in AirbyteLib and save them to docs/generated.""" - public_modules = ["airbyte"] + """Generate docs for all public modules in PyAirbyte and save them to docs/generated.""" + public_modules = ["airbyte", "airbyte/cloud/experimental.py"] # recursively delete the docs/generated folder if it exists if pathlib.Path("docs/generated").exists(): shutil.rmtree("docs/generated") - # All files and folders that don't start with "_" are treated as public. - for submodule in os.listdir("airbyte"): - submodule_path = pathlib.Path(f"airbyte/{submodule}") - if not submodule.startswith("_"): - public_modules.append(submodule_path) - if submodule_path.is_file(): - continue - - for subsubmodule in os.listdir(submodule_path): - subsubmodule_path = submodule_path / subsubmodule - if not subsubmodule.startswith("_"): - public_modules.append(subsubmodule_path) - pdoc.render.configure( template_directory="docs", show_source=True, diff --git a/examples/run_bigquery_faker.py b/examples/run_bigquery_faker.py index eb1f7139..a763d983 100644 --- a/examples/run_bigquery_faker.py +++ b/examples/run_bigquery_faker.py @@ -9,19 +9,27 @@ import tempfile import warnings +from typing import cast import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json from airbyte.caches.bigquery import BigQueryCache +from airbyte.secrets.base import SecretString +from airbyte.secrets.google_gsm import GoogleGSMSecretManager warnings.filterwarnings("ignore", message="Cannot create BigQuery Storage client") -bigquery_destination_secret = get_gcp_secret_json( - project_name="dataline-integration-testing", - secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS", -) +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + +bigquery_destination_secret: dict = cast( + SecretString, + GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), + ).get_secret(SECRET_NAME), +).parse_json() def main() -> None: diff --git a/examples/run_integ_test_source.py b/examples/run_integ_test_source.py index 51fa1de7..3b50f68f 100644 --- a/examples/run_integ_test_source.py +++ b/examples/run_integ_test_source.py @@ -14,10 +14,16 @@ import sys import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json +from airbyte.secrets.google_gsm import GoogleGSMSecretManager -GCP_SECRETS_PROJECT_NAME = "dataline-integration-testing" +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +SECRET_NAME = "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + +secret_mgr = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), +) def get_secret_name(connector_name: str) -> str: @@ -39,10 +45,11 @@ def main( secret_name: str | None, streams: list[str] | None, ) -> None: - config = get_gcp_secret_json( + secret = secret_mgr.get_secret( secret_name=secret_name, - project_name=GCP_SECRETS_PROJECT_NAME, ) + assert secret is not None, f"Secret {secret_name} not found." 
+ config = secret.parse_json() source = ab.get_source( connector_name, config=config, diff --git a/examples/run_snowflake_faker.py b/examples/run_snowflake_faker.py index 3e5f7b8f..b4047743 100644 --- a/examples/run_snowflake_faker.py +++ b/examples/run_snowflake_faker.py @@ -8,30 +8,37 @@ from __future__ import annotations import airbyte as ab -from airbyte._util.google_secrets import get_gcp_secret_json from airbyte.caches import SnowflakeCache +from airbyte.secrets.google_gsm import GoogleGSMSecretManager -source = ab.get_source( - "source-faker", - config={"count": 10000, "seed": 0, "parallelism": 1, "always_updated": False}, - install_if_missing=True, +AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" +secret_mgr = GoogleGSMSecretManager( + project=AIRBYTE_INTERNAL_GCP_PROJECT, + credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"), ) -secret = get_gcp_secret_json( - project_name="dataline-integration-testing", +secret = secret_mgr.get_secret( secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS", ) +assert secret is not None, "Secret not found." +secret_config = secret.parse_json() + cache = SnowflakeCache( - account=secret["account"], - username=secret["username"], - password=secret["password"], - database=secret["database"], - warehouse=secret["warehouse"], - role=secret["role"], + account=secret_config["account"], + username=secret_config["username"], + password=secret_config["password"], + database=secret_config["database"], + warehouse=secret_config["warehouse"], + role=secret_config["role"], ) +source = ab.get_source( + "source-faker", + config={"count": 10000, "seed": 0, "parallelism": 1, "always_updated": False}, + install_if_missing=True, +) source.check() source.select_streams(["products"]) diff --git a/poetry.lock b/poetry.lock index 4cabdaa8..9c67f0d9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,39 @@ # This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +[[package]] +name = "airbyte-api" +version = "0.47.3" +description = "Python Client SDK for Airbyte API" +optional = false +python-versions = ">=3.8" +files = [] +develop = false + +[package.dependencies] +certifi = ">=2023.7.22" +charset-normalizer = ">=3.2.0" +dataclasses-json-speakeasy = ">=0.5.11" +idna = ">=3.4" +jsonpath-python = ">=1.0.6" +marshmallow = ">=3.19.0" +mypy-extensions = ">=1.0.0" +packaging = ">=23.1" +python-dateutil = ">=2.8.2" +requests = ">=2.31.0" +six = ">=1.16.0" +typing_extensions = ">=4.7.1" +typing-inspect = ">=0.9.0" +urllib3 = ">=1.26.18" + +[package.extras] +dev = ["pylint (==2.16.2)"] + +[package.source] +type = "git" +url = "https://github.com/airbytehq/airbyte-api-python-sdk.git" +reference = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" +resolved_reference = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" + [[package]] name = "airbyte-cdk" version = "0.73.0" @@ -404,6 +438,21 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "dataclasses-json-speakeasy" +version = "0.5.11" +description = "Easily serialize dataclasses to and from JSON." 
+optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "dataclasses_json_speakeasy-0.5.11-py3-none-any.whl", hash = "sha256:ac52a069a01e8521015d682f37849bfdf056c36fa3f81497055e201fec684104"}, + {file = "dataclasses_json_speakeasy-0.5.11.tar.gz", hash = "sha256:418a987cea2ccf4e4be662f39faa5cc79b47b147c9d1a69d6928d6a27e0c17e8"}, +] + +[package.dependencies] +marshmallow = ">=3.18.0,<4.0.0" +typing-inspect = ">=0.4.0,<1" + [[package]] name = "deprecated" version = "1.2.14" @@ -546,13 +595,13 @@ python-dateutil = ">=2.4" [[package]] name = "filelock" -version = "3.13.3" +version = "3.13.4" description = "A platform independent file lock." optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.13.3-py3-none-any.whl", hash = "sha256:5ffa845303983e7a0b7ae17636509bc97997d58afeafa72fb141a17b152284cb"}, - {file = "filelock-3.13.3.tar.gz", hash = "sha256:a79895a25bbefdf55d1a2a0a80968f7dbb28edcd6d4234a0afb3f37ecde4b546"}, + {file = "filelock-3.13.4-py3-none-any.whl", hash = "sha256:404e5e9253aa60ad457cae1be07c0f0ca90a63931200a47d9b6a6af84fd7b45f"}, + {file = "filelock-3.13.4.tar.gz", hash = "sha256:d13f466618bfde72bd2c18255e269f72542c6e70e7bac83a0232d6b1cc5c8cf4"}, ] [package.extras] @@ -640,13 +689,13 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] [[package]] name = "google-cloud-bigquery" -version = "3.19.0" +version = "3.20.1" description = "Google BigQuery API client library" optional = false python-versions = ">=3.7" files = [ - {file = "google-cloud-bigquery-3.19.0.tar.gz", hash = "sha256:8e311dae49768e1501fcdc5e916bff4b7e169471e5707919f4a6f78a02b3b5a6"}, - {file = "google_cloud_bigquery-3.19.0-py2.py3-none-any.whl", hash = "sha256:c6b8850247a4b132066e49f6e45f850c22824482838688d744a4398eea1120ed"}, + {file = "google-cloud-bigquery-3.20.1.tar.gz", hash = "sha256:318aa3abab5f1900ee24f63ba8bd02b9cdafaa942d738b4dc14a4ef2cc2d925f"}, + {file = "google_cloud_bigquery-3.20.1-py2.py3-none-any.whl", hash = "sha256:d3e62fe61138c658b8853c402e2d8fb9346c84e602e21e3a26584be10fc5b0a4"}, ] [package.dependencies] @@ -1043,6 +1092,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "jsonpath-python" +version = "1.0.6" +description = "A more powerful JSONPath implementation in modern python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, + {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, +] + [[package]] name = "jsonref" version = "0.3.0" @@ -1168,6 +1228,25 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] +[[package]] +name = "marshmallow" +version = "3.21.1" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "marshmallow-3.21.1-py3-none-any.whl", hash = "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633"}, + {file = "marshmallow-3.21.1.tar.gz", hash = "sha256:4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3"}, +] + +[package.dependencies] +packaging = ">=17.0" + +[package.extras] +dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"] +docs = ["alabaster (==0.7.16)", "autodocsumm (==0.2.12)", "sphinx (==7.2.6)", "sphinx-issues (==4.0.0)", "sphinx-version-warning (==1.1.2)"] +tests = ["pytest", "pytz", "simplejson"] + [[package]] name = "mdurl" version = "0.1.2" @@ -1685,58 +1764,58 @@ pyasn1 = ">=0.4.6,<0.7.0" [[package]] name = "pycparser" -version = "2.21" +version = "2.22" description = "C parser in Python" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.8" files = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] [[package]] name = "pydantic" -version = "1.10.14" +version = "1.10.15" description = "Data validation and settings management using python type hints" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7f4fcec873f90537c382840f330b90f4715eebc2bc9925f04cb92de593eae054"}, - {file = "pydantic-1.10.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e3a76f571970fcd3c43ad982daf936ae39b3e90b8a2e96c04113a369869dc87"}, - {file = "pydantic-1.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d886bd3c3fbeaa963692ef6b643159ccb4b4cefaf7ff1617720cbead04fd1d"}, - {file = "pydantic-1.10.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:798a3d05ee3b71967844a1164fd5bdb8c22c6d674f26274e78b9f29d81770c4e"}, - {file = "pydantic-1.10.14-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:23d47a4b57a38e8652bcab15a658fdb13c785b9ce217cc3a729504ab4e1d6bc9"}, - {file = "pydantic-1.10.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f9f674b5c3bebc2eba401de64f29948ae1e646ba2735f884d1594c5f675d6f2a"}, - {file = "pydantic-1.10.14-cp310-cp310-win_amd64.whl", hash = "sha256:24a7679fab2e0eeedb5a8924fc4a694b3bcaac7d305aeeac72dd7d4e05ecbebf"}, - {file = "pydantic-1.10.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9d578ac4bf7fdf10ce14caba6f734c178379bd35c486c6deb6f49006e1ba78a7"}, - {file = "pydantic-1.10.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa7790e94c60f809c95602a26d906eba01a0abee9cc24150e4ce2189352deb1b"}, - {file = "pydantic-1.10.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad4e10efa5474ed1a611b6d7f0d130f4aafadceb73c11d9e72823e8f508e663"}, - {file = "pydantic-1.10.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1245f4f61f467cb3dfeced2b119afef3db386aec3d24a22a1de08c65038b255f"}, - {file = "pydantic-1.10.14-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:21efacc678a11114c765eb52ec0db62edffa89e9a562a94cbf8fa10b5db5c046"}, - {file = "pydantic-1.10.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:412ab4a3f6dbd2bf18aefa9f79c7cca23744846b31f1d6555c2ee2b05a2e14ca"}, - {file = "pydantic-1.10.14-cp311-cp311-win_amd64.whl", hash = "sha256:e897c9f35281f7889873a3e6d6b69aa1447ceb024e8495a5f0d02ecd17742a7f"}, - {file = "pydantic-1.10.14-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d604be0f0b44d473e54fdcb12302495fe0467c56509a2f80483476f3ba92b33c"}, - {file = "pydantic-1.10.14-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a42c7d17706911199798d4c464b352e640cab4351efe69c2267823d619a937e5"}, - {file = "pydantic-1.10.14-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:596f12a1085e38dbda5cbb874d0973303e34227b400b6414782bf205cc14940c"}, - {file = "pydantic-1.10.14-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bfb113860e9288d0886e3b9e49d9cf4a9d48b441f52ded7d96db7819028514cc"}, - {file = "pydantic-1.10.14-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bc3ed06ab13660b565eed80887fcfbc0070f0aa0691fbb351657041d3e874efe"}, - {file = "pydantic-1.10.14-cp37-cp37m-win_amd64.whl", hash = "sha256:ad8c2bc677ae5f6dbd3cf92f2c7dc613507eafe8f71719727cbc0a7dec9a8c01"}, - {file = "pydantic-1.10.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c37c28449752bb1f47975d22ef2882d70513c546f8f37201e0fec3a97b816eee"}, - {file = "pydantic-1.10.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49a46a0994dd551ec051986806122767cf144b9702e31d47f6d493c336462597"}, - {file = "pydantic-1.10.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53e3819bd20a42470d6dd0fe7fc1c121c92247bca104ce608e609b59bc7a77ee"}, - {file = "pydantic-1.10.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fbb503bbbbab0c588ed3cd21975a1d0d4163b87e360fec17a792f7d8c4ff29f"}, - {file = "pydantic-1.10.14-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:336709883c15c050b9c55a63d6c7ff09be883dbc17805d2b063395dd9d9d0022"}, - {file = "pydantic-1.10.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4ae57b4d8e3312d486e2498d42aed3ece7b51848336964e43abbf9671584e67f"}, - {file = "pydantic-1.10.14-cp38-cp38-win_amd64.whl", hash = "sha256:dba49d52500c35cfec0b28aa8b3ea5c37c9df183ffc7210b10ff2a415c125c4a"}, - {file = "pydantic-1.10.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c66609e138c31cba607d8e2a7b6a5dc38979a06c900815495b2d90ce6ded35b4"}, - {file = "pydantic-1.10.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d986e115e0b39604b9eee3507987368ff8148222da213cd38c359f6f57b3b347"}, - {file = "pydantic-1.10.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:646b2b12df4295b4c3148850c85bff29ef6d0d9621a8d091e98094871a62e5c7"}, - {file = "pydantic-1.10.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:282613a5969c47c83a8710cc8bfd1e70c9223feb76566f74683af889faadc0ea"}, - {file = "pydantic-1.10.14-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:466669501d08ad8eb3c4fecd991c5e793c4e0bbd62299d05111d4f827cded64f"}, - {file = "pydantic-1.10.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:13e86a19dca96373dcf3190fcb8797d40a6f12f154a244a8d1e8e03b8f280593"}, - {file = "pydantic-1.10.14-cp39-cp39-win_amd64.whl", hash = "sha256:08b6ec0917c30861e3fe71a93be1648a2aa4f62f866142ba21670b24444d7fd8"}, - {file = "pydantic-1.10.14-py3-none-any.whl", 
hash = "sha256:8ee853cd12ac2ddbf0ecbac1c289f95882b2d4482258048079d13be700aa114c"}, - {file = "pydantic-1.10.14.tar.gz", hash = "sha256:46f17b832fe27de7850896f3afee50ea682220dd218f7e9c88d436788419dca6"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22ed12ee588b1df028a2aa5d66f07bf8f8b4c8579c2e96d5a9c1f96b77f3bb55"}, + {file = "pydantic-1.10.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:75279d3cac98186b6ebc2597b06bcbc7244744f6b0b44a23e4ef01e5683cc0d2"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50f1666a9940d3d68683c9d96e39640f709d7a72ff8702987dab1761036206bb"}, + {file = "pydantic-1.10.15-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82790d4753ee5d00739d6cb5cf56bceb186d9d6ce134aca3ba7befb1eedbc2c8"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:d207d5b87f6cbefbdb1198154292faee8017d7495a54ae58db06762004500d00"}, + {file = "pydantic-1.10.15-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e49db944fad339b2ccb80128ffd3f8af076f9f287197a480bf1e4ca053a866f0"}, + {file = "pydantic-1.10.15-cp310-cp310-win_amd64.whl", hash = "sha256:d3b5c4cbd0c9cb61bbbb19ce335e1f8ab87a811f6d589ed52b0254cf585d709c"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c3d5731a120752248844676bf92f25a12f6e45425e63ce22e0849297a093b5b0"}, + {file = "pydantic-1.10.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c365ad9c394f9eeffcb30a82f4246c0006417f03a7c0f8315d6211f25f7cb654"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3287e1614393119c67bd4404f46e33ae3be3ed4cd10360b48d0a4459f420c6a3"}, + {file = "pydantic-1.10.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be51dd2c8596b25fe43c0a4a59c2bee4f18d88efb8031188f9e7ddc6b469cf44"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6a51a1dd4aa7b3f1317f65493a182d3cff708385327c1c82c81e4a9d6d65b2e4"}, + {file = "pydantic-1.10.15-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4e316e54b5775d1eb59187f9290aeb38acf620e10f7fd2f776d97bb788199e53"}, + {file = "pydantic-1.10.15-cp311-cp311-win_amd64.whl", hash = "sha256:0d142fa1b8f2f0ae11ddd5e3e317dcac060b951d605fda26ca9b234b92214986"}, + {file = "pydantic-1.10.15-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7ea210336b891f5ea334f8fc9f8f862b87acd5d4a0cbc9e3e208e7aa1775dabf"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3453685ccd7140715e05f2193d64030101eaad26076fad4e246c1cc97e1bb30d"}, + {file = "pydantic-1.10.15-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bea1f03b8d4e8e86702c918ccfd5d947ac268f0f0cc6ed71782e4b09353b26f"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:005655cabc29081de8243126e036f2065bd7ea5b9dff95fde6d2c642d39755de"}, + {file = "pydantic-1.10.15-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:af9850d98fc21e5bc24ea9e35dd80a29faf6462c608728a110c0a30b595e58b7"}, + {file = "pydantic-1.10.15-cp37-cp37m-win_amd64.whl", hash = "sha256:d31ee5b14a82c9afe2bd26aaa405293d4237d0591527d9129ce36e58f19f95c1"}, + {file = "pydantic-1.10.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e09c19df304b8123938dc3c53d3d3be6ec74b9d7d0d80f4f4b5432ae16c2022"}, + {file = 
"pydantic-1.10.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7ac9237cd62947db00a0d16acf2f3e00d1ae9d3bd602b9c415f93e7a9fc10528"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:584f2d4c98ffec420e02305cf675857bae03c9d617fcfdc34946b1160213a948"}, + {file = "pydantic-1.10.15-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbc6989fad0c030bd70a0b6f626f98a862224bc2b1e36bfc531ea2facc0a340c"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d573082c6ef99336f2cb5b667b781d2f776d4af311574fb53d908517ba523c22"}, + {file = "pydantic-1.10.15-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6bd7030c9abc80134087d8b6e7aa957e43d35714daa116aced57269a445b8f7b"}, + {file = "pydantic-1.10.15-cp38-cp38-win_amd64.whl", hash = "sha256:3350f527bb04138f8aff932dc828f154847fbdc7a1a44c240fbfff1b57f49a12"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51d405b42f1b86703555797270e4970a9f9bd7953f3990142e69d1037f9d9e51"}, + {file = "pydantic-1.10.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a980a77c52723b0dc56640ced396b73a024d4b74f02bcb2d21dbbac1debbe9d0"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f1a1fb467d3f49e1708a3f632b11c69fccb4e748a325d5a491ddc7b5d22383"}, + {file = "pydantic-1.10.15-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:676ed48f2c5bbad835f1a8ed8a6d44c1cd5a21121116d2ac40bd1cd3619746ed"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:92229f73400b80c13afcd050687f4d7e88de9234d74b27e6728aa689abcf58cc"}, + {file = "pydantic-1.10.15-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2746189100c646682eff0bce95efa7d2e203420d8e1c613dc0c6b4c1d9c1fde4"}, + {file = "pydantic-1.10.15-cp39-cp39-win_amd64.whl", hash = "sha256:394f08750bd8eaad714718812e7fab615f873b3cdd0b9d84e76e51ef3b50b6b7"}, + {file = "pydantic-1.10.15-py3-none-any.whl", hash = "sha256:28e552a060ba2740d0d2aabe35162652c1459a0b9069fe0db7f4ee0e18e74d58"}, + {file = "pydantic-1.10.15.tar.gz", hash = "sha256:ca832e124eda231a60a041da4f013e3ff24949d94a01154b137fc2f2a43c3ffb"}, ] [package.dependencies] @@ -1963,13 +2042,13 @@ cli = ["click (>=5.0)"] [[package]] name = "python-ulid" -version = "2.3.0" +version = "2.4.0.post0" description = "Universally unique lexicographically sortable identifier" optional = false python-versions = ">=3.9" files = [ - {file = "python_ulid-2.3.0-py3-none-any.whl", hash = "sha256:1b6ac5b1fae214502feb50c7535ffa3f7f496f3f2abe73296be6bd0a6976bca5"}, - {file = "python_ulid-2.3.0.tar.gz", hash = "sha256:28108e5edf56ee981dd75ea12ae7279a8a23bf01514144dda7a64c38143204a5"}, + {file = "python_ulid-2.4.0.post0-py3-none-any.whl", hash = "sha256:e2c739e27e6d760136e5f411f311cdd3ec9c4c89696932fe803fa09a4dcd6ebe"}, + {file = "python_ulid-2.4.0.post0.tar.gz", hash = "sha256:45779c68b9060beb6fca72338a0620114489e1bbe274935149f14d1f776d4c43"}, ] [package.extras] @@ -2559,13 +2638,13 @@ files = [ [[package]] name = "types-jsonschema" -version = "4.21.0.20240311" +version = "4.21.0.20240331" description = "Typing stubs for jsonschema" optional = false python-versions = ">=3.8" files = [ - {file = "types-jsonschema-4.21.0.20240311.tar.gz", hash = "sha256:f7165ce70abd91df490c73b089873afd2899c5e56430ee495b64f851ad01f287"}, - {file = "types_jsonschema-4.21.0.20240311-py3-none-any.whl", hash = 
"sha256:e872f5661513824edf9698f73a66c9c114713d93eab58699bd0532e7e6db5750"}, + {file = "types-jsonschema-4.21.0.20240331.tar.gz", hash = "sha256:3a5ed0a72ab7bc304ca4accbb709272c620f396abf2fb19570b80d949e357eb6"}, + {file = "types_jsonschema-4.21.0.20240331-py3-none-any.whl", hash = "sha256:78dec1d88c5aec77e46e6bddce2a082157ce3059ec7aab19169b13b2ee553a51"}, ] [package.dependencies] @@ -2620,15 +2699,30 @@ files = [ [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +description = "Runtime inspection utilities for typing module." +optional = false +python-versions = "*" +files = [ + {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, + {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, +] + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + [[package]] name = "tzdata" version = "2024.1" @@ -2776,4 +2870,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "0b8786d136f0f18fa6b387555902acab38b79f65cb2576734bd8c826815162b1" +content-hash = "13b6f429df688ba505ffc513a714af167da17dc2acb34cd0749cda8d54183a73" diff --git a/pyproject.toml b/pyproject.toml index d42fc6a0..bb48efa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,14 @@ ulid = "^1.1" # TODO: Remove this arbitrary python constraint once `sqlalchemy-bigquery` has done so. sqlalchemy-bigquery = { version = "1.9.0", python = "<3.13" } +[tool.poetry.dependencies.airbyte-api] +git = "https://github.com/airbytehq/airbyte-api-python-sdk.git" +# Pinned to a specific commit to avoid breaking changes. 
+# TODO: Use a PyPi version of this after this resolves: +# https://github.com/airbytehq/airbyte-api-python-sdk/issues/67 +# rev = "aj/manual_rename_dir" This is the branch, but the commit is: +rev = "856599a4861ee1f0ee4e994feff22e44ffb4cbd4" + [tool.poetry.group.dev.dependencies] docker = "^7.0.0" faker = "^21.0.0" @@ -67,9 +75,12 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" [tool.pytest.ini_options] +addopts = "--strict-markers" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "super_slow: these super slow tests will not run in CI; they will only ever run on-demand", "requires_creds: marks a test as requiring credentials (skip when secrets unavailable)", + "linting: marks a test as a linting test", ] [tool.ruff.pylint] diff --git a/tests/conftest.py b/tests/conftest.py index 30d8a7ed..0d9e2f8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,23 +14,24 @@ from requests.exceptions import HTTPError import ulid -from airbyte._util.google_secrets import get_gcp_secret from airbyte._util.meta import is_windows from airbyte.caches.base import CacheBase from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache +from airbyte.caches.motherduck import MotherDuckCache from airbyte.caches.snowflake import SnowflakeCache import docker import psycopg2 as psycopg import pytest from _pytest.nodes import Item -from sqlalchemy import create_engine from airbyte.caches import PostgresCache from airbyte._executor import _get_bin_dir from airbyte.caches.util import new_local_cache -from airbyte.sources.base import as_temp_files +from airbyte.secrets import CustomSecretManager + +import airbyte as ab logger = logging.getLogger(__name__) @@ -41,23 +42,6 @@ LOCAL_TEST_REGISTRY_URL = "./tests/integration_tests/fixtures/registry.json" -AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing" - - -def get_ci_secret( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> str: - return get_gcp_secret(project_name=project_name, secret_name=secret_name) - - -def get_ci_secret_json( - secret_name, - project_name: str = AIRBYTE_INTERNAL_GCP_PROJECT, -) -> dict: - return json.loads(get_ci_secret(secret_name=secret_name, project_name=project_name)) - - def pytest_collection_modifyitems(items: list[Item]) -> None: """Override default pytest behavior, sorting our tests in a sensible execution order. 
@@ -72,13 +56,13 @@ def pytest_collection_modifyitems(items: list[Item]) -> None:
     def test_priority(item: Item) -> int:
         if item.get_closest_marker(name="slow"):
             return 9 # slow tests have the lowest priority
-        elif 'lint_tests' in str(item.fspath):
+        elif "lint_tests" in str(item.fspath):
             return 1 # lint tests have high priority
-        elif 'unit_tests' in str(item.fspath):
+        elif "unit_tests" in str(item.fspath):
             return 2 # unit tests have highest priority
-        elif 'docs_tests' in str(item.fspath):
+        elif "docs_tests" in str(item.fspath):
             return 3 # doc tests have medium priority
-        elif 'integration_tests' in str(item.fspath):
+        elif "integration_tests" in str(item.fspath):
             return 4 # integration tests have the lowest priority
         else:
             return 5 # all other tests have lower priority
@@ -92,6 +76,15 @@ def test_priority(item: Item) -> int:
         if True or not is_docker_available():
             item.add_marker(pytest.mark.skip(reason="Skipping tests (Docker not available)"))
+        # Every test in the cloud directory is slow and requires credentials
+        if "integration_tests/cloud" in str(item.fspath):
+            item.add_marker(pytest.mark.slow)
+            item.add_marker(pytest.mark.requires_creds)
+
+        if "super_slow" in item.keywords:
+            # Super slow tests are also slow
+            item.add_marker("slow")
+
 def is_port_in_use(port):
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -211,52 +204,6 @@ def new_postgres_cache():
     postgres.remove()
-@pytest.fixture
-def new_snowflake_cache():
-    secret = get_ci_secret_json(
-        "AIRBYTE_LIB_SNOWFLAKE_CREDS",
-    )
-    config = SnowflakeCache(
-        account=secret["account"],
-        username=secret["username"],
-        password=secret["password"],
-        database=secret["database"],
-        warehouse=secret["warehouse"],
-        role=secret["role"],
-        schema_name=f"test{str(ulid.ULID()).lower()[-6:]}",
-    )
-
-    yield config
-
-    engine = create_engine(config.get_sql_alchemy_url())
-    with engine.begin() as connection:
-        connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}")
-
-
-@pytest.fixture
-@pytest.mark.requires_creds
-def new_bigquery_cache():
-    dest_bigquery_config = get_ci_secret_json(
-        "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS"
-    )
-
-    dataset_name = f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}"
-    credentials_json = dest_bigquery_config["credentials_json"]
-    with as_temp_files([credentials_json]) as (credentials_path,):
-        cache = BigQueryCache(
-            credentials_path=credentials_path,
-            project_name=dest_bigquery_config["project_id"],
-            dataset_name=dataset_name
-        )
-        yield cache
-
-        url = cache.get_sql_alchemy_url()
-        engine = create_engine(url)
-        with suppress(Exception):
-            with engine.begin() as connection:
-                connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}")
-
-
 @pytest.fixture(autouse=True)
 def source_test_registry(monkeypatch):
     """
@@ -310,36 +257,3 @@ def source_test_installation():
 @pytest.fixture(scope="function")
 def new_duckdb_cache() -> DuckDBCache:
     return new_local_cache()
-
-
-@pytest.fixture(scope="function")
-def new_generic_cache(request) -> CacheBase:
-    """This is a placeholder fixture that will be overridden by pytest_generate_tests()."""
-    return request.getfixturevalue(request.param)
-
-
-def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
-    """Override default pytest behavior, parameterizing our tests based on the available cache types.
-
-    This is useful for running the same tests with different cache types, to ensure that the tests
-    can pass across all cache types.
- """ - all_cache_type_fixtures: dict[str, str] = { - # Ordered by priority (fastest first) - "DuckDB": "new_duckdb_cache", - "Postgres": "new_postgres_cache", - "BigQuery": "new_bigquery_cache", - "Snowflake": "new_snowflake_cache", - } - if is_windows(): - # Postgres tests require Linux containers - all_cache_type_fixtures.pop("Postgres") - - if "new_generic_cache" in metafunc.fixturenames: - metafunc.parametrize( - "new_generic_cache", - all_cache_type_fixtures.values(), - ids=all_cache_type_fixtures.keys(), - indirect=True, - scope="function", - ) diff --git a/tests/integration_tests/cloud/__init__.py b/tests/integration_tests/cloud/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration_tests/cloud/conftest.py b/tests/integration_tests/cloud/conftest.py new file mode 100644 index 00000000..d00915f3 --- /dev/null +++ b/tests/integration_tests/cloud/conftest.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Fixtures for Cloud Workspace integration tests.""" +from __future__ import annotations + +from enum import auto +import os +from pathlib import Path +import sys +import pytest +from airbyte._util.api_util import CLOUD_API_ROOT +from dotenv import dotenv_values +from airbyte._executor import _get_bin_dir +from airbyte.caches.base import CacheBase +from airbyte.cloud import CloudWorkspace +from airbyte._util.temp_files import as_temp_files +from airbyte.secrets.base import SecretString +from airbyte.secrets.google_gsm import GoogleGSMSecretManager + + +AIRBYTE_CLOUD_WORKSPACE_ID = "19d7a891-8e0e-40ac-8a8c-5faf8d11e47c" + +ENV_MOTHERDUCK_API_KEY = "PYAIRBYTE_MOTHERDUCK_API_KEY" +AIRBYTE_CLOUD_API_KEY_SECRET_NAME = "PYAIRBYTE_CLOUD_INTEROP_API_KEY" + + +@pytest.fixture(autouse=True) +def add_venv_bin_to_path(monkeypatch: pytest.MonkeyPatch) -> None: + """Patch the PATH to include the virtual environment's bin directory.""" + # Get the path to the bin directory of the virtual environment + venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) + + # Add the bin directory to the PATH + new_path = f"{venv_bin_path}{os.pathsep}{os.environ['PATH']}" + monkeypatch.setenv('PATH', new_path) + + +@pytest.fixture +def workspace_id() -> str: + return AIRBYTE_CLOUD_WORKSPACE_ID + + +@pytest.fixture +def airbyte_cloud_api_root() -> str: + return CLOUD_API_ROOT + + +@pytest.fixture +def airbyte_cloud_api_key(ci_secret_manager: GoogleGSMSecretManager) -> SecretString: + secret: SecretString | None = ci_secret_manager.get_secret(AIRBYTE_CLOUD_API_KEY_SECRET_NAME) + assert secret, f"Secret '{AIRBYTE_CLOUD_API_KEY_SECRET_NAME}' not found." + return secret + + +@pytest.fixture +def motherduck_api_key(motherduck_secrets: dict) -> SecretString: + return SecretString(motherduck_secrets["motherduck_api_key"]) + + +@pytest.fixture +def cloud_workspace( + workspace_id: str, + airbyte_cloud_api_key: SecretString, + airbyte_cloud_api_root: str, +) -> CloudWorkspace: + return CloudWorkspace( + workspace_id=workspace_id, + api_key=airbyte_cloud_api_key, + api_root=airbyte_cloud_api_root, + ) + + +@pytest.fixture(scope="function") +def new_deployable_cache(request) -> CacheBase: + """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" + return request.getfixturevalue(request.param) + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + """Override default pytest behavior, parameterizing our tests based on the available cache types. 
+ + This is useful for running the same tests with different cache types, to ensure that the tests + can pass across all cache types. + """ + deployable_cache_fixtures: dict[str, str] = { + # Ordered by priority (fastest first) + # "MotherDuck": "new_motherduck_cache", + # "Postgres": "new_remote_postgres_cache", + "BigQuery": "new_bigquery_cache", + "Snowflake": "new_snowflake_cache", + } + + if "new_deployable_cache" in metafunc.fixturenames: + metafunc.parametrize( + "new_deployable_cache", + deployable_cache_fixtures.values(), + ids=deployable_cache_fixtures.keys(), + indirect=True, + scope="function", + ) diff --git a/tests/integration_tests/cloud/test_cloud_api_util.py b/tests/integration_tests/cloud/test_cloud_api_util.py new file mode 100644 index 00000000..83dea4f6 --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_api_util.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. + +"""Integration tests which test CRUD operations on the Airbyte API. + +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations + +import ulid + +from airbyte._util import api_util +from airbyte_api.models.shared import SourceFaker, DestinationDuckdb + + +def test_create_and_delete_source( + workspace_id: str, + airbyte_cloud_api_root: str, + airbyte_cloud_api_key: str, +) -> None: + new_resource_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] + source_config = SourceFaker() + source = api_util.create_source( + name=new_resource_name, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + config=source_config, + ) + assert source.name == new_resource_name + assert source.source_type == "faker" + assert source.source_id + + api_util.delete_source( + source_id=source.source_id, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + ) + + +def test_create_and_delete_destination( + workspace_id: str, + airbyte_cloud_api_root: str, + airbyte_cloud_api_key: str, + motherduck_api_key: str, +) -> None: + new_resource_name = "deleteme-destination-faker" + str(ulid.ULID()).lower()[-6:] + destination_config = DestinationDuckdb( + destination_path="temp_db", + motherduck_api_key=motherduck_api_key, + ) + + destination = api_util.create_destination( + name=new_resource_name, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + config=destination_config, + ) + assert destination.name == new_resource_name + assert destination.destination_type == "duckdb" + assert destination.destination_id + + api_util.delete_destination( + destination_id=destination.destination_id, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + ) + + +def test_create_and_delete_connection( + workspace_id: str, + airbyte_cloud_api_root: str, + airbyte_cloud_api_key: str, + motherduck_api_key: str, +) -> None: + new_source_name = "deleteme-source-faker" + str(ulid.ULID()).lower()[-6:] + new_destination_name = "deleteme-destination-dummy" + str(ulid.ULID()).lower()[-6:] + new_connection_name = "deleteme-connection-dummy" + str(ulid.ULID()).lower()[-6:] + source = api_util.create_source( + name=new_source_name, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + config=SourceFaker(), + ) + assert source.name == new_source_name + assert source.source_type == "faker" + assert source.source_id + + destination = 
api_util.create_destination( + name=new_destination_name, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + config=DestinationDuckdb( + destination_path="temp_db", + motherduck_api_key=motherduck_api_key, + ), + ) + assert destination.name == new_destination_name + assert destination.destination_type == "duckdb" + assert destination.destination_id + + connection = api_util.create_connection( + name=new_connection_name, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + source_id=source.source_id, + destination_id=destination.destination_id, + prefix="", + selected_stream_names=["users", "purchases", "products"], + ) + assert connection.source_id == source.source_id + assert connection.destination_id == destination.destination_id + assert connection.connection_id + + api_util.delete_connection( + connection_id=connection.connection_id, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + ) + api_util.delete_source( + source_id=source.source_id, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + ) + api_util.delete_destination( + destination_id=destination.destination_id, + api_root=airbyte_cloud_api_root, + api_key=airbyte_cloud_api_key, + workspace_id=workspace_id, + ) diff --git a/tests/integration_tests/cloud/test_cloud_sql_reads.py b/tests/integration_tests/cloud/test_cloud_sql_reads.py new file mode 100644 index 00000000..15e69064 --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_sql_reads.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Integration tests for reading from cache.""" +from __future__ import annotations +from contextlib import suppress + +import pytest +import pandas as pd +from sqlalchemy.engine.base import Engine + +import airbyte as ab +from airbyte import cloud +from airbyte.cloud.sync_results import SyncResult + +@pytest.fixture +def deployable_source() -> ab.Source: + return ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + + +@pytest.fixture +def previous_job_run_id() -> str: + return "10136196" + + +@pytest.mark.super_slow +def test_deploy_and_run_and_read( + cloud_workspace: cloud.CloudWorkspace, + new_deployable_cache: ab.BigQueryCache | ab.SnowflakeCache, + deployable_source: ab.Source, +) -> None: + """Test reading from a cache.""" + + # Deploy source, destination, and connection: + source_id = cloud_workspace._deploy_source(source=deployable_source) + destination_id = cloud_workspace._deploy_cache_as_destination(cache=new_deployable_cache) + connection: cloud.CloudConnection = cloud_workspace._deploy_connection( + source=deployable_source, + cache=new_deployable_cache, + table_prefix=new_deployable_cache.table_prefix, + selected_streams=deployable_source.get_selected_streams(), + ) + + # Run sync and get result: + sync_result: SyncResult = connection.run_sync() + + # TODO: Remove this second run after Destination bug is resolved: + # https://github.com/airbytehq/airbyte/issues/36875 + sync_result: SyncResult = connection.run_sync() + + # Check sync result: + assert sync_result.is_job_complete() + assert set(sync_result.stream_names) == set(["users", "products", "purchases"]) + + dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users") + assert dataset.stream_name == "users" + data_as_list = list(dataset) + assert len(data_as_list) == 
100
+
+    # Cleanup
+    with suppress(Exception):
+        cloud_workspace._permanently_delete_connection(
+            connection=connection,
+            delete_source=True,
+            delete_destination=True,
+        )
+    with suppress(Exception):
+        cloud_workspace._permanently_delete_source(source_id=source_id)
+    with suppress(Exception):
+        cloud_workspace._permanently_delete_destination(destination_id=destination_id)
+
+
+@pytest.mark.parametrize(
+    "deployed_connection_id",
+    [
+        pytest.param("c7b4d838-a612-495a-9d91-a14e477add51", id="Faker->Snowflake"),
+        pytest.param("0e1d6b32-b8e3-4b68-91a3-3a314599c782", id="Faker->BigQuery"),
+        pytest.param("", id="Faker->Postgres", marks=pytest.mark.skip(reason="Not yet supported")),
+        pytest.param("", id="Faker->MotherDuck", marks=pytest.mark.skip(reason="Not yet supported")),
+    ],
+)
+def test_read_from_deployed_connection(
+    cloud_workspace: cloud.CloudWorkspace,
+    deployed_connection_id: str,
+) -> None:
+    """Test reading from a cache."""
+    # Run sync and get result:
+    sync_result: SyncResult = cloud_workspace.get_sync_result(connection_id=deployed_connection_id)
+
+    # Test sync result:
+    assert sync_result.is_job_complete()
+
+    cache = sync_result.get_sql_cache()
+    sqlalchemy_url = cache.get_sql_alchemy_url()
+    engine: Engine = sync_result.get_sql_engine()
+    # assert sync_result.stream_names == ["users", "products", "purchases"]
+
+    dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users")
+    assert dataset.stream_name == "users"
+    data_as_list = list(dataset)
+    assert len(data_as_list) == 100
+
+    # TODO: Fails on BigQuery: https://github.com/airbytehq/PyAirbyte/issues/165
+    # pandas_df = dataset.to_pandas()
+
+    pandas_df = pd.DataFrame(data_as_list)
+
+    assert pandas_df.shape == (100, 20)
+
+    # Check that no values are null
+    for col in pandas_df.columns:
+        assert pandas_df[col].notnull().all()
+
+
+@pytest.mark.parametrize(
+    "deployed_connection_id",
+    [
+        pytest.param("c7b4d838-a612-495a-9d91-a14e477add51", id="Faker->Snowflake"),
+        pytest.param("0e1d6b32-b8e3-4b68-91a3-3a314599c782", id="Faker->BigQuery"),
+        pytest.param("", id="Faker->Postgres", marks=pytest.mark.skip(reason="Not yet supported")),
+        pytest.param("", id="Faker->MotherDuck", marks=pytest.mark.skip(reason="Not yet supported")),
+    ],
+)
+def test_read_from_previous_job(
+    cloud_workspace: cloud.CloudWorkspace,
+    deployed_connection_id: str,
+    previous_job_run_id: str,
+) -> None:
+    """Test reading from a cache."""
+    # Run sync and get result:
+    sync_result: SyncResult = cloud_workspace.get_sync_result(
+        connection_id=deployed_connection_id,
+        job_id=previous_job_run_id,
+    )
+
+    # Test sync result:
+    assert sync_result.is_job_complete()
+
+    cache = sync_result.get_sql_cache()
+    sqlalchemy_url = cache.get_sql_alchemy_url()
+    engine: Engine = sync_result.get_sql_engine()
+
+    assert "users" in sync_result.stream_names
+    dataset: ab.CachedDataset = sync_result.get_dataset(stream_name="users")
+    assert dataset.stream_name == "users"
+    data_as_list = list(dataset)
+    assert len(data_as_list) == 100
+
+    # TODO: Fails on BigQuery: https://github.com/airbytehq/PyAirbyte/issues/165
+    # pandas_df = dataset.to_pandas()
+
+    pandas_df = pd.DataFrame(data_as_list)
+
+    assert pandas_df.shape == (100, 20)
+    for col in pandas_df.columns:
+        # Check that no values are null
+        assert pandas_df[col].notnull().all()
diff --git a/tests/integration_tests/cloud/test_cloud_sync.py b/tests/integration_tests/cloud/test_cloud_sync.py
new file mode 100644
index 00000000..2c79cc24
--- /dev/null
+++ 
b/tests/integration_tests/cloud/test_cloud_sync.py @@ -0,0 +1,77 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Cloud Workspace integration tests. + +These tests are designed to be run against a running instance of the Airbyte API. +""" +from __future__ import annotations + +import pytest + +import airbyte as ab +from airbyte.caches import MotherDuckCache +from airbyte.cloud import CloudWorkspace +from airbyte.cloud.sync_results import SyncResult + + +@pytest.fixture +def pre_created_connection_id() -> str: + return "80857d37-1f21-4500-a802-f5ac08d1a3dd" + + +@pytest.mark.super_slow +def test_run_connection( + cloud_workspace: CloudWorkspace, + pre_created_connection_id: str, +) -> None: + """Test running a connection.""" + sync_result: SyncResult = cloud_workspace.run_sync(connection_id=pre_created_connection_id) + assert sync_result.is_job_complete() + assert sync_result.stream_names + + + +@pytest.mark.super_slow +def test_get_previous_sync_result( + cloud_workspace: CloudWorkspace, + pre_created_connection_id: str, +) -> None: + """Test running a connection.""" + sync_result: SyncResult = cloud_workspace.get_previous_sync_logs( + connection_id=pre_created_connection_id, + ) + assert sync_result.is_job_complete() + assert sync_result.get_job_status() + assert sync_result.stream_names + + + +@pytest.mark.super_slow +@pytest.mark.skip(reason="This test is not yet complete. It is hanging currently.") +def test_deploy_and_run_connection( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + + connection_id: str = cloud_workspace._deploy_connection(source=source, cache=cache) + sync_result = cloud_workspace.run_sync(connection_id=connection_id) + _ = sync_result + + cache = sync_result.get_sql_cache() + assert cache.stream_names + assert cache.streams["users"].to_pandas() + + cloud_workspace._permanently_delete_connection(connection_id=connection_id) diff --git a/tests/integration_tests/cloud/test_cloud_workspaces.py b/tests/integration_tests/cloud/test_cloud_workspaces.py new file mode 100644 index 00000000..fb6cd930 --- /dev/null +++ b/tests/integration_tests/cloud/test_cloud_workspaces.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +"""Cloud Workspace integration tests. + +These tests are designed to be run against a running instance of the Airbyte API. 
+""" +from __future__ import annotations + +import airbyte as ab +from airbyte.caches import MotherDuckCache +from airbyte.cloud import CloudWorkspace +from airbyte.cloud.connections import CloudConnection + + +def test_deploy_source( + cloud_workspace: CloudWorkspace, +) -> None: + """Test deploying a source to a workspace.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + source_id: str = cloud_workspace._deploy_source(source) + + cloud_workspace._permanently_delete_source(source=source_id) + + +def test_deploy_cache_as_destination( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a cache to a workspace as a destination.""" + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + ) + destination_id: str = cloud_workspace._deploy_cache_as_destination(cache=cache) + cloud_workspace._permanently_delete_destination(destination=destination_id) + + +def test_deploy_connection( + cloud_workspace: CloudWorkspace, + motherduck_api_key: str, +) -> None: + """Test deploying a source and cache to a workspace as a new connection.""" + source = ab.get_source( + "source-faker", + local_executable="source-faker", + config={"count": 100}, + install_if_missing=False, + ) + source.check() + + cache = MotherDuckCache( + api_key=motherduck_api_key, + database="temp", + schema_name="public", + table_prefix="abc_deleteme_", + # table_suffix="", # Suffix not supported in CloudConnection + ) + + connection: CloudConnection = cloud_workspace._deploy_connection( + source=source, + cache=cache, + ) + assert set(connection.stream_names) == set(["users", "products", "purchases"]) + assert connection.table_prefix == "abc_deleteme_" + # assert connection.table_suffix == "" # Suffix not supported in CloudConnection + cloud_workspace._permanently_delete_connection( + connection=connection, + delete_source=True, + delete_destination=True, + ) diff --git a/tests/integration_tests/conftest.py b/tests/integration_tests/conftest.py new file mode 100644 index 00000000..e2976348 --- /dev/null +++ b/tests/integration_tests/conftest.py @@ -0,0 +1,174 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. 
+"""Fixtures for integration tests."""
+
+from __future__ import annotations
+from contextlib import suppress
+import os
+
+import pytest
+import ulid
+from sqlalchemy import create_engine
+
+from airbyte._util import meta
+from airbyte.caches.base import CacheBase
+from airbyte.caches.bigquery import BigQueryCache
+from airbyte.caches.motherduck import MotherDuckCache
+from airbyte.caches.snowflake import SnowflakeCache
+from airbyte.secrets import CustomSecretManager, GoogleGSMSecretManager, SecretHandle
+from airbyte._util.temp_files import as_temp_files
+
+import airbyte as ab
+
+
+AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing"
+
+
+@pytest.fixture(scope="session")
+def ci_secret_manager() -> GoogleGSMSecretManager:
+    secret = ab.get_secret("GCP_GSM_CREDENTIALS")
+    if not secret or secret.is_empty():
+        pytest.skip("GCP_GSM_CREDENTIALS secret not found.")
+
+    return GoogleGSMSecretManager(
+        project=AIRBYTE_INTERNAL_GCP_PROJECT,
+        credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"),
+    )
+
+
+def get_connector_config(connector_name: str, index: int = 0) -> dict | None:
+    """Retrieve the connector configuration from GSM."""
+    gsm_secrets_manager = GoogleGSMSecretManager(
+        project=AIRBYTE_INTERNAL_GCP_PROJECT,
+        credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"),
+    )
+    first_secret: SecretHandle | None = next(gsm_secrets_manager.fetch_connector_secrets(
+        connector_name=connector_name,
+    ), None)
+    if first_secret is None:
+        return None
+
+    print(f"Found '{connector_name}' credential secret '{first_secret.secret_name}'.")
+    return first_secret.get_value().parse_json()
+
+
+@pytest.fixture(scope="session")
+def motherduck_secrets(ci_secret_manager: GoogleGSMSecretManager) -> dict:
+    return ci_secret_manager.get_secret(
+        "SECRET_DESTINATION_DUCKDB__MOTHERDUCK__CREDS",
+    ).parse_json()
+
+
+@pytest.fixture
+def new_motherduck_cache(
+    motherduck_secrets,
+) -> MotherDuckCache:
+    return MotherDuckCache(
+        database="integration_tests_deleteany",
+        schema_name=f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}",
+        api_key=motherduck_secrets["motherduck_api_key"],
+    )
+
+
+@pytest.fixture(scope="session")
+def snowflake_creds(ci_secret_manager: GoogleGSMSecretManager) -> dict:
+    return ci_secret_manager.get_secret(
+        "AIRBYTE_LIB_SNOWFLAKE_CREDS",
+    ).parse_json()
+
+
+@pytest.fixture
+def new_snowflake_cache(snowflake_creds: dict):
+    config = SnowflakeCache(
+        account=snowflake_creds["account"],
+        username=snowflake_creds["username"],
+        password=snowflake_creds["password"],
+        database=snowflake_creds["database"],
+        warehouse=snowflake_creds["warehouse"],
+        role=snowflake_creds["role"],
+        schema_name=f"test{str(ulid.ULID()).lower()[-6:]}",
+    )
+    sqlalchemy_url = config.get_sql_alchemy_url()
+
+    yield config
+
+    engine = create_engine(config.get_sql_alchemy_url())
+    with engine.begin() as connection:
+        connection.execute(f"DROP SCHEMA IF EXISTS {config.schema_name}")
+
+
+@pytest.fixture
+def new_bigquery_cache(ci_secret_manager: GoogleGSMSecretManager):
+    dest_bigquery_config = ci_secret_manager.get_secret(
+        "SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS"
+    ).parse_json()
+
+    dataset_name = f"test_deleteme_{str(ulid.ULID()).lower()[-6:]}"
+    credentials_json = dest_bigquery_config["credentials_json"]
+    with as_temp_files([credentials_json]) as (credentials_path,):
+        cache = BigQueryCache(
+            credentials_path=credentials_path,
+            project_name=dest_bigquery_config["project_id"],
+            dataset_name=dataset_name,
+        )
+        yield cache
+
+        url = cache.get_sql_alchemy_url()
+        engine = create_engine(url)
+        with suppress(Exception):
+
with engine.begin() as connection: + connection.execute(f"DROP SCHEMA IF EXISTS {cache.schema_name}") + + +@pytest.fixture(autouse=True, scope="session") +def bigquery_credentials_file(ci_secret_manager: GoogleGSMSecretManager): + dest_bigquery_config = ci_secret_manager.get_secret( + secret_name="SECRET_DESTINATION-BIGQUERY_CREDENTIALS__CREDS" + ).parse_json() + + credentials_json = dest_bigquery_config["credentials_json"] + with as_temp_files(files_contents=[credentials_json]) as (credentials_path,): + os.environ["BIGQUERY_CREDENTIALS_PATH"] = credentials_path + + yield + + return + + +@pytest.fixture(autouse=True, scope="session") +def with_snowflake_password_env_var(snowflake_creds: dict): + os.environ["SNOWFLAKE_PASSWORD"] = snowflake_creds["password"] + + yield + + return + + +@pytest.fixture(scope="function") +def new_generic_cache(request) -> CacheBase: + """This is a placeholder fixture that will be overridden by pytest_generate_tests().""" + return request.getfixturevalue(request.param) + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + """Override default pytest behavior, parameterizing our tests based on the available cache types. + + This is useful for running the same tests with different cache types, to ensure that the tests + can pass across all cache types. + """ + all_cache_type_fixtures: dict[str, str] = { + # Ordered by priority (fastest first) + "DuckDB": "new_duckdb_cache", + "Postgres": "new_postgres_cache", + "BigQuery": "new_bigquery_cache", + "Snowflake": "new_snowflake_cache", + } + if meta.is_windows(): + # Postgres tests require Linux containers + all_cache_type_fixtures.pop("Postgres") + + if "new_generic_cache" in metafunc.fixturenames: + metafunc.parametrize( + "new_generic_cache", + all_cache_type_fixtures.values(), + ids=all_cache_type_fixtures.keys(), + indirect=True, + scope="function", + ) diff --git a/tests/unit_tests/test_bigquery_cache.py b/tests/integration_tests/test_bigquery_cache.py similarity index 100% rename from tests/unit_tests/test_bigquery_cache.py rename to tests/integration_tests/test_bigquery_cache.py diff --git a/tests/integration_tests/test_duckdb_cache.py b/tests/integration_tests/test_duckdb_cache.py index abdcaf11..b13ceab9 100644 --- a/tests/integration_tests/test_duckdb_cache.py +++ b/tests/integration_tests/test_duckdb_cache.py @@ -34,10 +34,9 @@ FAKER_SCALE_B = 300 -# Patch PATH to include the source-faker executable. - @pytest.fixture(autouse=True) def add_venv_bin_to_path(monkeypatch): + """Patch the PATH to include the virtual environment's bin directory.""" # Get the path to the bin directory of the virtual environment venv_bin_path = str(_get_bin_dir(Path(sys.prefix))) @@ -76,9 +75,3 @@ def duckdb_cache() -> Generator[DuckDBCache, None, None]: yield cache # TODO: Delete cache DB file after test is complete. 
return - - -def test_duckdb_cache(duckdb_cache: DuckDBCache) -> None: - """Test that the duckdb cache is available.""" - assert duckdb_cache - assert isinstance(duckdb_cache, DuckDBCache) diff --git a/tests/integration_tests/test_source_faker_integration.py b/tests/integration_tests/test_source_faker_integration.py index 1a2318c6..dde24bbf 100644 --- a/tests/integration_tests/test_source_faker_integration.py +++ b/tests/integration_tests/test_source_faker_integration.py @@ -128,12 +128,6 @@ def test_which_source_faker() -> None: f"Can't find source-faker on PATH: {os.environ['PATH']}" -def test_duckdb_cache(duckdb_cache: DuckDBCache) -> None: - """Test that the duckdb cache is available.""" - assert duckdb_cache - assert isinstance(duckdb_cache, DuckDBCache) - - def test_faker_pks( source_faker_seed_a: ab.Source, duckdb_cache: DuckDBCache, diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 5672ebe6..e42c93e4 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -607,7 +607,7 @@ def test_lazy_dataset_from_source( pop_internal_columns_from_dataset(list_from_iter_b) # Make sure that we get a key error if we try to access a stream that doesn't exist - with pytest.raises(exc.AirbyteLibInputError): + with pytest.raises(exc.PyAirbyteInputError): source.get_records(not_a_stream_name) # Make sure we can iterate on all available streams diff --git a/tests/unit_tests/test_exceptions.py b/tests/unit_tests/test_exceptions.py index 919ea7ed..11f0a1cb 100644 --- a/tests/unit_tests/test_exceptions.py +++ b/tests/unit_tests/test_exceptions.py @@ -21,7 +21,9 @@ def test_exceptions(): assert message.count("\n") == 0 assert message != "" assert message.strip() == message - assert name.startswith("Airbyte") + assert any( + [name.startswith(prefix) for prefix in ["Airbyte", "PyAirbyte"]] + ), f"{name} does not start with Airbyte or PyAirbyte" assert name.endswith("Error")
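
Illustrative sketch, not part of the patch above: how a test opts into the new `super_slow` and `requires_creds` markers that this change registers in `pyproject.toml`. With `addopts = "--strict-markers"` now enabled, any marker not declared there fails collection instead of being silently ignored. The test name below is made up for illustration.

```python
import pytest


@pytest.mark.super_slow
@pytest.mark.requires_creds
def test_full_sync_against_live_destination() -> None:
    """Hypothetical long-running test; only ever run on demand, never in routine runs."""
    ...  # long-running, credentialed test body


# Deselect these tests with a standard marker expression:
#   poetry run pytest -m "not super_slow"
```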
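Likewise, the `pytest_generate_tests()` hook in `tests/integration_tests/conftest.py` parameterizes any test that requests the `new_generic_cache` fixture, so a single test body runs once per available cache backend (DuckDB, Postgres, BigQuery, Snowflake). A minimal sketch of such a consumer follows; it is not taken from the diff, the test name and assertion are illustrative, and it assumes the public `Source.read(cache=...)` signature.

```python
import airbyte as ab
from airbyte.caches.base import CacheBase


def test_users_stream_round_trip(new_generic_cache: CacheBase) -> None:
    """Hypothetical consumer test: executed once per parameterized cache backend."""
    source = ab.get_source(
        "source-faker",
        config={"count": 100, "seed": 0},
        install_if_missing=True,
    )
    source.check()
    source.select_streams(["users"])
    source.read(cache=new_generic_cache)  # assumed signature: read(cache=...)
    # The cache exposes the synced streams, as seen elsewhere in this patch.
    assert "users" in new_generic_cache.stream_names
```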