From 01363dd736f595c3c5350412749ef40d3023a9fc Mon Sep 17 00:00:00 2001 From: Jason T Brown Date: Mon, 6 Jan 2025 21:28:21 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Source=20Azure=20Blob=20Storage:=20?= =?UTF-8?q?add=20client=5Fcredentials=20auth=20(#50398)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Octavia Squidington III Co-authored-by: Marcos Marx Co-authored-by: marcosmarxm --- .../integration_tests/spec.json | 73 ++++++++++++------- .../source-azure-blob-storage/metadata.yaml | 2 +- .../source-azure-blob-storage/pyproject.toml | 2 +- .../source_azure_blob_storage/spec.py | 21 +++++- .../stream_reader.py | 56 ++++++++++++-- .../unit_tests/test_authenticator.py | 22 +++++- .../unit_tests/test_config_migration.py | 25 +++++-- .../sources/azure-blob-storage.md | 33 +++++++-- 8 files changed, 186 insertions(+), 48 deletions(-) diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json index 6d18352e7754..cf2e9e44688b 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json @@ -35,9 +35,7 @@ "default": ["**"], "order": 1, "type": "array", - "items": { - "type": "string" - } + "items": { "type": "string" } }, "legacy_prefix": { "title": "Legacy Prefix", @@ -136,9 +134,7 @@ "description": "A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.", "default": [], "type": "array", - "items": { - "type": "string" - }, + "items": { "type": "string" }, "uniqueItems": true }, "strings_can_be_null": { @@ -162,9 +158,7 @@ "header_definition": { "title": "CSV Header Definition", "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.", - "default": { - "header_definition_type": "From CSV" - }, + "default": { "header_definition_type": "From CSV" }, "oneOf": [ { "title": "From CSV", @@ -206,9 +200,7 @@ "title": "Column Names", "description": "The column names that will be used while emitting the CSV records", "type": "array", - "items": { - "type": "string" - } + "items": { "type": "string" } } }, "required": ["column_names", "header_definition_type"] @@ -221,9 +213,7 @@ "description": "A set of case-sensitive strings that should be interpreted as true values.", "default": ["y", "yes", "t", "true", "on", "1"], "type": "array", - "items": { - "type": "string" - }, + "items": { "type": "string" }, "uniqueItems": true }, "false_values": { @@ -231,9 +221,7 @@ "description": "A set of case-sensitive strings that should be interpreted as false values.", "default": ["n", "no", "f", "false", "off", "0"], "type": "array", - "items": { - "type": "string" - }, + "items": { "type": "string" }, "uniqueItems": true }, "inference_type": { @@ -313,9 +301,7 @@ "processing": { "title": "Processing", "description": "Processing configuration", - "default": { - "mode": "local" - }, + "default": { "mode": "local" }, "type": "object", "oneOf": [ { @@ -401,6 +387,43 @@ "auth_type" ] }, + { + "title": "Authenticate via Client Credentials", + "type": "object", + "properties": { + "auth_type": { + "title": "Auth Type", + "default": "client_credentials", + "const": "client_credentials", + "enum": ["client_credentials"], + "type": "string" + }, + "app_tenant_id": { + "title": "Tenant ID", + "description": "Tenant ID of the Microsoft Azure Application", + "airbyte_secret": true, + "type": "string" + }, + "app_client_id": { + "title": "Client ID", + "description": "Client ID of your Microsoft developer application", + "airbyte_secret": true, + "type": "string" + }, + "app_client_secret": { + "title": "Client Secret", + "description": "Client Secret of your Microsoft developer application", + "airbyte_secret": true, + "type": "string" + } + }, + "required": [ + "app_tenant_id", + "app_client_id", + "app_client_secret", + "auth_type" + ] + }, { "title": "Authenticate via Storage Account Key", "type": "object", @@ -485,12 +508,8 @@ "type": "object", "additionalProperties": false, "properties": { - "client_id": { - "type": "string" - }, - "client_secret": { - "type": "string" - } + "client_id": { "type": "string" }, + "client_secret": { "type": "string" } } }, "complete_oauth_server_output_specification": { diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml index 414b161dc19a..8ab59ad83768 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml @@ -12,7 +12,7 @@ data: connectorSubtype: file connectorType: source definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093 - dockerImageTag: 0.4.4 + dockerImageTag: 0.5.0 dockerRepository: airbyte/source-azure-blob-storage documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage githubIssueLabel: source-azure-blob-storage diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml b/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml index 8fbfa597d162..5dfb28755435 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "0.4.4" +version = "0.5.0" name = "source-azure-blob-storage" description = "Source implementation for Azure Blob Storage." authors = [ "Airbyte ",] diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py index 51ce990fd215..c92a71d73175 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py @@ -36,6 +36,25 @@ class Config(OneOfOptionConfig): ) +class ClientCredentials(BaseModel): + class Config(OneOfOptionConfig): + title = "Authenticate via Client Credentials" + discriminator = "auth_type" + + auth_type: Literal["client_credentials"] = Field("client_credentials", const=True) + app_tenant_id: str = Field(title="Tenant ID", description="Tenant ID of the Microsoft Azure Application", airbyte_secret=True) + app_client_id: str = Field( + title="Client ID", + description="Client ID of your Microsoft developer application", + airbyte_secret=True, + ) + app_client_secret: str = Field( + title="Client Secret", + description="Client Secret of your Microsoft developer application", + airbyte_secret=True, + ) + + class StorageAccountKey(BaseModel): class Config(OneOfOptionConfig): title = "Authenticate via Storage Account Key" @@ -61,7 +80,7 @@ class SourceAzureBlobStorageSpec(AbstractFileBasedSpec): def documentation_url(cls) -> AnyUrl: return AnyUrl("https://docs.airbyte.com/integrations/sources/azure-blob-storage", scheme="https") - credentials: Union[Oauth2, StorageAccountKey] = Field( + credentials: Union[Oauth2, ClientCredentials, StorageAccountKey] = Field( title="Authentication", description="Credentials for connecting to the Azure Blob Storage", discriminator="auth_type", diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py index 5a835782741e..be5febe283f0 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py @@ -3,10 +3,10 @@ import logging from io import IOBase -from typing import Iterable, List, Optional, Union +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union import pytz -from azure.core.credentials import AccessToken +from azure.core.credentials import AccessToken, TokenCredential from azure.core.exceptions import ResourceNotFoundError from azure.storage.blob import BlobServiceClient, ContainerClient from smart_open import open @@ -19,7 +19,46 @@ from .spec import SourceAzureBlobStorageSpec -class AzureOauth2Authenticator(Oauth2Authenticator): +class AzureClientCredentialsAuthenticator(Oauth2Authenticator, TokenCredential): + def __init__(self, tenant_id: str, client_id: str, client_secret: str, **kwargs): + super().__init__( + token_refresh_endpoint=f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token", + client_id=client_id, + client_secret=client_secret, + grant_type="client_credentials", + scopes=["https://storage.azure.com/.default"], + refresh_token=None, + ) + + def build_refresh_request_body(self) -> Mapping[str, Any]: + """ + Returns the request body to set on the refresh request + + Override to define additional parameters + """ + payload: MutableMapping[str, Any] = { + "grant_type": self.get_grant_type(), + "client_id": self.get_client_id(), + "client_secret": self.get_client_secret(), + } + + if self.get_scopes(): + payload["scope"] = " ".join(self.get_scopes()) + + if self.get_refresh_request_body(): + for key, val in self.get_refresh_request_body().items(): + # We defer to existing oauth constructs over custom configured fields + if key not in payload: + payload[key] = val + + return payload + + def get_token(self, *args, **kwargs) -> AccessToken: + """Parent class handles Oauth Refresh token logic.""" + return AccessToken(token=self.get_access_token(), expires_on=int(self.get_token_expiry_date().timestamp())) + + +class AzureOauth2Authenticator(Oauth2Authenticator, TokenCredential): """ Authenticator for Azure Blob Storage SDK to align with azure.core.credentials.TokenCredential protocol """ @@ -63,17 +102,24 @@ def azure_blob_service_client(self): return BlobServiceClient(self.account_url, credential=self._credentials) @property - def azure_credentials(self) -> Union[str, AzureOauth2Authenticator]: + def azure_credentials(self) -> Union[str, AzureOauth2Authenticator, AzureClientCredentialsAuthenticator]: if not self._credentials: if self.config.credentials.auth_type == "storage_account_key": self._credentials = self.config.credentials.azure_blob_storage_account_key - else: + elif self.config.credentials.auth_type == "oauth2": self._credentials = AzureOauth2Authenticator( token_refresh_endpoint=f"https://login.microsoftonline.com/{self.config.credentials.tenant_id}/oauth2/v2.0/token", client_id=self.config.credentials.client_id, client_secret=self.config.credentials.client_secret, refresh_token=self.config.credentials.refresh_token, ) + elif self.config.credentials.auth_type == "client_credentials": + self._credentials = AzureClientCredentialsAuthenticator( + tenant_id=self.config.credentials.app_tenant_id, + client_id=self.config.credentials.app_client_id, + client_secret=self.config.credentials.app_client_secret, + ) + return self._credentials def get_matching_files( diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_authenticator.py b/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_authenticator.py index 0ff89dec1f18..0fa869fd1739 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_authenticator.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_authenticator.py @@ -2,7 +2,7 @@ from azure.core.credentials import AccessToken -from source_azure_blob_storage.stream_reader import AzureOauth2Authenticator +from source_azure_blob_storage.stream_reader import AzureClientCredentialsAuthenticator, AzureOauth2Authenticator def test_custom_authenticator(requests_mock): @@ -24,3 +24,23 @@ def test_custom_authenticator(requests_mock): new_token = authenticator.get_token() assert isinstance(new_token, AccessToken) assert new_token.token == "access_token" + + +def test_client_authenticator(requests_mock): + authenticator = AzureClientCredentialsAuthenticator( + token_refresh_endpoint="https://login.microsoftonline.com/tenant_id/oauth2/v2.0/token", + tenant_id="tenant_id", + client_id="client_id", + client_secret="client_secret", + ) + token_response = { + "token_type": "Bearer", + "scope": "https://storage.azure.com/.default", + "expires_in": 3600, + "ext_expires_in": 3600, + "access_token": "access_token_123", + } + requests_mock.post("https://login.microsoftonline.com/tenant_id/oauth2/v2.0/token", json=token_response) + new_token = authenticator.get_token() + assert isinstance(new_token, AccessToken) + assert new_token.token == "access_token_123" diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_config_migration.py b/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_config_migration.py index 160a7b2bf190..b71ee1372fef 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_config_migration.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/unit_tests/test_config_migration.py @@ -3,22 +3,35 @@ import json import os +from pathlib import Path +from shutil import copytree +from tempfile import TemporaryDirectory from typing import Any, Mapping +from pytest import fixture from source_azure_blob_storage import SourceAzureBlobStorage, SourceAzureBlobStorageSpec, SourceAzureBlobStorageStreamReader from source_azure_blob_storage.config_migrations import MigrateCredentials, MigrateLegacyConfig from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor +@fixture +def temp_configs(): + config_path = f"{os.path.dirname(__file__)}/test_configs/" + with TemporaryDirectory() as _tempdir: + configs_dir = Path(_tempdir) / "test_configs" + copytree(config_path, configs_dir) + yield configs_dir + + # HELPERS def load_config(config_path: str) -> Mapping[str, Any]: with open(config_path, "r") as config: return json.load(config) -def test_legacy_config_migration(): - config_path = f"{os.path.dirname(__file__)}/test_configs/test_legacy_config.json" +def test_legacy_config_migration(temp_configs): + config_path = str((Path(temp_configs) / "test_legacy_config.json").resolve()) migration_instance = MigrateLegacyConfig source = SourceAzureBlobStorage( SourceAzureBlobStorageStreamReader(), @@ -47,9 +60,11 @@ def test_legacy_config_migration(): assert test_migrated_config == expected_config -def test_credentials_config_migration(): - config_path = f"{os.path.dirname(__file__)}/test_configs/test_config_without_credentials.json" +def test_credentials_config_migration(temp_configs): + config_path = str((Path(temp_configs) / "test_config_without_credentials.json").resolve()) initial_config = load_config(config_path) + expected = initial_config["azure_blob_storage_account_key"] + migration_instance = MigrateCredentials source = SourceAzureBlobStorage( SourceAzureBlobStorageStreamReader(), @@ -61,4 +76,4 @@ def test_credentials_config_migration(): ) migration_instance.migrate(["check", "--config", config_path], source) test_migrated_config = load_config(config_path) - assert test_migrated_config["credentials"]["azure_blob_storage_account_key"] == initial_config["azure_blob_storage_account_key"] + assert test_migrated_config["credentials"]["azure_blob_storage_account_key"] == expected diff --git a/docs/integrations/sources/azure-blob-storage.md b/docs/integrations/sources/azure-blob-storage.md index 2d6baf9ba2d5..bb802349dcb3 100644 --- a/docs/integrations/sources/azure-blob-storage.md +++ b/docs/integrations/sources/azure-blob-storage.md @@ -2,7 +2,7 @@ -This page contains the setup guide and reference information for the [Azure Blob Storage](https://learn.microsoft.com/en-us/azure/?product=popular) source connector. +This page contains the setup guide and reference information for the [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs/) source connector. @@ -41,10 +41,10 @@ Minimum permissions (role [Storage Blob Data Reader](https://learn.microsoft.com ### Step 1: Set up Azure Blob Storage -- Create a storage account with the permissions [details](https://learn.microsoft.com/en-us/azure/storage/common/storage-account-create?tabs=azure-portal) +- Create a storage account and grant roles [details](https://learn.microsoft.com/en-us/azure/storage/common/storage-account-create?tabs=azure-portal) :::warning -To use Oauth 2.0 Authentication method, Access Control (IAM) should be setup. +To use Oauth2 or Client Credentials Authentication methods, Access Control (IAM) should be setup. It is recommended to use role [Storage Blob Data Reader](https://learn.microsoft.com/en-gb/azure/storage/blobs/assign-azure-role-data-access?tabs=portal) @@ -62,6 +62,20 @@ Follow these steps to set up an IAM role: ::: +
+ +Follow these steps to set up a Service Principal to use the Client Credentials authentication method. + + +In the Azure portal, navigate to your Service Principal's App Registration. + +Note the `Directory (tenant) ID` and `Application (client) ID` in the Overview panel. + +In the `Manage / Certificates & secrets` panel, click `Client Secrets` and create a new secret. Note the `Value` of the secret. + +
+ + ### Step 2: Set up the Azure Blob Storage connector in Airbyte @@ -93,10 +107,14 @@ Follow these steps to set up an IAM role: 2. Click Sources and then click + New source. 3. On the Set up the source page, select Azure Blob Storage from the Source type dropdown. 4. Enter a name for the Azure Blob Storage connector. -5. Enter the name of your Azure **Account**. -6. Enter your Tenant ID and Click **Authenticate your Azure Blob Storage account**. -7. Log in and authorize the Azure Blob Storage account. -8. Enter the name of the **Container** containing your files to replicate. +5. Enter the name of your Azure **Storage Account** and **container**. +6. Choose the Authentication method. + 1. If you are accessing through a Storage Account Key, choose `Authenticate via Storage Account Key` and enter the key. + 1. If you are accessing through a Service Principal, choose the `Authenticate via Client Credentials`. + 0. See [above](#step-1-set-up-azure-blob-storage) regarding setting IAM role bindings for the Service Principal and getting detail of the app registration + 1. Enter the `Directory (tenant) ID` value from app registration in Azure Portal into the `Tenant ID` field. + 2. Enter the `Application (client) ID` from Azure Portal into the `Tenant ID` field. Note this is **not** the secret ID + 3. Enter the Secret `Value` from Azure Portal into the `Client Secret` field. 9. Add a stream 1. Write the **File Type** 2. In the **Format** box, use the dropdown menu to select the format of the files you'd like to replicate. The supported formats are **CSV**, **Parquet**, **Avro** and **JSONL**. Toggling the **Optional fields** button within the **Format** box will allow you to enter additional configurations based on the selected format. For a detailed breakdown of these settings, refer to the [File Format section](#file-format-settings) below. @@ -283,6 +301,7 @@ The Azure Blob Storage connector should not encounter any [Microsoft API limitat | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------| +| 0.5.0 | 2025-01-02 | [50398](https://github.com/airbytehq/airbyte/pull/50398) | Add client_credentials auth for Azure Service Principals | | 0.4.4 | 2024-06-06 | [39275](https://github.com/airbytehq/airbyte/pull/39275) | [autopull] Upgrade base image to v1.2.2 | | 0.4.3 | 2024-05-29 | [38701](https://github.com/airbytehq/airbyte/pull/38701) | Avoid error on empty stream when running discover | | 0.4.2 | 2024-04-23 | [37504](https://github.com/airbytehq/airbyte/pull/37504) | Update specification |