From a7d2997e6b5c97f97d6f03e4f4d9590286d5398a Mon Sep 17 00:00:00 2001
From: Mikita Sakalouski <38785549+mikita-sakalouski@users.noreply.github.com>
Date: Fri, 29 Nov 2024 09:19:22 +0100
Subject: [PATCH] [FEATURE] DataBricksSecret for getting secrets from
 DataBricks scope (#133)

## Description
The `DataBricksSecret` class can be used to retrieve secrets from Databricks secret scopes and wrap them in a `Context` for easy access.

## Related Issue
#66

## Motivation and Context
Adds support for Databricks secret scopes.

## How Has This Been Tested?
Added tests that exercise the new class against a mocked Databricks secrets client.

## Screenshots (if appropriate):

## Types of changes
- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)

## Checklist:
- [x] My code follows the code style of this project.
- [ ] My change requires a change to the documentation.
- [ ] I have updated the documentation accordingly.
- [x] I have read the **CONTRIBUTING** document.
- [x] I have added tests to cover my changes.
- [ ] All new and existing tests passed.

---------

Co-authored-by: Danny Meijer <10511979+dannymeijer@users.noreply.github.com>
Co-authored-by: Danny Meijer
---
 docs/tutorials/getting-started.md             | 17 ----
 src/koheesio/__about__.py                     |  2 +-
 .../integrations/snowflake/__init__.py        |  2 +-
 .../integrations/spark/databricks/__init__.py |  0
 .../integrations/spark/databricks/secrets.py  | 79 +++++++++++++++++++
 .../integrations/spark/databricks/utils.py    | 16 ++++
 src/koheesio/integrations/spark/snowflake.py  |  8 +-
 tests/spark/conftest.py                       | 18 +++++
 .../integrations/databricks/test_secrets.py   | 48 +++++++++++
 tests/spark/test_spark.py                     |  2 +-
 10 files changed, 168 insertions(+), 24 deletions(-)
 create mode 100644 src/koheesio/integrations/spark/databricks/__init__.py
 create mode 100644 src/koheesio/integrations/spark/databricks/secrets.py
 create mode 100644 src/koheesio/integrations/spark/databricks/utils.py
 create mode 100644 tests/spark/integrations/databricks/test_secrets.py

diff --git a/docs/tutorials/getting-started.md b/docs/tutorials/getting-started.md
index 8fd80ff1..6fe88897 100644
--- a/docs/tutorials/getting-started.md
+++ b/docs/tutorials/getting-started.md
@@ -19,23 +19,6 @@
 
     ```
 
-<details>
-    <summary>poetry</summary>
-
-    If you're using Poetry, add the following entry to the `pyproject.toml` file:
-
-    ```toml title="pyproject.toml"
-    [[tool.poetry.source]]
-    name = "nike"
-    url = "https://artifactory.nike.com/artifactory/api/pypi/python-virtual/simple"
-    secondary = true
-    ```
-
-    ```bash
-    poetry add koheesio
-    ```
-</details>
-
 <details>
     <summary>pip</summary>
 
diff --git a/src/koheesio/__about__.py b/src/koheesio/__about__.py
index 7a8c687d..20f1a1b9 100644
--- a/src/koheesio/__about__.py
+++ b/src/koheesio/__about__.py
@@ -12,7 +12,7 @@
 LICENSE_INFO = "Licensed as Apache 2.0"
 SOURCE = "https://github.com/Nike-Inc/koheesio"
 
-__version__ = "0.9.0"
+__version__ = "0.9.0rc7"
 __logo__ = (
     75,
     (
diff --git a/src/koheesio/integrations/snowflake/__init__.py b/src/koheesio/integrations/snowflake/__init__.py
index 9e1603d9..dcabbcb5 100644
--- a/src/koheesio/integrations/snowflake/__init__.py
+++ b/src/koheesio/integrations/snowflake/__init__.py
@@ -449,7 +449,7 @@ class GrantPrivilegesOnObject(SnowflakeRunQueryPython):
         object="MY_TABLE",
         type="TABLE",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password=Secret("super-secret-password"),
         role="APPLICATION.SNOWFLAKE.ADMIN",
         permissions=["SELECT", "INSERT"],
diff --git a/src/koheesio/integrations/spark/databricks/__init__.py b/src/koheesio/integrations/spark/databricks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/koheesio/integrations/spark/databricks/secrets.py b/src/koheesio/integrations/spark/databricks/secrets.py
new file mode 100644
index 00000000..1f04d54d
--- /dev/null
+++ b/src/koheesio/integrations/spark/databricks/secrets.py
@@ -0,0 +1,79 @@
+"""Module for retrieving secrets from Databricks secret scopes.
+
+Secrets are stored as SecretContext and can be accessed accordingly.
+
+See DataBricksSecret for more information.
+"""
+
+from typing import Dict, Optional
+import re
+
+from pyspark.sql import SparkSession
+
+from koheesio.integrations.spark.databricks.utils import get_dbutils
+from koheesio.models import Field, model_validator
+from koheesio.secrets import Secret
+
+
+class DataBricksSecret(Secret):
+    """
+    Retrieve secrets from a Databricks secret scope and wrap them in a Context class for easy access.
+    All secrets are stored under the "secrets" root and a "parent" key. The parent is either derived
+    from the secret scope by replacing "/" and "-" with "_", or provided explicitly by the user.
+    Secret values are wrapped in pydantic.SecretStr.
+
+    Examples
+    --------
+
+    ```python
+    context = {"secrets": {"parent": {"webhook": SecretStr("**********"), "description": SecretStr("**********")}}}
+    ```
+
+    Values can be decoded like this:
+    ```python
+    context.secrets.parent.webhook.get_secret_value()
+    ```
+    or, if working with a dictionary is preferable:
+    ```python
+    for key, value in context.get_all().items():
+        value.get_secret_value()
+    ```
+    """
+
+    scope: str = Field(description="Scope")
+    alias: Optional[Dict[str, str]] = Field(default_factory=dict, description="Alias for secret keys")
+
+    @model_validator(mode="before")
+    def _set_parent_to_scope(cls, values):
+        """
+        Set a default value for the `parent` parameter on model initialization when it was not
+        explicitly set by the user. In that case, the scope is used to derive it:
+
+        'secret-scope' -> secret_scope
+        """
+        regex = re.compile(r"[/-]")
+        path = values.get("scope")
+
+        if not values.get("parent"):
+            values["parent"] = regex.sub("_", path)
+
+        return values
+
+    @property
+    def _client(self):
+        """
+        Instantiated Databricks client.
+ """ + + return get_dbutils(SparkSession.getActiveSession()) # type: ignore + + def _get_secrets(self): + """Dictionary of secrets.""" + all_keys = (secret_meta.key for secret_meta in self._client.secrets.list(scope=self.scope)) + secret_data = {} + + for key in all_keys: + key_name = key if not (self.alias and self.alias.get(key)) else self.alias[key] # pylint: disable=E1101 + secret_data[key_name] = self._client.secrets.get(scope=self.scope, key=key) + + return secret_data diff --git a/src/koheesio/integrations/spark/databricks/utils.py b/src/koheesio/integrations/spark/databricks/utils.py new file mode 100644 index 00000000..9a2a8849 --- /dev/null +++ b/src/koheesio/integrations/spark/databricks/utils.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from pyspark.sql import SparkSession + +from koheesio.spark.utils import on_databricks + + +def get_dbutils(spark_session: SparkSession) -> DBUtils: # type: ignore # noqa: F821 + if not on_databricks(): + raise RuntimeError("dbutils not available") + + from pyspark.dbutils import DBUtils # pylint: disable=E0611,E0401 # type: ignore + + dbutils = DBUtils(spark_session) + + return dbutils diff --git a/src/koheesio/integrations/spark/snowflake.py b/src/koheesio/integrations/spark/snowflake.py index a6e6941f..ba292afe 100644 --- a/src/koheesio/integrations/spark/snowflake.py +++ b/src/koheesio/integrations/spark/snowflake.py @@ -302,7 +302,7 @@ class Query(SnowflakeReader): database="MY_DB", schema_="MY_SCHEMA", warehouse="MY_WH", - user="gid.account@nike.com", + user="gid.account@abc.com", password=Secret("super-secret-password"), role="APPLICATION.SNOWFLAKE.ADMIN", query="SELECT * FROM MY_TABLE", @@ -412,7 +412,7 @@ class CreateOrReplaceTableFromDataFrame(SnowflakeTransformation): database="MY_DB", schema="MY_SCHEMA", warehouse="MY_WH", - user="gid.account@nike.com", + user="gid.account@abc.com", password="super-secret-password", role="APPLICATION.SNOWFLAKE.ADMIN", table="MY_TABLE", @@ -477,7 +477,7 @@ class GetTableSchema(SnowflakeStep): database="MY_DB", schema_="MY_SCHEMA", warehouse="MY_WH", - user="gid.account@nike.com", + user="gid.account@abc.com", password="super-secret-password", role="APPLICATION.SNOWFLAKE.ADMIN", table="MY_TABLE", @@ -512,7 +512,7 @@ class AddColumn(SnowflakeStep): database="MY_DB", schema_="MY_SCHEMA", warehouse="MY_WH", - user="gid.account@nike.com", + user="gid.account@abc.com", password=Secret("super-secret-password"), role="APPLICATION.SNOWFLAKE.ADMIN", table="MY_TABLE", diff --git a/tests/spark/conftest.py b/tests/spark/conftest.py index f918ae40..9809b9be 100644 --- a/tests/spark/conftest.py +++ b/tests/spark/conftest.py @@ -1,3 +1,4 @@ +from typing import Any from collections import namedtuple import datetime from decimal import Decimal @@ -347,3 +348,20 @@ def df_with_all_types(spark): data=[[v[0] for v in data.values()]], schema=StructType([StructField(name=v[1], dataType=v[2]) for v in data.values()]), ) + + +class ScopeSecrets: + class SecretMeta: + def __init__(self, key: str): + self.key = key + + def __init__(self, secrets: dict): + self.secrets = secrets + + def get(self, scope: str, key: str) -> Any: + return self.secrets.get(key) + + def list(self, scope: str): + keys = [ScopeSecrets.SecretMeta(key=key) for key in self.secrets.keys()] + + return keys diff --git a/tests/spark/integrations/databrikcs/test_secrets.py b/tests/spark/integrations/databrikcs/test_secrets.py new file mode 100644 index 00000000..11af1170 --- /dev/null +++ b/tests/spark/integrations/databrikcs/test_secrets.py @@ 
+from unittest.mock import patch
+
+from conftest import ScopeSecrets
+
+from koheesio.integrations.spark.databricks.secrets import DataBricksSecret
+
+
+class TestDatabricksSecret:
+    def test_set_parent_to_scope(self):
+        # Test when parent is not provided
+        secret = DataBricksSecret(scope="secret-scope")
+        assert secret.parent == "secret_scope"
+
+        # Test when parent is provided
+        secret = DataBricksSecret(scope="secret-scope", parent="custom_parent")
+        assert secret.parent == "custom_parent"
+
+    @patch("koheesio.integrations.spark.databricks.secrets.DataBricksSecret._client")
+    def test_get_secrets_no_alias(self, mock_databricks_client):
+        with patch("koheesio.integrations.spark.databricks.utils.on_databricks", return_value=True):
+            dd = {
+                "key1": "value_of_key1",
+                "key2": "value_of_key2",
+            }
+            databricks = DataBricksSecret(scope="dummy", parent="kafka")
+            mock_databricks_client.secrets = ScopeSecrets(dd)
+            secrets = databricks._get_secrets()
+
+            assert secrets["key1"] == "value_of_key1"
+            assert secrets["key2"] == "value_of_key2"
+
+    @patch("koheesio.integrations.spark.databricks.secrets.DataBricksSecret._client")
+    def test_get_secrets_alias(self, mock_databricks_client):
+        with patch("koheesio.integrations.spark.databricks.utils.on_databricks", return_value=True):
+            dd = {
+                "key1": "value_of_key1",
+                "key2": "value_of_key2",
+            }
+            alias = {
+                "key1": "new_name_key1",
+                "key2": "new_name_key2",
+            }
+            databricks = DataBricksSecret(scope="dummy", parent="kafka", alias=alias)
+            mock_databricks_client.secrets = ScopeSecrets(dd)
+            secrets = databricks._get_secrets()
+
+            assert secrets["new_name_key1"] == "value_of_key1"
+            assert secrets["new_name_key2"] == "value_of_key2"
diff --git a/tests/spark/test_spark.py b/tests/spark/test_spark.py
index e19b3e02..003060b5 100644
--- a/tests/spark/test_spark.py
+++ b/tests/spark/test_spark.py
@@ -26,7 +26,7 @@ def test_import_error_no_error(self):
         with mock.patch.dict("sys.modules", {"pyspark": None}):
             from koheesio.sso.okta import OktaAccessToken
 
-            OktaAccessToken(url="https://nike.okta.com", client_id="client_id", client_secret=secret)
+            OktaAccessToken(url="https://abc.okta.com", client_id="client_id", client_secret=secret)
 
     def test_import_error_with_error(self):
         with mock.patch.dict("sys.modules", {"pyspark.sql": None, "koheesio.steps.spark": None}):
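
Below is a minimal usage sketch of the new `DataBricksSecret` class, assuming a Databricks runtime where an active `SparkSession` and `dbutils` are available (the `_client` property calls `get_dbutils` under the hood). The scope name `my-scope` and the alias mapping are illustrative values, not part of the patch:

```python
from koheesio.integrations.spark.databricks.secrets import DataBricksSecret

# `parent` is omitted, so the model validator derives it from the scope
# name by replacing "/" and "-" with "_": "my-scope" -> "my_scope"
secret = DataBricksSecret(scope="my-scope", alias={"webhook-url": "webhook"})
assert secret.parent == "my_scope"

# On a Databricks cluster, _get_secrets() lists every key in the scope via
# dbutils and returns a dict of values; keys that appear in `alias` are
# renamed, so "webhook-url" would surface as "webhook" here.
```

Note that retrieval itself only works on Databricks: `get_dbutils` raises a `RuntimeError` when `on_databricks()` is false, which is why the tests above patch it alongside the `_client` property.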