diff --git a/docs/tutorials/getting-started.md b/docs/tutorials/getting-started.md
index 8fd80ff..6fe8889 100644
--- a/docs/tutorials/getting-started.md
+++ b/docs/tutorials/getting-started.md
@@ -19,23 +19,6 @@
     ```
 
-<details>
-<summary>poetry</summary>
-
-If you're using Poetry, add the following entry to the `pyproject.toml` file:
-
-```toml title="pyproject.toml"
-[[tool.poetry.source]]
-name = "nike"
-url = "https://artifactory.nike.com/artifactory/api/pypi/python-virtual/simple"
-secondary = true
-```
-
-```bash
-poetry add koheesio
-```
-</details>
-
 
 <details>
 <summary>pip</summary>
 
diff --git a/src/koheesio/__about__.py b/src/koheesio/__about__.py
index 7a8c687..20f1a1b 100644
--- a/src/koheesio/__about__.py
+++ b/src/koheesio/__about__.py
@@ -12,7 +12,7 @@
 LICENSE_INFO = "Licensed as Apache 2.0"
 SOURCE = "https://github.com/Nike-Inc/koheesio"
 
-__version__ = "0.9.0"
+__version__ = "0.9.0rc7"
 __logo__ = (
     75,
     (
diff --git a/src/koheesio/integrations/snowflake/__init__.py b/src/koheesio/integrations/snowflake/__init__.py
index 9e1603d..dcabbcb 100644
--- a/src/koheesio/integrations/snowflake/__init__.py
+++ b/src/koheesio/integrations/snowflake/__init__.py
@@ -449,7 +449,7 @@ class GrantPrivilegesOnObject(SnowflakeRunQueryPython):
         object="MY_TABLE",
         type="TABLE",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password=Secret("super-secret-password"),
         role="APPLICATION.SNOWFLAKE.ADMIN",
         permissions=["SELECT", "INSERT"],
diff --git a/src/koheesio/integrations/spark/databricks/__init__.py b/src/koheesio/integrations/spark/databricks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/koheesio/integrations/spark/databricks/secrets.py b/src/koheesio/integrations/spark/databricks/secrets.py
new file mode 100644
index 0000000..1f04d54
--- /dev/null
+++ b/src/koheesio/integrations/spark/databricks/secrets.py
@@ -0,0 +1,79 @@
+"""Module for retrieving secrets from Databricks secret scopes.
+
+Secrets are stored as SecretContext and can be accessed accordingly.
+
+See DataBricksSecret for more information.
+"""
+
+from typing import Dict, Optional
+import re
+
+from pyspark.sql import SparkSession
+
+from koheesio.integrations.spark.databricks.utils import get_dbutils
+from koheesio.models import Field, model_validator
+from koheesio.secrets import Secret
+
+
+class DataBricksSecret(Secret):
+    """
+    Retrieve secrets from a Databricks secret scope and wrap them into the Context class for easy access.
+    All secrets are stored under the "secrets" root and "parent". "parent" is either derived from the
+    secret scope by replacing "/" and "-" with "_", or provided explicitly by the user.
+    Secret values are wrapped in pydantic.SecretStr.
+
+    Examples
+    --------
+
+    ```python
+    context = {"secrets": {"parent": {"webhook": SecretStr("**********"), "description": SecretStr("**********")}}}
+    ```
+
+    Values can be decoded like this:
+    ```python
+    context.secrets.parent.webhook.get_secret_value()
+    ```
+    or, if working with a dictionary is preferable:
+    ```python
+    for key, value in context.get_all().items():
+        value.get_secret_value()
+    ```
+    """
+
+    scope: str = Field(description="Scope")
+    alias: Optional[Dict[str, str]] = Field(default_factory=dict, description="Alias for secret keys")
+
+    @model_validator(mode="before")
+    def _set_parent_to_scope(cls, values):
+        """
+        Set the default value for the `parent` parameter on model initialization when it was not
+        explicitly set by the user. In that case the scope is used, with "/" and "-" replaced by "_":
+
+        'secret-scope' -> 'secret_scope'
+        """
+        regex = re.compile(r"[/-]")
+        path = values.get("scope")
+
+        if not values.get("parent"):
+            values["parent"] = regex.sub("_", path)
+
+        return values
+
+    @property
+    def _client(self):
+        """
+        Instantiated Databricks client.
+        """
+
+        return get_dbutils(SparkSession.getActiveSession())  # type: ignore
+
+    def _get_secrets(self):
+        """Dictionary of secrets."""
+        all_keys = (secret_meta.key for secret_meta in self._client.secrets.list(scope=self.scope))
+        secret_data = {}
+
+        for key in all_keys:
+            key_name = key if not (self.alias and self.alias.get(key)) else self.alias[key]  # pylint: disable=E1101
+            secret_data[key_name] = self._client.secrets.get(scope=self.scope, key=key)
+
+        return secret_data
diff --git a/src/koheesio/integrations/spark/databricks/utils.py b/src/koheesio/integrations/spark/databricks/utils.py
new file mode 100644
index 0000000..9a2a884
--- /dev/null
+++ b/src/koheesio/integrations/spark/databricks/utils.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from pyspark.sql import SparkSession
+
+from koheesio.spark.utils import on_databricks
+
+
+def get_dbutils(spark_session: SparkSession) -> DBUtils:  # type: ignore  # noqa: F821
+    if not on_databricks():
+        raise RuntimeError("dbutils not available")
+
+    from pyspark.dbutils import DBUtils  # pylint: disable=E0611,E0401  # type: ignore
+
+    dbutils = DBUtils(spark_session)
+
+    return dbutils
diff --git a/src/koheesio/integrations/spark/snowflake.py b/src/koheesio/integrations/spark/snowflake.py
index a6e6941..ba292af 100644
--- a/src/koheesio/integrations/spark/snowflake.py
+++ b/src/koheesio/integrations/spark/snowflake.py
@@ -302,7 +302,7 @@ class Query(SnowflakeReader):
         database="MY_DB",
         schema_="MY_SCHEMA",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password=Secret("super-secret-password"),
         role="APPLICATION.SNOWFLAKE.ADMIN",
         query="SELECT * FROM MY_TABLE",
@@ -412,7 +412,7 @@ class CreateOrReplaceTableFromDataFrame(SnowflakeTransformation):
         database="MY_DB",
         schema="MY_SCHEMA",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password="super-secret-password",
         role="APPLICATION.SNOWFLAKE.ADMIN",
         table="MY_TABLE",
@@ -477,7 +477,7 @@ class GetTableSchema(SnowflakeStep):
         database="MY_DB",
         schema_="MY_SCHEMA",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password="super-secret-password",
         role="APPLICATION.SNOWFLAKE.ADMIN",
         table="MY_TABLE",
@@ -512,7 +512,7 @@ class AddColumn(SnowflakeStep):
         database="MY_DB",
         schema_="MY_SCHEMA",
         warehouse="MY_WH",
-        user="gid.account@nike.com",
+        user="gid.account@abc.com",
         password=Secret("super-secret-password"),
         role="APPLICATION.SNOWFLAKE.ADMIN",
         table="MY_TABLE",
diff --git a/tests/spark/conftest.py b/tests/spark/conftest.py
index f918ae4..9809b9b 100644
--- a/tests/spark/conftest.py
+++ b/tests/spark/conftest.py
@@ -1,3 +1,4 @@
+from typing import Any
 from collections import namedtuple
 import datetime
 from decimal import Decimal
@@ -347,3 +348,20 @@ def df_with_all_types(spark):
         data=[[v[0] for v in data.values()]],
         schema=StructType([StructField(name=v[1], dataType=v[2]) for v in data.values()]),
     )
+
+
+class ScopeSecrets:
+    class SecretMeta:
+        def __init__(self, key: str):
+            self.key = key
+
+    def __init__(self, secrets: dict):
+        self.secrets = secrets
+
+    def get(self, scope: str, key: str) -> Any:
+        return self.secrets.get(key)
+
+    def list(self, scope: str):
+        keys = [ScopeSecrets.SecretMeta(key=key) for key in self.secrets.keys()]
+
+        return keys
diff --git a/tests/spark/integrations/databricks/test_secrets.py b/tests/spark/integrations/databricks/test_secrets.py
new file mode 100644
index 0000000..11af117
--- /dev/null
+++ b/tests/spark/integrations/databricks/test_secrets.py
@@ -0,0 +1,48 @@
+from unittest.mock import patch
+
+from conftest import ScopeSecrets
+
+from koheesio.integrations.spark.databricks.secrets import DataBricksSecret
+
+
+class TestDatabricksSecret:
+    def test_set_parent_to_scope(self):
+        # Test when parent is not provided
+        secret = DataBricksSecret(scope="secret-scope")
+        assert secret.parent == "secret_scope"
+
+        # Test when parent is provided
+        secret = DataBricksSecret(scope="secret-scope", parent="custom_parent")
+        assert secret.parent == "custom_parent"
+
+    @patch("koheesio.integrations.spark.databricks.secrets.DataBricksSecret._client")
+    def test_get_secrets_no_alias(self, mock_databricks_client):
+        with patch("koheesio.integrations.spark.databricks.utils.on_databricks", return_value=True):
+            dd = {
+                "key1": "value_of_key1",
+                "key2": "value_of_key2",
+            }
+            databricks = DataBricksSecret(scope="dummy", parent="kafka")
+            mock_databricks_client.secrets = ScopeSecrets(dd)
+            secrets = databricks._get_secrets()
+
+            assert secrets["key1"] == "value_of_key1"
+            assert secrets["key2"] == "value_of_key2"
+
+    @patch("koheesio.integrations.spark.databricks.secrets.DataBricksSecret._client")
+    def test_get_secrets_alias(self, mock_databricks_client):
+        with patch("koheesio.integrations.spark.databricks.utils.on_databricks", return_value=True):
+            dd = {
+                "key1": "value_of_key1",
+                "key2": "value_of_key2",
+            }
+            alias = {
+                "key1": "new_name_key1",
+                "key2": "new_name_key2",
+            }
+            databricks = DataBricksSecret(scope="dummy", parent="kafka", alias=alias)
+            mock_databricks_client.secrets = ScopeSecrets(dd)
+            secrets = databricks._get_secrets()
+
+            assert secrets["new_name_key1"] == "value_of_key1"
+            assert secrets["new_name_key2"] == "value_of_key2"
diff --git a/tests/spark/test_spark.py b/tests/spark/test_spark.py
index e19b3e0..003060b 100644
--- a/tests/spark/test_spark.py
+++ b/tests/spark/test_spark.py
@@ -26,7 +26,7 @@ def test_import_error_no_error(self):
         with mock.patch.dict("sys.modules", {"pyspark": None}):
             from koheesio.sso.okta import OktaAccessToken
 
-            OktaAccessToken(url="https://nike.okta.com", client_id="client_id", client_secret=secret)
+            OktaAccessToken(url="https://abc.okta.com", client_id="client_id", client_secret=secret)
 
     def test_import_error_with_error(self):
         with mock.patch.dict("sys.modules", {"pyspark.sql": None, "koheesio.steps.spark": None}):
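
A minimal usage sketch of the new `DataBricksSecret` step, pieced together from the class docstring and the tests above. The scope name `dev-kafka`, the key `webhook`, and the `.execute().context` accessor are illustrative assumptions rather than part of this patch; the sketch also assumes it runs on a Databricks cluster, since `get_dbutils` raises `RuntimeError` anywhere else.

```python
from koheesio.integrations.spark.databricks.secrets import DataBricksSecret

# Without an explicit parent, scope "dev-kafka" would derive parent "dev_kafka"
# ("/" and "-" are replaced by "_" in _set_parent_to_scope).
secret = DataBricksSecret(
    scope="dev-kafka",               # hypothetical Databricks secret scope
    parent="kafka",                  # optional override of the derived "dev_kafka"
    alias={"webhook": "kafka_url"},  # optional rename: key "webhook" surfaces as "kafka_url"
)

# Assumed accessor, mirroring how other koheesio Secret steps expose their
# output; values come back as pydantic SecretStr and must be unwrapped explicitly.
context = secret.execute().context
print(context.secrets.kafka.kafka_url.get_secret_value())
```

Note that the lazy `from pyspark.dbutils import DBUtils` inside `get_dbutils` keeps the module importable off-cluster, which is what lets the tests above patch `_client` and run against the `ScopeSecrets` fake without a Databricks runtime.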