diff --git a/cognite/extractorutils/configtools/__init__.py b/cognite/extractorutils/configtools/__init__.py index 6dbcbb0f..92c7928f 100644 --- a/cognite/extractorutils/configtools/__init__.py +++ b/cognite/extractorutils/configtools/__init__.py @@ -85,10 +85,15 @@ class MyConfig(BaseConfig): However, all of these things will be automatically done for you if you are using the base Extractor class. """ +from cognite.extractorutils.exceptions import InvalidConfigError + from .elements import ( + AuthenticatorConfig, BaseConfig, + CertificateConfig, CogniteConfig, ConfigType, + ConnectionConfig, EitherIdConfig, FileSizeConfig, LocalStateStoreConfig, @@ -99,4 +104,4 @@ class MyConfig(BaseConfig): StateStoreConfig, TimeIntervalConfig, ) -from .loaders import ConfigResolver, load_yaml +from .loaders import ConfigResolver, KeyVaultAuthenticationMethod, KeyVaultLoader, load_yaml, load_yaml_dict diff --git a/cognite/extractorutils/configtools/elements.py b/cognite/extractorutils/configtools/elements.py index 395c505e..7afb307c 100644 --- a/cognite/extractorutils/configtools/elements.py +++ b/cognite/extractorutils/configtools/elements.py @@ -53,6 +53,10 @@ @dataclass class CertificateConfig: + """ + Configuration parameters for certificates + """ + path: str password: Optional[str] authority_url: Optional[str] = None @@ -94,6 +98,11 @@ class ConnectionConfig: @dataclass class EitherIdConfig: + """ + Configuration parameter representing an ID in CDF, which can either be an external or internal ID. + An EitherId can only hold one ID type, not both. 
+ """ + id: Optional[int] external_id: Optional[str] @@ -103,6 +112,10 @@ def either_id(self) -> EitherId: class TimeIntervalConfig(yaml.YAMLObject): + """ + Configuration parameter for setting a time interval + """ + def __init__(self, expression: str) -> None: self._interval, self._expression = TimeIntervalConfig._parse_expression(expression) @@ -167,6 +180,10 @@ def __repr__(self) -> str: class FileSizeConfig(yaml.YAMLObject): + """ + Configuration parameter for setting a file size + """ + def __init__(self, expression: str) -> None: self._bytes, self._expression = FileSizeConfig._parse_expression(expression) @@ -569,23 +586,39 @@ class BaseConfig(_BaseConfig): @dataclass class RawDestinationConfig: + """ + Configuration parameters for using Raw + """ + database: str table: str @dataclass class RawStateStoreConfig(RawDestinationConfig): + """ + Configuration of a state store based on CDF RAW + """ + upload_interval: TimeIntervalConfig = TimeIntervalConfig("30s") @dataclass class LocalStateStoreConfig: + """ + Configuration of a state store using a local JSON file + """ + path: str save_interval: TimeIntervalConfig = TimeIntervalConfig("30s") @dataclass class StateStoreConfig: + """ + Configuration of the State Store, containing ``LocalStateStoreConfig`` or ``RawStateStoreConfig`` + """ + raw: Optional[RawStateStoreConfig] = None local: Optional[LocalStateStoreConfig] = None diff --git a/cognite/extractorutils/configtools/loaders.py b/cognite/extractorutils/configtools/loaders.py index ff23944b..79f7388c 100644 --- a/cognite/extractorutils/configtools/loaders.py +++ b/cognite/extractorutils/configtools/loaders.py @@ -47,6 +47,10 @@ class KeyVaultAuthenticationMethod(Enum): class KeyVaultLoader: + """ + Class responsible for configuring keyvault for clients using Azure + """ + def __init__(self, config: Optional[dict]): self.config = config diff --git a/cognite/extractorutils/exceptions.py b/cognite/extractorutils/exceptions.py index 39cfb893..58e1b055 100644 --- 
a/cognite/extractorutils/exceptions.py +++ b/cognite/extractorutils/exceptions.py @@ -15,7 +15,7 @@ class InvalidConfigError(Exception): """ - Exception thrown from ``load_yaml`` if config file is invalid. This can be due to + Exception thrown from ``load_yaml`` and ``load_yaml_dict`` if config file is invalid. This can be due to * Missing fields * Incompatible types diff --git a/cognite/extractorutils/util.py b/cognite/extractorutils/util.py index fa9139f9..92b444c2 100644 --- a/cognite/extractorutils/util.py +++ b/cognite/extractorutils/util.py @@ -79,7 +79,7 @@ class EitherId: Args: id: Internal ID - externalId or external_id: external ID + external_id: external ID. It can be `external_id` or `externalId` Raises: TypeError: If none of both of id types are set. @@ -175,24 +175,25 @@ def add_extraction_pipeline( added_message: str = "", ) -> Callable[[Callable[..., _T1]], Callable[..., _T1]]: """ - This is to be used as a decorator for extractor functions to add extraction pipeline information + This is to be used as a decorator for extractor functions to add extraction pipeline information. Args: - extraction_pipeline_ext_id: - cognite_client: - heartbeat_waiting_time: - added_message: + extraction_pipeline_ext_id: External ID of the extraction pipeline + cognite_client: Client to use when communicating with CDF + heartbeat_waiting_time: Target interval between heartbeats, in seconds Usage: If you have a function named "extract_data(*args, **kwargs)" and want to connect it to an extraction pipeline, you can use this decorator function as: - @add_extraction_pipeline( - extraction_pipeline_ext_id=, - cognite_client=, - logger=, - ) - def extract_data(*args, **kwargs): - + + .. code-block:: python + + @add_extraction_pipeline( + extraction_pipeline_ext_id=, + cognite_client=, + ) + def extract_data(*args, **kwargs): + """ # TODO 1. 
Consider refactoring this decorator to share methods with the Extractor context manager in .base.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 01d06b85..64420c64 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -65,7 +65,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/docs/source/docs.rst b/docs/source/docs.rst index 7692d015..4d8e9ee9 100644 --- a/docs/source/docs.rst +++ b/docs/source/docs.rst @@ -19,12 +19,15 @@ Base class for extractors The ``configtools`` module exists of tools for loading and verifying config files for extractors. -Extractor configurasions are conventionally written in *hyphen-cased YAML*. These are typically loaded and serialized as *dataclasses* in Python. +Extractor configurations are conventionally written in *hyphen-cased YAML*. These are typically loaded and serialized as *dataclasses* in Python. Config loader ^^^^^^^^^^^^^ .. autofunction:: cognite.extractorutils.configtools.load_yaml +.. autofunction:: cognite.extractorutils.configtools.load_yaml_dict +.. autoclass:: cognite.extractorutils.configtools.KeyVaultAuthenticationMethod +.. autoclass:: cognite.extractorutils.configtools.KeyVaultLoader Base classes @@ -55,6 +58,14 @@ The ``configtools`` module contains several prebuilt config classes for many com :undoc-members: .. autoclass:: cognite.extractorutils.configtools.CogniteConfig :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.EitherIdConfig + :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.ConnectionConfig + :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.AuthenticatorConfig + :undoc-members: +.. 
autoclass:: cognite.extractorutils.configtools.CertificateConfig + :undoc-members: .. autoclass:: cognite.extractorutils.configtools.LoggingConfig :undoc-members: .. autoclass:: cognite.extractorutils.configtools.MetricsConfig @@ -67,6 +78,12 @@ The ``configtools`` module contains several prebuilt config classes for many com :undoc-members: .. autoclass:: cognite.extractorutils.configtools.LocalStateStoreConfig :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.TimeIntervalConfig + :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.FileSizeConfig + :undoc-members: +.. autoclass:: cognite.extractorutils.configtools.ConfigType + :undoc-members: Exceptions @@ -75,17 +92,6 @@ Exceptions .. autoexception:: cognite.extractorutils.configtools.InvalidConfigError -``throttle`` - Tools for throttling ------------------------------------ - -.. automodule:: cognite.extractorutils.throttle - :members: - :undoc-members: - :inherited-members: - :show-inheritance: - - - ``metrics`` - Automatic pushers of performance metrics ------------------------------------------------------ @@ -115,6 +121,32 @@ Exceptions :inherited-members: :show-inheritance: +.. automodule:: cognite.extractorutils.uploader._base + :members: + :undoc-members: + :inherited-members: + :show-inheritance: + +.. automodule:: cognite.extractorutils.uploader.assets + :members: + :show-inheritance: + +.. automodule:: cognite.extractorutils.uploader.events + :members: + :show-inheritance: + +.. automodule:: cognite.extractorutils.uploader.files + :members: + :show-inheritance: + +.. automodule:: cognite.extractorutils.uploader.raw + :members: + :show-inheritance: + +.. 
automodule:: cognite.extractorutils.uploader.time_series + :members: + :show-inheritance: + ``util`` - Miscellaneous utilities ---------------------------------- diff --git a/docs/source/package.rst b/docs/source/package.rst index c9a52acc..866cdfd7 100644 --- a/docs/source/package.rst +++ b/docs/source/package.rst @@ -7,11 +7,11 @@ The final step is to ship our extractor. Some times we can depend on a Python in our extractor in, in other cases out extractor needs to be fully self-contained. How we ship our extractor will differ slightly between those scenarios. -When developing the extractor, running it is fairly easy: +When developing the extractor, running it is fairly easy: [#]_ .. code-block:: bash - poetry run + poetry run Sometimes we could just send our project to the production environment like this, for example by cloning the git repo (which would also make updating to future versions very easy). However, when shipping the extractor to a production @@ -68,6 +68,8 @@ executable you have to run ``cogex build`` from Windows, likewise for Linux or M The resulting executable will contain your extractor, all dependencies [#]_ and the Python runtime. +.. [#] Replace ```` with the name of your specific extractor, and ```` with the (absolute or + relative) path to a config file. .. [#] Note that this could be in violation of some licenses, particularly GPL or it's derivatives. Make sure that the licenses of your dependencies allows this type of linking. diff --git a/docs/source/read_csv.rst b/docs/source/read_csv.rst index a71b49b1..ad06b5b3 100644 --- a/docs/source/read_csv.rst +++ b/docs/source/read_csv.rst @@ -40,7 +40,7 @@ If we wanted to make our extractor even more generic, we could have made the ``d We now have a ``reader`` object. This is an iterator that will return each row in the CSV file as a dictionary where column names are keys and the row are values. 
We can now iterate over this reader and add all the rows to the upload -queue using the :meth:`add_to_upload_queue ` method. +queue using the :meth:`add_to_upload_queue ` method. .. code-block:: python diff --git a/docs/source/uploader.rst b/docs/source/uploader.rst index 977f5793..49d75579 100644 --- a/docs/source/uploader.rst +++ b/docs/source/uploader.rst @@ -13,8 +13,12 @@ Using an upload queue --------------------- We begin by looking at the upload queue. Since this extractor will write to CDF RAW, we will use the -:meth:`RawUploadQueue `. Similar queues exists for time series data -points, events, sequence rows and files. +:meth:`RawUploadQueue `. Similar queues exist for +:meth:`time series data points `, +:meth:`events `, +:meth:`assets `, +:meth:`sequence rows ` +and :meth:`files `. The reason for using an upload queue is to batch together data into larger requests to CDF. This will increase performance, some times quite dramatically since network latencies can often be a bottleneck for extractors. We can add @@ -27,7 +31,7 @@ manager (with a ``with`` statement). The advantage is that we can then set one o happen (such as every *x* seconds or when the queue has *y* elements in it), and don't think about it again. If the queue is not empty when we exit the context, a final upload will be made to make sure no data is left behind. -To create a :meth:`RawUploadQueue `, we write therefore start with +To create a :meth:`RawUploadQueue `, we therefore start with .. code-block:: python @@ -47,7 +51,7 @@ function. This function is called from the ``__main__.py`` file, and is provided * ``cognite`` is an initiated ``CogniteClient`` that is set up to to use the CDF project that the user configured in their config file. 
-* ``states`` is a state store object, we will not cover these in this tutorial, but in short it allows us to keep track +* ``states`` is a :meth:`State Store ` object, we will not cover these in this tutorial, but in short it allows us to keep track of extraction state between runs to avoid duplicate work * ``config`` is the config file the user have provided, which has been loaded and stored as an instance of the Config class we made in the :ref:`Read CSV files` chapter. @@ -72,6 +76,9 @@ this: extract_file(file, queue) +This will call the ``start()`` and ``stop()`` methods from :meth:`AbstractUploadQueue ` +class automatically once all files are processed or the limit of the queue, defined by the ``max_queue_size`` keyword argument, is reached. + Extraction pipeline runs ------------------------ @@ -104,8 +111,16 @@ section of the config file, containing either an ``external-id`` or (internal) ` .. code-block:: yaml cognite: - project: publicdata - api-key: ${COGNITE_API_KEY} - - extraction-pipeline: - external-id: abc123 + # Read these from environment variables + host: ${COGNITE_BASE_URL} + project: ${COGNITE_PROJECT} + + idp-authentication: + token-url: ${COGNITE_TOKEN_URL} + client-id: ${COGNITE_CLIENT_ID} + secret: ${COGNITE_CLIENT_SECRET} + scopes: + - ${COGNITE_BASE_URL}/.default + + extraction-pipeline: + external-id: abc123