Skip to content

Commit

Permalink
Docs 02/24
Browse files Browse the repository at this point in the history
[DOG-713]
  • Loading branch information
cel055 committed Feb 19, 2024
1 parent 9bd35c5 commit 30d7143
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 40 deletions.
7 changes: 6 additions & 1 deletion cognite/extractorutils/configtools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,15 @@ class MyConfig(BaseConfig):
However, all of these things will be automatically done for you if you are using the base Extractor class.
"""

from cognite.extractorutils.exceptions import InvalidConfigError

from .elements import (
AuthenticatorConfig,
BaseConfig,
CertificateConfig,
CogniteConfig,
ConfigType,
ConnectionConfig,
EitherIdConfig,
FileSizeConfig,
LocalStateStoreConfig,
Expand All @@ -99,4 +104,4 @@ class MyConfig(BaseConfig):
StateStoreConfig,
TimeIntervalConfig,
)
from .loaders import ConfigResolver, load_yaml
from .loaders import ConfigResolver, KeyVaultAuthenticationMethod, KeyVaultLoader, load_yaml, load_yaml_dict
33 changes: 33 additions & 0 deletions cognite/extractorutils/configtools/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@

@dataclass
class CertificateConfig:
"""
Configuration parameters for certificates
"""

path: str
password: Optional[str]
authority_url: Optional[str] = None
Expand Down Expand Up @@ -94,6 +98,11 @@ class ConnectionConfig:

@dataclass
class EitherIdConfig:
"""
Configuration parameter representing an ID in CDF, which can either be an external or internal ID.
An EitherId can only hold one ID type, not both.
"""

id: Optional[int]
external_id: Optional[str]

Expand All @@ -103,6 +112,10 @@ def either_id(self) -> EitherId:


class TimeIntervalConfig(yaml.YAMLObject):
"""
Configuration parameter for setting a time interval
"""

def __init__(self, expression: str) -> None:
self._interval, self._expression = TimeIntervalConfig._parse_expression(expression)

Expand Down Expand Up @@ -167,6 +180,10 @@ def __repr__(self) -> str:


class FileSizeConfig(yaml.YAMLObject):
"""
Configuration parameter for setting a file size
"""

def __init__(self, expression: str) -> None:
self._bytes, self._expression = FileSizeConfig._parse_expression(expression)

Expand Down Expand Up @@ -569,23 +586,39 @@ class BaseConfig(_BaseConfig):

@dataclass
class RawDestinationConfig:
"""
Configuration parameters for using Raw
"""

database: str
table: str


@dataclass
class RawStateStoreConfig(RawDestinationConfig):
"""
Configuration of a state store based on CDF RAW
"""

upload_interval: TimeIntervalConfig = TimeIntervalConfig("30s")


@dataclass
class LocalStateStoreConfig:
"""
Configuration of a state store using a local JSON file
"""

path: str
save_interval: TimeIntervalConfig = TimeIntervalConfig("30s")


@dataclass
class StateStoreConfig:
"""
Configuration of the State Store, containing ``LocalStateStoreConfig`` or ``RawStateStoreConfig``
"""

raw: Optional[RawStateStoreConfig] = None
local: Optional[LocalStateStoreConfig] = None

Expand Down
4 changes: 4 additions & 0 deletions cognite/extractorutils/configtools/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ class KeyVaultAuthenticationMethod(Enum):


class KeyVaultLoader:
"""
Class responsible for configuring keyvault for clients using Azure
"""

def __init__(self, config: Optional[dict]):
self.config = config

Expand Down
2 changes: 1 addition & 1 deletion cognite/extractorutils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

class InvalidConfigError(Exception):
"""
Exception thrown from ``load_yaml`` if config file is invalid. This can be due to
Exception thrown from ``load_yaml`` and ``load_yaml_dict`` if config file is invalid. This can be due to
* Missing fields
* Incompatible types
Expand Down
27 changes: 14 additions & 13 deletions cognite/extractorutils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class EitherId:
Args:
id: Internal ID
externalId or external_id: external ID
external_id: external ID. It can be `external_id` or `externalId`
Raises:
TypeError: If none or both of the ID types are set.
Expand Down Expand Up @@ -197,24 +197,25 @@ def add_extraction_pipeline(
added_message: str = "",
) -> Callable[[Callable[..., _T1]], Callable[..., _T1]]:
"""
This is to be used as a decorator for extractor functions to add extraction pipeline information
This is to be used as a decorator for extractor functions to add extraction pipeline information.
Args:
extraction_pipeline_ext_id:
cognite_client:
heartbeat_waiting_time:
added_message:
extraction_pipeline_ext_id: External ID of the extraction pipeline
cognite_client: Client to use when communicating with CDF
heartbeat_waiting_time: Target interval between heartbeats, in seconds
Usage:
If you have a function named "extract_data(*args, **kwargs)" and want to connect it to an extraction
pipeline, you can use this decorator function as:
@add_extraction_pipeline(
extraction_pipeline_ext_id=<INSERT EXTERNAL ID>,
cognite_client=<INSERT COGNITE CLIENT OBJECT>,
logger=<INSERT LOGGER>,
)
def extract_data(*args, **kwargs):
<INSERT FUNCTION BODY>
.. code-block:: python
@add_extraction_pipeline(
extraction_pipeline_ext_id=<INSERT EXTERNAL ID>,
cognite_client=<INSERT COGNITE CLIENT OBJECT>,
)
def extract_data(*args, **kwargs):
<INSERT FUNCTION BODY>
"""

# TODO 1. Consider refactoring this decorator to share methods with the Extractor context manager in .base.py
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = "en"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
Expand Down
56 changes: 44 additions & 12 deletions docs/source/docs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ Base class for extractors

The ``configtools`` module consists of tools for loading and verifying config files for extractors.

Extractor configurasions are conventionally written in *hyphen-cased YAML*. These are typically loaded and serialized as *dataclasses* in Python.
Extractor configurations are conventionally written in *hyphen-cased YAML*. These are typically loaded and serialized as *dataclasses* in Python.


Config loader
^^^^^^^^^^^^^
.. autofunction:: cognite.extractorutils.configtools.load_yaml
.. autofunction:: cognite.extractorutils.configtools.load_yaml_dict
.. autoclass:: cognite.extractorutils.configtools.KeyVaultAuthenticationMethod
.. autoclass:: cognite.extractorutils.configtools.KeyVaultLoader


Base classes
Expand Down Expand Up @@ -55,6 +58,14 @@ The ``configtools`` module contains several prebuilt config classes for many com
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.CogniteConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.EitherIdConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.ConnectionConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.AuthenticatorConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.CertificateConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.LoggingConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.MetricsConfig
Expand All @@ -67,6 +78,12 @@ The ``configtools`` module contains several prebuilt config classes for many com
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.LocalStateStoreConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.TimeIntervalConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.FileSizeConfig
:undoc-members:
.. autoclass:: cognite.extractorutils.configtools.ConfigType
:undoc-members:


Exceptions
Expand All @@ -75,17 +92,6 @@ Exceptions
.. autoexception:: cognite.extractorutils.configtools.InvalidConfigError


``throttle`` - Tools for throttling
-----------------------------------

.. automodule:: cognite.extractorutils.throttle
:members:
:undoc-members:
:inherited-members:
:show-inheritance:



``metrics`` - Automatic pushers of performance metrics
------------------------------------------------------

Expand Down Expand Up @@ -115,6 +121,32 @@ Exceptions
:inherited-members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader._base
:members:
:undoc-members:
:inherited-members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader.assets
:members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader.events
:members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader.files
:members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader.raw
:members:
:show-inheritance:

.. automodule:: cognite.extractorutils.uploader.time_series
:members:
:show-inheritance:


``util`` - Miscellaneous utilities
----------------------------------
Expand Down
6 changes: 4 additions & 2 deletions docs/source/package.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ The final step is to ship our extractor. Sometimes we can depend on a Python in
our extractor in, in other cases our extractor needs to be fully self-contained. How we ship our extractor will differ
slightly between those scenarios.

When developing the extractor, running it is fairly easy:
When developing the extractor, running it is fairly easy: [#]_

.. code-block:: bash
poetry run <extractor_name>
poetry run <extractor_name> <config-file>
Sometimes we could just send our project to the production environment like this, for example by cloning the git repo
(which would also make updating to future versions very easy). However, when shipping the extractor to a production
Expand Down Expand Up @@ -68,6 +68,8 @@ executable you have to run ``cogex build`` from Windows, likewise for Linux or M

The resulting executable will contain your extractor, all dependencies [#]_ and the Python runtime.

.. [#] Replace ``<extractor_name>`` with the name of your specific extractor, and ``<config-file>`` with the (absolute or
relative) path to a config file.
.. [#] Note that this could be in violation of some licenses, particularly GPL or its derivatives. Make sure that the
licenses of your dependencies allow this type of linking.
2 changes: 1 addition & 1 deletion docs/source/read_csv.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ If we wanted to make our extractor even more generic, we could have made the ``d

We now have a ``reader`` object. This is an iterator that will return each row in the CSV file as a dictionary where
column names are keys and the values of the row are the values. We can now iterate over this reader and add all the rows to the upload
queue using the :meth:`add_to_upload_queue <cognite.extractorutils.uploader.RawUploadQueue.add_to_upload_queue>` method.
queue using the :meth:`add_to_upload_queue <cognite.extractorutils.uploader.raw.RawUploadQueue.add_to_upload_queue>` method.

.. code-block:: python
Expand Down
33 changes: 24 additions & 9 deletions docs/source/uploader.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ Using an upload queue
---------------------

We begin by looking at the upload queue. Since this extractor will write to CDF RAW, we will use the
:meth:`RawUploadQueue <cognite.extractorutils.uploader.RawUploadQueue>`. Similar queues exists for time series data
points, events, sequence rows and files.
:meth:`RawUploadQueue <cognite.extractorutils.uploader.raw.RawUploadQueue>`. Similar queues exist for
:meth:`time series data points <cognite.extractorutils.uploader.time_series.TimeSeriesUploadQueue>`,
:meth:`events <cognite.extractorutils.uploader.events.EventUploadQueue>`,
:meth:`assets <cognite.extractorutils.uploader.assets.AssetUploadQueue>`,
:meth:`sequence rows <cognite.extractorutils.uploader.time_series.SequenceUploadQueue>`
and :meth:`files <cognite.extractorutils.uploader.files.FileUploadQueue>`.

The reason for using an upload queue is to batch together data into larger requests to CDF. This will increase
performance, sometimes quite dramatically since network latencies can often be a bottleneck for extractors. We can add
Expand All @@ -27,7 +31,7 @@ manager (with a ``with`` statement). The advantage is that we can then set one o
happen (such as every *x* seconds or when the queue has *y* elements in it), and don't think about it again. If the
queue is not empty when we exit the context, a final upload will be made to make sure no data is left behind.

To create a :meth:`RawUploadQueue <cognite.extractorutils.uploader.RawUploadQueue>`, we write therefore start with
To create a :meth:`RawUploadQueue <cognite.extractorutils.uploader.raw.RawUploadQueue>`, we therefore start with

.. code-block:: python
Expand All @@ -47,7 +51,7 @@ function. This function is called from the ``__main__.py`` file, and is provided

* ``cognite`` is an initiated ``CogniteClient`` that is set up to use the CDF project that the user configured in
their config file.
* ``states`` is a state store object, we will not cover these in this tutorial, but in short it allows us to keep track
* ``states`` is a :mod:`State Store <cognite.extractorutils.statestore>` object; we will not cover these in this tutorial, but in short it allows us to keep track
of extraction state between runs to avoid duplicate work
* ``config`` is the config file the user has provided, which has been loaded and stored as an instance of the Config
class we made in the :ref:`Read CSV files` chapter.
Expand All @@ -72,6 +76,9 @@ this:
extract_file(file, queue)
This will call the ``start()`` and ``stop()`` methods from :meth:`AbstractUploadQueue <cognite.extractorutils.uploader._base.AbstractUploadQueue>`
class automatically once all files are processed or the limit of the queue, defined by the ``max_queue_size`` keyword argument, is reached.


Extraction pipeline runs
------------------------
Expand Down Expand Up @@ -104,8 +111,16 @@ section of the config file, containing either an ``external-id`` or (internal) `
.. code-block:: yaml
cognite:
project: publicdata
api-key: ${COGNITE_API_KEY}
extraction-pipeline:
external-id: abc123
# Read these from environment variables
host: ${COGNITE_BASE_URL}
project: ${COGNITE_PROJECT}
idp-authentication:
token-url: ${COGNITE_TOKEN_URL}
client-id: ${COGNITE_CLIENT_ID}
secret: ${COGNITE_CLIENT_SECRET}
scopes:
- ${COGNITE_BASE_URL}/.default
extraction-pipeline:
external-id: abc123

0 comments on commit 30d7143

Please sign in to comment.