From 35eeac562f7939ac9bfedd5191e6919171681831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20V=2E=20Treider?= Date: Thu, 21 Dec 2023 10:30:12 +0100 Subject: [PATCH] Add `expand_properties` flag to `to_pandas` for Node, Edge, NodeList and EdgeList (#1555) --- .github/workflows/build.yml | 12 +-- .github/workflows/release.yaml | 5 +- CHANGELOG.md | 6 ++ cognite/client/_version.py | 2 +- .../client/data_classes/data_modeling/core.py | 62 +++++++++++++- .../data_classes/data_modeling/instances.py | 70 ++++++++++++++-- pyproject.toml | 2 +- .../test_data_models/test_instances.py | 82 +++++++++++++++++++ 8 files changed, 221 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d6588a57c1..088e8b1df2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,8 +24,7 @@ jobs: poetry install -E numpy - name: Linting and static code checks - run: | - pre-commit run --all-files + run: pre-commit run --all-files - name: Verify proto files env: @@ -70,8 +69,7 @@ jobs: poetry install - name: Test core - run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' - --test-deps-only-core + run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' --test-deps-only-core test_full: runs-on: ${{ matrix.os }} @@ -101,8 +99,7 @@ jobs: COGNITE_PROJECT: python-sdk-test COGNITE_BASE_URL: https://greenfield.cognitedata.com COGNITE_CLIENT_NAME: python-sdk-integration-tests - run: | - pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 + run: pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20 - uses: codecov/codecov-action@v3 with: @@ -118,8 +115,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install full dependencies - run: | - python3 -m pip install --upgrade pip poetry + run: python3 -m pip install --upgrade pip poetry - name: Build package run: poetry build diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9e34942181..c584d34751 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -52,8 +52,7 @@ jobs: poetry install - name: Test core - run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' - --test-deps-only-core + run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' --test-deps-only-core test_full: runs-on: ${{ matrix.os }} @@ -83,7 +82,7 @@ jobs: COGNITE_PROJECT: python-sdk-test COGNITE_BASE_URL: https://greenfield.cognitedata.com COGNITE_CLIENT_NAME: python-sdk-integration-tests - run: pytest tests --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20 + run: pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20 - uses: codecov/codecov-action@v3 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dde308e53..23ea16d6f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,12 @@ Changes are grouped as follows - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. +## [7.8.0] - 2023-12-20 +### Added +- Instance classes `Node`, `Edge`, `NodeList` and `EdgeList` now supports a new flag `expand_properties` in their `to_pandas` method, + that makes it much simpler to work with the fetched properties. Additionally, `remove_property_prefix` allows easy prefix + removal (of the view ID, e.g. `space.external_id/version.my_prop` -> `my_prop`). 
+
 ## [7.7.1] - 2023-12-20
 ### Fixed
 - Missing legacy capability ACLs: `modelHostingAcl` and `genericsAcl`.
diff --git a/cognite/client/_version.py b/cognite/client/_version.py
index a98e1c887d..91e2bab438 100644
--- a/cognite/client/_version.py
+++ b/cognite/client/_version.py
@@ -1,4 +1,4 @@
 from __future__ import annotations
 
-__version__ = "7.7.1"
+__version__ = "7.8.0"
 __api_subversion__ = "V20220125"
diff --git a/cognite/client/data_classes/data_modeling/core.py b/cognite/client/data_classes/data_modeling/core.py
index d770e5e587..1ddd881ca6 100644
--- a/cognite/client/data_classes/data_modeling/core.py
+++ b/cognite/client/data_classes/data_modeling/core.py
@@ -1,16 +1,26 @@
 from __future__ import annotations
 
 import json
+import warnings
 from abc import ABC
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Generic, Literal
 
 from typing_extensions import Self
 
-from cognite.client.data_classes._base import CogniteObject, CogniteResource, basic_instance_dump
+from cognite.client.data_classes._base import (
+    CogniteObject,
+    CogniteResource,
+    CogniteResourceList,
+    T_CogniteResource,
+    basic_instance_dump,
+)
 from cognite.client.utils._auxiliary import json_dump_default
+from cognite.client.utils._importing import local_import
 from cognite.client.utils._text import convert_all_keys_to_snake_case
 
 if TYPE_CHECKING:
+    import pandas as pd
+
     from cognite.client import CogniteClient
 
 
@@ -34,6 +44,54 @@ def _load(cls, resource: dict[str, Any], cognite_client: CogniteClient | None =
         return cls(**convert_all_keys_to_snake_case(resource))
 
 
+class DataModelingInstancesList(CogniteResourceList, Generic[T_CogniteResource]):
+    def to_pandas(  # type: ignore [override]
+        self,
+        camel_case: bool = False,
+        convert_timestamps: bool = True,
+        expand_properties: bool = False,
+        remove_property_prefix: bool = True,
+        **kwargs: Any,
+    ) -> pd.DataFrame:
+        """Convert the instances into a pandas DataFrame. Note that if the properties column is expanded and there are
+        property keys that already exist as columns in the DataFrame, then an error will be raised by pandas during joining.
+
+        Args:
+            camel_case (bool): Convert column names to camel case (e.g. `externalId` instead of `external_id`). Does not apply to properties.
+            convert_timestamps (bool): Convert known columns storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
+            expand_properties (bool): Expand the properties into separate columns. Note: Will change default to True in the next major version.
+            remove_property_prefix (bool): Remove view ID prefix from column names of expanded properties. Requires data to be from a single view.
+            **kwargs (Any): For backwards compatibility.
+
+        Returns:
+            pd.DataFrame: The Cognite resource as a dataframe.
+ """ + kwargs.pop("expand_metadata", None), kwargs.pop("metadata_prefix", None) + if kwargs: + raise TypeError(f"Unsupported keyword arguments: {kwargs}") + if not expand_properties: + warnings.warn( + "Keyword argument 'expand_properties' will change default from False to True in the next major version.", + DeprecationWarning, + ) + df = super().to_pandas(camel_case=camel_case, expand_metadata=False, convert_timestamps=convert_timestamps) + if not expand_properties or "properties" not in df.columns: + return df + + prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2) + if remove_property_prefix: + # We only do/allow this if we have a single source: + view_id, *extra = set(vid for item in self for vid in item.properties) + if not extra: + prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple())) + else: + warnings.warn( + "Can't remove view ID prefix from expanded property columns as source was not unique", + RuntimeWarning, + ) + return df.join(prop_df) + + class DataModelingSort(CogniteObject): def __init__( self, diff --git a/cognite/client/data_classes/data_modeling/instances.py b/cognite/client/data_classes/data_modeling/instances.py index 0af6446208..2993001581 100644 --- a/cognite/client/data_classes/data_modeling/instances.py +++ b/cognite/client/data_classes/data_modeling/instances.py @@ -1,6 +1,7 @@ from __future__ import annotations import threading +import warnings from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass @@ -29,7 +30,11 @@ from cognite.client.data_classes._base import CogniteResourceList from cognite.client.data_classes.aggregations import AggregatedNumberedValue from cognite.client.data_classes.data_modeling._validation import validate_data_modeling_identifier -from cognite.client.data_classes.data_modeling.core import DataModelingResource, DataModelingSort +from cognite.client.data_classes.data_modeling.core import ( + DataModelingInstancesList, + DataModelingResource, + DataModelingSort, +) from cognite.client.data_classes.data_modeling.data_types import ( DirectRelationReference, ) @@ -41,10 +46,14 @@ ViewId, ViewIdentifier, ) +from cognite.client.utils._importing import local_import from cognite.client.utils._text import convert_all_keys_to_snake_case if TYPE_CHECKING: + import pandas as pd + from cognite.client import CogniteClient + PropertyValue: TypeAlias = Union[ str, int, @@ -181,7 +190,7 @@ def load( raise ValueError("View id must be in the format /") view_id = ViewId.load((space, *view_tuple)) props[view_id] = properties - return Properties(props) + return cls(props) def dump(self) -> dict[Space, dict[str, dict[PropertyIdentifier, PropertyValue]]]: props: dict[Space, dict[str, dict[PropertyIdentifier, PropertyValue]]] = defaultdict(dict) @@ -287,6 +296,57 @@ def dump(self, camel_case: bool = True) -> dict[str, Any]: dumped["properties"] = self.properties.dump() return dumped + def to_pandas( # type: ignore [override] + self, + ignore: list[str] | None = None, + camel_case: bool = False, + convert_timestamps: bool = True, + expand_properties: bool = False, + remove_property_prefix: bool = True, + **kwargs: Any, + ) -> pd.DataFrame: + """Convert the instance into a pandas DataFrame. + + Args: + ignore (list[str] | None): List of row keys to skip when converting to a data frame. Is applied before expansions. + camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). 
Does not affect properties if expanded.
+            convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
+            expand_properties (bool): Expand the properties into separate rows. Note: Will change default to True in the next major version.
+            remove_property_prefix (bool): Remove view ID prefix from row names of expanded properties (in index). Requires data to be from a single view.
+            **kwargs (Any): For backwards compatibility.
+
+        Returns:
+            pd.DataFrame: The dataframe.
+        """
+        kwargs.pop("expand_metadata", None), kwargs.pop("metadata_prefix", None)
+        if kwargs:
+            raise TypeError(f"Unsupported keyword arguments: {kwargs}")
+        if not expand_properties:
+            warnings.warn(
+                "Keyword argument 'expand_properties' will change default from False to True in the next major version.",
+                DeprecationWarning,
+            )
+        df = super().to_pandas(
+            expand_metadata=False, ignore=ignore, camel_case=camel_case, convert_timestamps=convert_timestamps
+        )
+        if not expand_properties or "properties" not in df.index:
+            return df
+
+        pd = local_import("pandas")
+        col = df.squeeze()
+        prop_df = pd.json_normalize(col.pop("properties"), max_level=2)
+        if remove_property_prefix:
+            # We only do/allow this if we have a single source:
+            view_id, *extra = self.properties.keys()
+            if not extra:
+                prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
+            else:
+                warnings.warn(
+                    "Can't remove view ID prefix from expanded property rows as source was not unique",
+                    RuntimeWarning,
+                )
+        return pd.concat((col, prop_df.T.squeeze())).to_frame(name="value")
+
     @abstractmethod
     def as_apply(self, source: ViewIdentifier | ContainerIdentifier, existing_version: int) -> InstanceApply:
         """Convert the instance to an apply instance."""
@@ -326,7 +386,7 @@ def __init__(
 
 
 class InstanceAggregationResult(DataModelingResource):
-    """A node or edge. This represents the update on the instance.
+    """Represents instances aggregation results.
 
     Args:
         aggregates (list[AggregatedNumberedValue]): List of aggregated values.
@@ -748,7 +808,7 @@ def as_ids(self) -> list[NodeId]:
         return [node.as_id() for node in self]
 
 
-class NodeList(CogniteResourceList[Node]):
+class NodeList(DataModelingInstancesList[Node]):
     _RESOURCE = Node
 
     def as_ids(self) -> list[NodeId]:
         return [node.as_id() for node in self]
@@ -795,7 +855,7 @@ def as_ids(self) -> list[EdgeId]:
         return [edge.as_id() for edge in self]
 
 
-class EdgeList(CogniteResourceList[Edge]):
+class EdgeList(DataModelingInstancesList[Edge]):
     _RESOURCE = Edge
 
     def as_ids(self) -> list[EdgeId]:
         return [edge.as_id() for edge in self]
diff --git a/pyproject.toml b/pyproject.toml
index d5be0cce98..c42a70c9aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "cognite-sdk"
-version = "7.7.1"
+version = "7.8.0"
 description = "Cognite Python SDK"
 readme = "README.md"
 documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
diff --git a/tests/tests_unit/test_data_classes/test_data_models/test_instances.py b/tests/tests_unit/test_data_classes/test_data_models/test_instances.py
index fbcc230f90..00c56c5d5a 100644
--- a/tests/tests_unit/test_data_classes/test_data_models/test_instances.py
+++ b/tests/tests_unit/test_data_classes/test_data_models/test_instances.py
@@ -1,10 +1,19 @@
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
 from cognite.client.data_classes.data_modeling import (
     ContainerId,
     DirectRelationReference,
+    Edge,
     EdgeApply,
+    EdgeList,
     Node,
     NodeApply,
     NodeId,
+    NodeList,
     NodeOrEdgeData,
 )
 
@@ -135,3 +144,76 @@ def test_dump_and_load(self) -> None:
             "type": {"externalId": "someType", "space": "someSpace"},
             "version": 1,
         }
+
+
+@pytest.fixture
+def node_dumped() -> dict[str, Any]:
+    return {
+        "space": "craft",
+        "externalId": "xid",
+        "version": "V",
+        "lastUpdatedTime": 123,
+        "createdTime": 123,
+        "properties": {
+            "my-space": {"my-view/v8": {"num": "210113347", "jsån": {"why": "is", "this": "here"}, "title": "sir"}}
+        },
+    }
+
+
+@pytest.fixture
+def edge_dumped(node_dumped: dict[str, Any]) -> dict[str, Any]:
+    return {
+        **node_dumped,
+        "type": {"space": "sp", "externalId": "xid"},
+        "startNode": {"space": "spsp", "externalId": "xid2"},
+        "endNode": {"space": "spspsp", "externalId": "xid3"},
+    }
+
+
+@pytest.mark.dsl
+class TestInstancesToPandas:
+    @pytest.mark.parametrize("inst_cls", (Node, Edge))
+    def test_expand_properties(
+        self, node_dumped: dict[str, Any], edge_dumped: dict[str, Any], inst_cls: type[Node] | type[Edge]
+    ) -> None:
+        raw = node_dumped if inst_cls is Node else edge_dumped
+        # Need .copy() because load does inplace update of properties:
+        not_expanded = inst_cls._load(raw.copy()).to_pandas(expand_properties=False)
+        expanded = inst_cls._load(raw.copy()).to_pandas(expand_properties=True, remove_property_prefix=True)
+        expanded_with_prefix = inst_cls._load(raw.copy()).to_pandas(
+            expand_properties=True, remove_property_prefix=False
+        )
+
+        assert "properties" in not_expanded.index
+        assert "properties" not in expanded.index
+        assert "properties" not in expanded_with_prefix.index
+
+        assert raw["properties"] == not_expanded.loc["properties"].item()
+
+        for k, v in raw["properties"]["my-space"]["my-view/v8"].items():
+            assert v == expanded.loc[k].item()
+            assert v == expanded_with_prefix.loc[f"my-space.my-view/v8.{k}"].item()
+
+    @pytest.mark.parametrize("inst_cls", (NodeList, EdgeList))
+    def test_expand_properties__list_class(
+        self, node_dumped: dict[str, Any], edge_dumped: dict[str, Any], inst_cls: type[NodeList] | type[EdgeList]
+    ) -> None:
+        raw = node_dumped if inst_cls is NodeList else edge_dumped
+        # Need .copy() because load does inplace
update of properties: + not_expanded = inst_cls._load([raw.copy(), raw.copy()]).to_pandas(expand_properties=False) + expanded = inst_cls._load([raw.copy(), raw.copy()]).to_pandas( + expand_properties=True, remove_property_prefix=True + ) + expanded_with_prefix = inst_cls._load([raw.copy(), raw.copy()]).to_pandas( + expand_properties=True, remove_property_prefix=False + ) + + assert "properties" in not_expanded.columns + assert "properties" not in expanded.columns + assert "properties" not in expanded_with_prefix.columns + + assert raw["properties"] == not_expanded.loc[0, "properties"] + + for k, v in raw["properties"]["my-space"]["my-view/v8"].items(): + assert v == expanded.loc[0, k] + assert v == expanded_with_prefix.loc[0, f"my-space.my-view/v8.{k}"]
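
Usage sketch (editor's note, not part of the patch): a minimal example of how the new `expand_properties` and `remove_property_prefix` flags are intended to be used. It mirrors the `node_dumped` fixture and the `to_pandas` calls in the tests above; the payload is made up for illustration, and in practice instances would be fetched from CDF rather than built with `_load`. Requires pandas to be installed.

from cognite.client.data_classes.data_modeling import Node

# Hypothetical node payload, shaped like the node_dumped fixture in the tests above.
raw_node = {
    "space": "craft",
    "externalId": "xid",
    "version": "V",
    "lastUpdatedTime": 123,
    "createdTime": 123,
    "properties": {"my-space": {"my-view/v8": {"num": "210113347", "title": "sir"}}},
}

node = Node._load(raw_node)

# Current default: all properties stay packed into a single 'properties' row.
packed = node.to_pandas(expand_properties=False)

# New in 7.8.0: one row per property, with the 'my-space.my-view/v8.' prefix stripped
# because the data comes from a single view.
expanded = node.to_pandas(expand_properties=True, remove_property_prefix=True)

print(expanded)  # rows: space, external_id, ..., num, title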