Add expand_properties flag to to_pandas for Node, Edge, NodeList and EdgeList (#1555)
haakonvt authored Dec 21, 2023
1 parent 75a7bf8 commit 35eeac5
Showing 8 changed files with 221 additions and 20 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/build.yml
@@ -24,8 +24,7 @@ jobs:
poetry install -E numpy
- name: Linting and static code checks
run: |
pre-commit run --all-files
run: pre-commit run --all-files

- name: Verify proto files
env:
@@ -70,8 +69,7 @@ jobs:
poetry install
- name: Test core
run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl'
--test-deps-only-core
run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' --test-deps-only-core

test_full:
runs-on: ${{ matrix.os }}
@@ -101,8 +99,7 @@ jobs:
COGNITE_PROJECT: python-sdk-test
COGNITE_BASE_URL: https://greenfield.cognitedata.com
COGNITE_CLIENT_NAME: python-sdk-integration-tests
run: |
pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2
run: pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20

- uses: codecov/codecov-action@v3
with:
@@ -118,8 +115,7 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}

- name: Install full dependencies
run: |
python3 -m pip install --upgrade pip poetry
run: python3 -m pip install --upgrade pip poetry

- name: Build package
run: poetry build
5 changes: 2 additions & 3 deletions .github/workflows/release.yaml
@@ -52,8 +52,7 @@ jobs:
poetry install
- name: Test core
run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl'
--test-deps-only-core
run: pytest tests/tests_unit -n8 --dist loadscope --maxfail 10 -m 'not dsl' --test-deps-only-core

test_full:
runs-on: ${{ matrix.os }}
@@ -83,7 +82,7 @@ jobs:
COGNITE_PROJECT: python-sdk-test
COGNITE_BASE_URL: https://greenfield.cognitedata.com
COGNITE_CLIENT_NAME: python-sdk-integration-tests
run: pytest tests --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20
run: pytest tests --durations=10 --cov --cov-report xml:coverage.xml -n8 --dist loadscope --reruns 2 --maxfail 20

- uses: codecov/codecov-action@v3
with:
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,12 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [7.8.0] - 2023-12-20
### Added
- Instance classes `Node`, `Edge`, `NodeList` and `EdgeList` now support a new flag `expand_properties` in their `to_pandas` method,
  which makes it much simpler to work with the fetched properties. Additionally, `remove_property_prefix` allows easy removal of
  the view ID prefix (e.g. `space.external_id/version.my_prop` -> `my_prop`).

## [7.7.1] - 2023-12-20
### Fixed
- Missing legacy capability ACLs: `modelHostingAcl` and `genericsAcl`.
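The changelog entry above can be illustrated with plain pandas. This is a conceptual sketch only, not the SDK's actual call path: the nested `properties` payload mirrors this PR's test fixtures, and the view ID prefix is illustrative.

```python
import pandas as pd

# Illustrative nested 'properties' payload, shaped like this PR's test fixtures.
properties = {"my-space": {"my-view/v8": {"num": "210113347", "title": "sir"}}}

# expand_properties=True conceptually flattens the nesting into one column per property
# (max_level=2 flattens space -> view -> property, matching the SDK code below)...
prop_df = pd.json_normalize([properties], max_level=2)
print(list(prop_df.columns))  # ['my-space.my-view/v8.num', 'my-space.my-view/v8.title']

# ...and remove_property_prefix=True strips the 'space.external_id/version.' prefix:
prop_df.columns = prop_df.columns.str.removeprefix("my-space.my-view/v8.")
print(list(prop_df.columns))  # ['num', 'title']
```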
2 changes: 1 addition & 1 deletion cognite/client/_version.py
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.7.1"
__version__ = "7.8.0"
__api_subversion__ = "V20220125"
62 changes: 60 additions & 2 deletions cognite/client/data_classes/data_modeling/core.py
@@ -1,16 +1,26 @@
from __future__ import annotations

import json
import warnings
from abc import ABC
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any, Generic, Literal

from typing_extensions import Self

from cognite.client.data_classes._base import CogniteObject, CogniteResource, basic_instance_dump
from cognite.client.data_classes._base import (
CogniteObject,
CogniteResource,
CogniteResourceList,
T_CogniteResource,
basic_instance_dump,
)
from cognite.client.utils._auxiliary import json_dump_default
from cognite.client.utils._importing import local_import
from cognite.client.utils._text import convert_all_keys_to_snake_case

if TYPE_CHECKING:
import pandas as pd

from cognite.client import CogniteClient


@@ -34,6 +44,54 @@ def _load(cls, resource: dict[str, Any], cognite_client: CogniteClient | None =
return cls(**convert_all_keys_to_snake_case(resource))


class DataModelingInstancesList(CogniteResourceList, Generic[T_CogniteResource]):
def to_pandas( # type: ignore [override]
self,
camel_case: bool = False,
convert_timestamps: bool = True,
expand_properties: bool = False,
remove_property_prefix: bool = True,
**kwargs: Any,
) -> pd.DataFrame:
"""Convert the instances into a pandas DataFrame. Note that if the properties column is expanded and a property
key already exists as a column in the DataFrame, an error will be raised by pandas during joining.
Args:
camel_case (bool): Convert column names to camel case (e.g. `externalId` instead of `external_id`). Does not apply to properties.
convert_timestamps (bool): Convert known columns storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
expand_properties (bool): Expand the properties into separate columns. Note: Will change default to True in the next major version.
remove_property_prefix (bool): Remove view ID prefix from column names of expanded properties. Requires data to be from a single view.
**kwargs (Any): For backwards compatibility.
Returns:
pd.DataFrame: The Cognite resource as a dataframe.
"""
kwargs.pop("expand_metadata", None), kwargs.pop("metadata_prefix", None)
if kwargs:
raise TypeError(f"Unsupported keyword arguments: {kwargs}")
if not expand_properties:
warnings.warn(
"Keyword argument 'expand_properties' will change default from False to True in the next major version.",
DeprecationWarning,
)
df = super().to_pandas(camel_case=camel_case, expand_metadata=False, convert_timestamps=convert_timestamps)
if not expand_properties or "properties" not in df.columns:
return df

prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2)
if remove_property_prefix:
# We only do/allow this if we have a single source:
view_id, *extra = set(vid for item in self for vid in item.properties)
if not extra:
prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
else:
warnings.warn(
"Can't remove view ID prefix from expanded property columns as source was not unique",
RuntimeWarning,
)
return df.join(prop_df)


class DataModelingSort(CogniteObject):
def __init__(
self,
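The final step of `DataModelingInstancesList.to_pandas` is an index-aligned join of the expanded property frame back onto the base frame. A self-contained sketch under stated assumptions: the frame contents, column names, and view prefix are illustrative, not the SDK's actual output.

```python
import pandas as pd

# Base frame with instance attributes, as super().to_pandas() would produce (illustrative).
df = pd.DataFrame({"space": ["craft", "craft"], "externalId": ["xid", "xid"]})

# Expanded properties for two instances from a single view, then prefix-stripped:
props = {"my-space": {"my-view/v8": {"num": "210113347", "title": "sir"}}}
prop_df = pd.json_normalize([props, props], max_level=2)
prop_df.columns = prop_df.columns.str.removeprefix("my-space.my-view/v8.")

# Index-aligned join, as in the method above. Note: df.join raises ValueError if a
# property name collides with an existing column, which is the caveat the docstring states.
result = df.join(prop_df)
print(list(result.columns))  # ['space', 'externalId', 'num', 'title']
```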
70 changes: 65 additions & 5 deletions cognite/client/data_classes/data_modeling/instances.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import threading
import warnings
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
@@ -29,7 +30,11 @@
from cognite.client.data_classes._base import CogniteResourceList
from cognite.client.data_classes.aggregations import AggregatedNumberedValue
from cognite.client.data_classes.data_modeling._validation import validate_data_modeling_identifier
from cognite.client.data_classes.data_modeling.core import DataModelingResource, DataModelingSort
from cognite.client.data_classes.data_modeling.core import (
DataModelingInstancesList,
DataModelingResource,
DataModelingSort,
)
from cognite.client.data_classes.data_modeling.data_types import (
DirectRelationReference,
)
@@ -41,10 +46,14 @@
ViewId,
ViewIdentifier,
)
from cognite.client.utils._importing import local_import
from cognite.client.utils._text import convert_all_keys_to_snake_case

if TYPE_CHECKING:
import pandas as pd

from cognite.client import CogniteClient

PropertyValue: TypeAlias = Union[
str,
int,
@@ -181,7 +190,7 @@ def load(
raise ValueError("View id must be in the format <external_id>/<version>")
view_id = ViewId.load((space, *view_tuple))
props[view_id] = properties
return Properties(props)
return cls(props)

def dump(self) -> dict[Space, dict[str, dict[PropertyIdentifier, PropertyValue]]]:
props: dict[Space, dict[str, dict[PropertyIdentifier, PropertyValue]]] = defaultdict(dict)
@@ -287,6 +296,57 @@ def dump(self, camel_case: bool = True) -> dict[str, Any]:
dumped["properties"] = self.properties.dump()
return dumped

def to_pandas( # type: ignore [override]
self,
ignore: list[str] | None = None,
camel_case: bool = False,
convert_timestamps: bool = True,
expand_properties: bool = False,
remove_property_prefix: bool = True,
**kwargs: Any,
) -> pd.DataFrame:
"""Convert the instance into a pandas DataFrame.
Args:
ignore (list[str] | None): List of row keys to skip when converting to a data frame. Is applied before expansions.
camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). Does not affect properties if expanded.
convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
expand_properties (bool): Expand the properties into separate rows. Note: Will change default to True in the next major version.
remove_property_prefix (bool): Remove view ID prefix from the row names (index) of expanded properties. Requires data to be from a single view.
**kwargs (Any): For backwards compatibility.
Returns:
pd.DataFrame: The dataframe.
"""
kwargs.pop("expand_metadata", None), kwargs.pop("metadata_prefix", None)
if kwargs:
raise TypeError(f"Unsupported keyword arguments: {kwargs}")
if not expand_properties:
warnings.warn(
"Keyword argument 'expand_properties' will change default from False to True in the next major version.",
DeprecationWarning,
)
df = super().to_pandas(
expand_metadata=False, ignore=ignore, camel_case=camel_case, convert_timestamps=convert_timestamps
)
if not expand_properties or "properties" not in df.index:
return df

pd = local_import("pandas")
col = df.squeeze()
prop_df = pd.json_normalize(col.pop("properties"), max_level=2)
if remove_property_prefix:
# We only do/allow this if we have a single source:
view_id, *extra = self.properties.keys()
if not extra:
prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
else:
warnings.warn(
"Can't remove view ID prefix from expanded property rows as source was not unique",
RuntimeWarning,
)
return pd.concat((col, prop_df.T.squeeze())).to_frame(name="value")

@abstractmethod
def as_apply(self, source: ViewIdentifier | ContainerIdentifier, existing_version: int) -> InstanceApply:
"""Convert the instance to an apply instance."""
@@ -326,7 +386,7 @@ def __init__(


class InstanceAggregationResult(DataModelingResource):
"""A node or edge. This represents the update on the instance.
"""Represents the results of an instance aggregation.
Args:
aggregates (list[AggregatedNumberedValue]): List of aggregated values.
@@ -748,7 +808,7 @@ def as_ids(self) -> list[NodeId]:
return [node.as_id() for node in self]


class NodeList(CogniteResourceList[Node]):
class NodeList(DataModelingInstancesList[Node]):
_RESOURCE = Node

def as_ids(self) -> list[NodeId]:
@@ -795,7 +855,7 @@ def as_ids(self) -> list[EdgeId]:
return [edge.as_id() for edge in self]


class EdgeList(CogniteResourceList[Edge]):
class EdgeList(DataModelingInstancesList[Edge]):
_RESOURCE = Edge

def as_ids(self) -> list[EdgeId]:
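For a single instance, the new `to_pandas` pops `properties` out of the squeezed one-column frame, flattens them, and concatenates the result back as extra rows. A minimal sketch with illustrative values (this reproduces the mechanics, not the SDK's full method):

```python
import pandas as pd

# One-column frame squeezed to a Series, as in the instance to_pandas (values illustrative).
col = pd.Series(
    {
        "space": "craft",
        "externalId": "xid",
        "properties": {"my-space": {"my-view/v8": {"num": "210113347", "title": "sir"}}},
    }
)
# Pop the nested properties, flatten them, and strip the view ID prefix:
prop_df = pd.json_normalize(col.pop("properties"), max_level=2)
prop_df.columns = prop_df.columns.str.removeprefix("my-space.my-view/v8.")

# Transpose the 1-row property frame into a Series and append it as new rows:
df = pd.concat((col, prop_df.T.squeeze())).to_frame(name="value")
print(df.loc["title", "value"])  # sir
```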
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.7.1"
version = "7.8.0"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
@@ -1,10 +1,19 @@
from __future__ import annotations

from typing import Any

import pytest

from cognite.client.data_classes.data_modeling import (
ContainerId,
DirectRelationReference,
Edge,
EdgeApply,
EdgeList,
Node,
NodeApply,
NodeId,
NodeList,
NodeOrEdgeData,
)

@@ -135,3 +144,76 @@ def test_dump_and_load(self) -> None:
"type": {"externalId": "someType", "space": "someSpace"},
"version": 1,
}


@pytest.fixture
def node_dumped() -> dict[str, Any]:
return {
"space": "craft",
"externalId": "xid",
"version": "V",
"lastUpdatedTime": 123,
"createdTime": 123,
"properties": {
"my-space": {"my-view/v8": {"num": "210113347", "jsån": {"why": "is", "this": "here"}, "title": "sir"}}
},
}


@pytest.fixture
def edge_dumped(node_dumped: dict[str, Any]) -> dict[str, Any]:
return {
**node_dumped,
"type": {"space": "sp", "externalId": "xid"},
"startNode": {"space": "spsp", "externalId": "xid2"},
"endNode": {"space": "spspsp", "externalId": "xid3"},
}


@pytest.mark.dsl
class TestInstancesToPandas:
@pytest.mark.parametrize("inst_cls", (Node, Edge))
def test_expand_properties(
self, node_dumped: dict[str, Any], edge_dumped: dict[str, Any], inst_cls: type[Node] | type[Edge]
) -> None:
raw = node_dumped if inst_cls is Node else edge_dumped
# Need .copy() because load does inplace update of properties:
not_expanded = inst_cls._load(raw.copy()).to_pandas(expand_properties=False)
expanded = inst_cls._load(raw.copy()).to_pandas(expand_properties=True, remove_property_prefix=True)
expanded_with_prefix = inst_cls._load(raw.copy()).to_pandas(
expand_properties=True, remove_property_prefix=False
)

assert "properties" in not_expanded.index
assert "properties" not in expanded.index
assert "properties" not in expanded_with_prefix.index

assert raw["properties"] == not_expanded.loc["properties"].item()

for k, v in raw["properties"]["my-space"]["my-view/v8"].items():
assert v == expanded.loc[k].item()
assert v == expanded_with_prefix.loc[f"my-space.my-view/v8.{k}"].item()

@pytest.mark.parametrize("inst_cls", (NodeList, EdgeList))
def test_expand_properties__list_class(
self, node_dumped: dict[str, Any], edge_dumped: dict[str, Any], inst_cls: type[NodeList] | type[EdgeList]
) -> None:
raw = node_dumped if inst_cls is NodeList else edge_dumped
# Need .copy() because load does inplace update of properties:
not_expanded = inst_cls._load([raw.copy(), raw.copy()]).to_pandas(expand_properties=False)
expanded = inst_cls._load([raw.copy(), raw.copy()]).to_pandas(
expand_properties=True, remove_property_prefix=True
)
expanded_with_prefix = inst_cls._load([raw.copy(), raw.copy()]).to_pandas(
expand_properties=True, remove_property_prefix=False
)

assert "properties" in not_expanded.columns
assert "properties" not in expanded.columns
assert "properties" not in expanded_with_prefix.columns

assert raw["properties"] == not_expanded.loc[0, "properties"]

for k, v in raw["properties"]["my-space"]["my-view/v8"].items():
assert v == expanded.loc[0, k]
assert v == expanded_with_prefix.loc[0, f"my-space.my-view/v8.{k}"]
