feat: implementation to_pandas
doctrino committed Sep 4, 2024
1 parent 8c537f1 commit aa9488f
Showing 3 changed files with 112 additions and 4 deletions.
3 changes: 2 additions & 1 deletion cognite/client/data_classes/data_modeling/instances.py
@@ -1024,7 +1024,8 @@ def to_pandas( # type: ignore [override]
            prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2)
            if remove_property_prefix and not prop_df.empty:
                # We only do/allow this if we have a single source:
-               view_id, *extra = set(vid for item in self for vid in item.properties)
+               typed_view_ids = {item.get_source() for item in self if hasattr(item, "get_source")}
+               view_id, *extra = set(vid for item in self for vid in item.properties) | typed_view_ids
                if not extra:
                    prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
                else:
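For context on what the new typed_view_ids lookup feeds into: when a node list is expanded into property columns, each column is prefixed with "space.view/version.", and that prefix is only stripped when every item points at one and the same view. Below is a rough, standalone sketch of the stripping step with made-up space, view and version values; the real code builds the prefix from view_id.as_tuple().

# Illustration only: the space, view and version below are invented for this example.
view_id = ("sp_my_fixed_space", "Person", "1")  # (space, external_id, version)
prefix = "{}.{}/{}.".format(*view_id)  # "sp_my_fixed_space.Person/1."
columns = ["sp_my_fixed_space.Person/1.name", "sp_my_fixed_space.Person/1.birthDate"]
print([c.removeprefix(prefix) for c in columns])  # ['name', 'birthDate']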
55 changes: 53 additions & 2 deletions cognite/client/data_classes/data_modeling/typed_instances.py
@@ -19,10 +19,13 @@
    NodeApply,
    _serialize_property_value,
)
+from cognite.client.utils._importing import local_import
from cognite.client.utils._text import to_camel_case
-from cognite.client.utils._time import convert_data_modelling_timestamp
+from cognite.client.utils._time import TIME_ATTRIBUTES, convert_data_modelling_timestamp

if TYPE_CHECKING:
+    import pandas as pd

    from cognite.client import CogniteClient
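pandas is an optional dependency of the SDK, which is why the new code imports it lazily through local_import rather than at module level (the TYPE_CHECKING import above is for annotations only). A minimal sketch of the lazy-import idea follows; the helper name lazy_import and its signature are made up here, only the pattern is the point.

import importlib
from types import ModuleType

def lazy_import(name: str) -> ModuleType:
    # Defer the import until a pandas-dependent code path actually runs.
    return importlib.import_module(name)

pd = lazy_import("pandas")  # fails with ImportError only if/when pandas is missing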


@@ -157,6 +160,46 @@ def _load(cls, resource: dict[str, Any], cognite_client: CogniteClient | None =
        properties = all_properties.get(source.space, {}).get(source.as_source_identifier(), {})
        return cast(Self, _load_instance(cls, resource, properties, cls._instance_properties))

    def to_pandas(  # type: ignore [override]
        self,
        ignore: list[str] | None = None,
        camel_case: bool = False,
        convert_timestamps: bool = True,
        **kwargs: Any,
    ) -> pd.Series:
"""Convert the instance into a pandas DataFrame.
Args:
ignore (list[str] | None): List of row keys to skip when converting to a data frame. Is applied before expansions.
camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). Does not affect properties if expanded.
convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime.
**kwargs (Any): For backwards compatibility.
Returns:
pd.DataFrame: The instance as a pandas Series.
"""
pd = local_import("pandas")
for key in ["expand_metadata", "metadata_prefix", "expand_properties", "remove_property_prefix"]:
kwargs.pop(key, None)
if kwargs:
raise TypeError(f"Unsupported keyword arguments: {kwargs}")

dumped = super().dump(camel_case)
dumped.pop("properties", None)
properties = _dump_properties(
self, camel_case=camel_case, instance_properties=self._instance_properties, use_attribute_name=True
)
dumped.update(properties)

if convert_timestamps:
for k in TIME_ATTRIBUTES.intersection(dumped):
dumped[k] = pd.Timestamp(dumped[k], unit="ms")

for element in ignore or []:
dumped.pop(element, None)

return pd.Series(dumped)
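The convert_timestamps branch above relies on TIME_ATTRIBUTES, the SDK's set of attribute names that carry epoch-millisecond timestamps (created_time, last_updated_time, and similar). A standalone sketch of that conversion, with a hand-written stand-in for the constant and invented values:

import pandas as pd

TIME_ATTRIBUTES = {"created_time", "last_updated_time"}  # stand-in for the SDK constant
dumped = {"name": "John Doe", "created_time": 0, "last_updated_time": 1725408000000}

for key in TIME_ATTRIBUTES.intersection(dumped):
    dumped[key] = pd.Timestamp(dumped[key], unit="ms")

# created_time -> Timestamp("1970-01-01 00:00:00"), last_updated_time -> Timestamp("2024-09-04 00:00:00")
print(pd.Series(dumped))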


class TypedEdge(Edge, ABC):
    _instance_properties = frozenset(
@@ -274,9 +317,15 @@ def _get_properties_by_name(cls: type) -> dict[str, PropertyOptions]:
)


-def _dump_properties(obj: object, camel_case: bool, instance_properties: frozenset[str]) -> dict[str, Any]:
+def _dump_properties(
+    obj: object, camel_case: bool, instance_properties: frozenset[str], use_attribute_name: bool = False
+) -> dict[str, Any]:
    properties: dict[str, str | int | float | bool | dict | list] = {}
    properties_by_name = _get_properties_by_name(type(obj))
+    if use_attribute_name:
+        attribute_by_property = {v.name: k for k, v in properties_by_name.items()}
+    else:
+        attribute_by_property = {}
    for key, value in vars(obj).items():
        if key in instance_properties or value is None:
            continue
@@ -285,6 +334,8 @@ def _dump_properties(obj: object, camel_case: bool, instance_properties: frozens

        if key in properties_by_name:
            key = cast(str, properties_by_name[key].name)
+            if use_attribute_name and key in attribute_by_property:
+                key = attribute_by_property[key]

        if isinstance(value, Iterable) and not isinstance(value, (str, dict)):
            properties[key] = [_serialize_property_value(v, camel_case) for v in value]
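The use_attribute_name flag exists because to_pandas on a typed instance labels rows with the Python attribute names (for example birth_date) rather than the view property names (for example birthDate) that PropertyOptions maps them to when dumping. The reverse lookup added above is just an inverted dictionary; a simplified sketch with plain strings standing in for PropertyOptions entries and invented names:

properties_by_name = {"birth_date": "birthDate", "site_id": "siteId"}  # attribute -> property name
attribute_by_property = {prop: attr for attr, prop in properties_by_name.items()}

key = "birth_date"
key = properties_by_name.get(key, key)      # "birthDate", the name used in the data model
key = attribute_by_property.get(key, key)   # back to "birth_date" for the pandas index
print(key)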
@@ -2,7 +2,9 @@

from datetime import date

-from cognite.client.data_classes.data_modeling import DirectRelationReference, ViewId
+import pytest
+
+from cognite.client.data_classes.data_modeling import DirectRelationReference, NodeList, ViewId
from cognite.client.data_classes.data_modeling.typed_instances import (
    PropertyOptions,
    TypedEdge,
@@ -231,6 +233,60 @@ def test_dump_load_person(self) -> None:
        assert isinstance(loaded.birth_date, date)
        assert all(isinstance(sibling, DirectRelationReference) for sibling in loaded.siblings or [])

    @pytest.mark.dsl
    def test_to_pandas(self) -> None:
        import pandas as pd

        person = PersonRead(
            "sp_my_fixed_space", "my_external_id", 1, 0, 0, "John Doe", date(1990, 1, 1), "[email protected]"
        )

        df = person.to_pandas()

        pd.testing.assert_series_equal(
            df,
            pd.Series(
                {
                    "space": "sp_my_fixed_space",
                    "external_id": "my_external_id",
                    "version": 1,
                    "last_updated_time": pd.Timestamp("1970-01-01 00:00:00"),
                    "created_time": pd.Timestamp("1970-01-01 00:00:00"),
                    "instance_type": "node",
                    "name": "John Doe",
                    "birth_date": "1990-01-01",
                    "email": "[email protected]",
                }
            ),
        )
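A side note on the **kwargs handling exercised by this test: the four expansion-related keywords from the base to_pandas signature are accepted and silently dropped, while anything unrecognized raises. A small sketch reusing the person object from the test above (not an assertion made by this commit):

person.to_pandas(expand_properties=True)   # accepted for backwards compatibility, ignored here
person.to_pandas(unknown_argument=True)    # raises TypeError: Unsupported keyword arguments: {...}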

    @pytest.mark.dsl
    def test_to_pandas_list(self) -> None:
        import pandas as pd

        person = NodeList[PersonRead](
            [PersonRead("sp_my_fixed_space", "my_external_id", 1, 0, 0, "John Doe", date(1990, 1, 1), "[email protected]")]
        )

        df = person.to_pandas(expand_properties=True)

        pd.testing.assert_frame_equal(
            df,
            pd.DataFrame(
                {
                    "space": ["sp_my_fixed_space"],
                    "external_id": ["my_external_id"],
                    "version": [1],
                    "last_updated_time": [pd.Timestamp("1970-01-01 00:00:00")],
                    "created_time": [pd.Timestamp("1970-01-01 00:00:00")],
                    "instance_type": ["node"],
                    "name": ["John Doe"],
                    "birthDate": ["1990-01-01"],
                    "email": ["[email protected]"],
                }
            ),
        )


class TestTypedEdge:
    def test_dump_load_flow(self) -> None:
