Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dropna option to RawRowsAPI.insert_dataframe #1789

Merged
merged 8 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [7.45.0] - 2024-05-28
## [7.46.0] - 2024-05-31
### Added
- `RawRowsAPI.insert_dataframe` now has a new `dropna` setting (defaulting to `True`, since NaN values in the dataframe would otherwise cause the insert to fail later).

## [7.45.0] - 2024-05-31
### Added
- DatapointsAPI now supports `timezone` and new calendar-based granularities like `month`, `quarter` and `year`.
These API features are in beta, and the SDK implementation in alpha, meaning breaking changes can
Expand Down
43 changes: 37 additions & 6 deletions cognite/client/_api/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import random
import threading
import time
from collections import deque
from collections import defaultdict, deque
from typing import TYPE_CHECKING, Any, Iterator, Sequence, cast, overload

from cognite.client._api_client import APIClient
from cognite.client._constants import _RUNNING_IN_BROWSER, DEFAULT_LIMIT_READ
from cognite.client.data_classes import Database, DatabaseList, Row, RowList, RowWrite, Table, TableList
from cognite.client.data_classes.raw import RowCore
from cognite.client.utils._auxiliary import (
find_duplicates,
interpolate_and_url_encode,
is_finite,
is_unlimited,
Expand Down Expand Up @@ -523,7 +524,12 @@ def insert(
)

def insert_dataframe(
    self,
    db_name: str,
    table_name: str,
    dataframe: pd.DataFrame,
    ensure_parent: bool = False,
    dropna: bool = True,
) -> None:
    """`Insert pandas dataframe into a table <https://developer.cognite.com/api#tag/Raw/operation/postRows>`_

    Args:
        db_name (str): Name of the database.
        table_name (str): Name of the table.
        dataframe (pd.DataFrame): The dataframe to insert. Index will be used as row keys.
        ensure_parent (bool): Create database/table if they don't already exist.
        dropna (bool): Remove NaNs (but keep None's in dtype=object columns) before inserting. Done individually per column. Default: True

    Raises:
        ValueError: If the dataframe index (used as row keys) or its columns are not unique.

    Examples:

        Insert new rows into a table:

            >>> import pandas as pd
            >>> from cognite.client import CogniteClient
            >>>
            >>> client = CogniteClient()
            >>> df = pd.DataFrame(
            ...     {"col-a": [1, 3, None], "col-b": [2, -1, 9]},
            ...     index=["r1", "r2", "r3"])
            >>> res = client.raw.rows.insert_dataframe(
            ...     "db1", "table1", df, dropna=True)
    """
    # The index becomes the row keys, so duplicates would silently overwrite each other.
    if not dataframe.index.is_unique:
        raise ValueError("Dataframe index is not unique (used for the row keys)")
    # Duplicated columns would collide on the JSON column keys; fail loudly instead.
    elif not dataframe.columns.is_unique:
        raise ValueError(f"Dataframe columns are not unique: {sorted(find_duplicates(dataframe.columns))}")

    # NaNs are not valid JSON and would make the API reject the rows later, hence dropna=True default.
    rows = self._df_to_rows_skip_nans(dataframe) if dropna else dataframe.to_dict(orient="index")
    self.insert(db_name=db_name, table_name=table_name, row=rows, ensure_parent=ensure_parent)

@staticmethod
def _df_to_rows_skip_nans(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
np = local_import("numpy")
rows: defaultdict[str, dict[str, Any]] = defaultdict(dict)
object_cols = df.select_dtypes("object").columns

for column_id, col in df.items():
if column_id not in object_cols:
col = col.dropna()
else:
# pandas treat None as NaN, but numpy does not:
mask = np.logical_or(col.to_numpy() == None, col.notna()) # noqa: E711
col = col[mask]

for idx, val in col.items():
rows[idx][column_id] = val
return dict(rows)

def _process_row_input(self, row: Sequence[Row] | Sequence[RowWrite] | Row | RowWrite | dict) -> list[list[dict]]:
assert_type(row, "row", [Sequence, dict, RowCore])
rows = []
Expand Down
2 changes: 1 addition & 1 deletion cognite/client/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.45.0"
__version__ = "7.46.0"
__api_subversion__ = "20230101"
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.45.0"
version = "7.46.0"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
Expand Down Expand Up @@ -31,8 +31,8 @@ protobuf = ">=4"
pip = ">=20.0.0" # make optional once poetry doesn't auto-remove it on "simple install"
typing_extensions = ">= 4"
backports-zoneinfo = { version = ">=0.2.1", python = "<3.9" }
# Windows does not have a ANSI database and need tzdata
tzdata = { version = ">=2024.1", markers = "platform_system == 'Windows'" }
# Windows does not have a ANSI database and need tzdata... pyodide also needs it:
tzdata = { version = ">=2024.1", markers = "platform_system == 'Windows' or platform_system == 'Emscripten'" }
numpy = [
{ version = ">=1.20, <1.25", python = "~3.8", optional = true },
{ version = "^1.25", python = ">=3.9, <3.12", optional = true },
Expand All @@ -59,7 +59,7 @@ geo = ["geopandas", "shapely"]
sympy = ["sympy"]
functions = ["pip"]
yaml = ["PyYAML"]
pyodide = ["pyodide-http"] # keep pyodide related dependencies outside of 'all'
pyodide = ["pyodide-http", "tzdata"] # keep pyodide related dependencies outside of 'all'
all = ["numpy", "pandas", "geopandas", "shapely", "sympy", "pip", "PyYAML"]

[tool.poetry.group.dev.dependencies]
Expand Down
47 changes: 46 additions & 1 deletion tests/tests_unit/test_api/test_raw.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import math
import re

import pytest

from cognite.client._api.raw import Database, DatabaseList, Row, RowList, Table, TableList
from cognite.client._api.raw import Database, DatabaseList, RawRowsAPI, Row, RowList, Table, TableList
from cognite.client.exceptions import CogniteAPIError
from tests.utils import jsgz_load

Expand Down Expand Up @@ -322,6 +323,50 @@ def test_raw_row__direct_column_access():
del row["wrong-key"]


@pytest.mark.dsl
def test_insert_dataframe_raises_on_duplicated_cols(cognite_client):
    import pandas as pd

    # Duplicated labels cannot be expressed as dict keys, so build the frame
    # from rows and assign the (duplicated) column labels explicitly.
    row_data = [
        [1, 1, 10, 10, 100, 100],
        [2, 2, 20, 20, 200, 200],
        [3, 3, 30, 30, 300, 300],
    ]
    df = pd.DataFrame(row_data, columns=["a", "b", "a", "c", "a", "b"])

    # Only the duplicated labels ("a" and "b") should be reported, sorted.
    with pytest.raises(ValueError, match=r"^Dataframe columns are not unique: \['a', 'b'\]$"):
        cognite_client.raw.rows.insert_dataframe("db", "tbl", df)


@pytest.mark.dsl
def test_df_to_rows_skip_nans():
    import numpy as np
    import pandas as pd

    frame = pd.DataFrame(
        {
            "a": [1, None, 3],
            "b": [1, 2, None],
            "c": [10, 20, 30],
            "d": [math.inf, 20, 30],
            "e": [100, 200, np.nan],
            "f": [None, None, None],
        }
    )
    # "f" is an object column: its None entries should be kept, but a genuine
    # NaN placed into it must still be removed.
    frame.at[1, "f"] = math.nan

    expected = {
        0: {"a": 1.0, "b": 1.0, "c": 10, "d": math.inf, "e": 100.0, "f": None},
        1: {"b": 2.0, "c": 20, "d": 20.0, "e": 200.0},
        2: {"a": 3.0, "c": 30, "d": 30.0, "f": None},
    }
    assert RawRowsAPI._df_to_rows_skip_nans(frame) == expected


@pytest.mark.dsl
class TestPandasIntegration:
def test_dbs_to_pandas(self):
Expand Down