Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dropna option to RawRowsAPI.insert_dataframe #1789

Merged
merged 8 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [7.45.0] - 2024-05-28
## [7.46.0] - 2024-05-31
### Added
- `RawRowsAPI.insert_dataframe` now has a new `dropna` setting (defaulting to `True`, since NaN values in the dataframe would otherwise cause the insert to fail later).

## [7.45.0] - 2024-05-31
### Added
- DatapointsAPI now supports `timezone` and new calendar-based granularities like `month`, `quarter` and `year`.
These API features are in beta, and the SDK implementation in alpha, meaning breaking changes can
Expand Down
43 changes: 37 additions & 6 deletions cognite/client/_api/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import random
import threading
import time
from collections import deque
from collections import defaultdict, deque
from typing import TYPE_CHECKING, Any, Iterator, Sequence, cast, overload

from cognite.client._api_client import APIClient
from cognite.client._constants import _RUNNING_IN_BROWSER, DEFAULT_LIMIT_READ
from cognite.client.data_classes import Database, DatabaseList, Row, RowList, RowWrite, Table, TableList
from cognite.client.data_classes.raw import RowCore
from cognite.client.utils._auxiliary import (
find_duplicates,
interpolate_and_url_encode,
is_finite,
is_unlimited,
Expand Down Expand Up @@ -523,7 +524,12 @@ def insert(
)

def insert_dataframe(
    self,
    db_name: str,
    table_name: str,
    dataframe: pd.DataFrame,
    ensure_parent: bool = False,
    dropna: bool = True,
) -> None:
    """`Insert pandas dataframe into a table <https://developer.cognite.com/api#tag/Raw/operation/postRows>`_

    Args:
        db_name (str): Name of the database.
        table_name (str): Name of the table.
        dataframe (pd.DataFrame): The dataframe to insert. Index will be used as row keys.
        ensure_parent (bool): Create database/table if they don't already exist.
        dropna (bool): Remove NaNs (but keep None's in dtype=object columns) before inserting. Done individually per column. Default: True

    Raises:
        ValueError: If the dataframe index (used as row keys) or its columns are not unique.

    Examples:

        Insert new rows into a table:

            >>> import pandas as pd
            >>> from cognite.client import CogniteClient
            >>>
            >>> client = CogniteClient()
            >>> df = pd.DataFrame(
            ...     {"col-a": [1, 3, None], "col-b": [2, -1, 9]},
            ...     index=["r1", "r2", "r3"])
            >>> res = client.raw.rows.insert_dataframe(
            ...     "db1", "table1", df, dropna=True)
    """
    # The index becomes the row keys, so duplicates would silently overwrite each other.
    if not dataframe.index.is_unique:
        raise ValueError("Dataframe index is not unique (used for the row keys)")
    # Duplicated columns would collide on the JSON column keys; fail loudly instead.
    elif not dataframe.columns.is_unique:
        raise ValueError(f"Dataframe columns are not unique: {sorted(find_duplicates(dataframe.columns))}")

    # NaNs are not valid JSON and would make the API reject the rows later, hence dropna=True default.
    rows = self._df_to_rows_skip_nans(dataframe) if dropna else dataframe.to_dict(orient="index")
    self.insert(db_name=db_name, table_name=table_name, row=rows, ensure_parent=ensure_parent)

@staticmethod
def _df_to_rows_skip_nans(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
np = local_import("numpy")
rows: defaultdict[str, dict[str, Any]] = defaultdict(dict)
object_cols = df.select_dtypes("object").columns

for column_id, col in df.items():
if column_id not in object_cols:
col = col.dropna()
else:
# pandas treat None as NaN, but numpy does not:
mask = np.logical_or(col.to_numpy() == None, col.notna()) # noqa: E711
col = col[mask]

for idx, val in col.items():
rows[idx][column_id] = val
return dict(rows)

def _process_row_input(self, row: Sequence[Row] | Sequence[RowWrite] | Row | RowWrite | dict) -> list[list[dict]]:
assert_type(row, "row", [Sequence, dict, RowCore])
rows = []
Expand Down
2 changes: 1 addition & 1 deletion cognite/client/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.45.0"
__version__ = "7.46.0"
__api_subversion__ = "20230101"
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.45.0"
version = "7.46.0"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
Expand Down Expand Up @@ -31,8 +31,8 @@ protobuf = ">=4"
pip = ">=20.0.0" # make optional once poetry doesn't auto-remove it on "simple install"
typing_extensions = ">= 4"
backports-zoneinfo = { version = ">=0.2.1", python = "<3.9" }
# Windows does not have a ANSI database and need tzdata
tzdata = { version = ">=2024.1", markers = "platform_system == 'Windows'" }
# Windows does not have a ANSI database and need tzdata... pyodide also needs it:
tzdata = { version = ">=2024.1", markers = "platform_system == 'Windows' or platform_system == 'Emscripten'" }
numpy = [
{ version = ">=1.20, <1.25", python = "~3.8", optional = true },
{ version = "^1.25", python = ">=3.9, <3.12", optional = true },
Expand All @@ -59,7 +59,7 @@ geo = ["geopandas", "shapely"]
sympy = ["sympy"]
functions = ["pip"]
yaml = ["PyYAML"]
pyodide = ["pyodide-http"] # keep pyodide related dependencies outside of 'all'
pyodide = ["pyodide-http", "tzdata"] # keep pyodide related dependencies outside of 'all'
all = ["numpy", "pandas", "geopandas", "shapely", "sympy", "pip", "PyYAML"]

[tool.poetry.group.dev.dependencies]
Expand Down
47 changes: 46 additions & 1 deletion tests/tests_unit/test_api/test_raw.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import math
import re

import pytest

from cognite.client._api.raw import Database, DatabaseList, Row, RowList, Table, TableList
from cognite.client._api.raw import Database, DatabaseList, RawRowsAPI, Row, RowList, Table, TableList
from cognite.client.exceptions import CogniteAPIError
from tests.utils import jsgz_load

Expand Down Expand Up @@ -322,6 +323,50 @@ def test_raw_row__direct_column_access():
del row["wrong-key"]


@pytest.mark.dsl
def test_insert_dataframe_raises_on_duplicated_cols(cognite_client):
    import pandas as pd

    # Duplicated labels cannot be expressed as dict keys, so build the frame
    # from rows and assign the (duplicated) column labels explicitly.
    row_data = [
        [1, 1, 10, 10, 100, 100],
        [2, 2, 20, 20, 200, 200],
        [3, 3, 30, 30, 300, 300],
    ]
    df = pd.DataFrame(row_data, columns=["a", "b", "a", "c", "a", "b"])

    # Only the duplicated labels ("a" and "b") should be reported, sorted.
    with pytest.raises(ValueError, match=r"^Dataframe columns are not unique: \['a', 'b'\]$"):
        cognite_client.raw.rows.insert_dataframe("db", "tbl", df)


@pytest.mark.dsl
def test_df_to_rows_skip_nans():
    import numpy as np
    import pandas as pd

    frame = pd.DataFrame(
        {
            "a": [1, None, 3],
            "b": [1, 2, None],
            "c": [10, 20, 30],
            "d": [math.inf, 20, 30],
            "e": [100, 200, np.nan],
            "f": [None, None, None],
        }
    )
    # "f" is an object column: its None entries should be kept, but a genuine
    # NaN placed into it must still be removed.
    frame.at[1, "f"] = math.nan

    expected = {
        0: {"a": 1.0, "b": 1.0, "c": 10, "d": math.inf, "e": 100.0, "f": None},
        1: {"b": 2.0, "c": 20, "d": 20.0, "e": 200.0},
        2: {"a": 3.0, "c": 30, "d": 30.0, "f": None},
    }
    assert RawRowsAPI._df_to_rows_skip_nans(frame) == expected


@pytest.mark.dsl
class TestPandasIntegration:
def test_dbs_to_pandas(self):
Expand Down