chore: update metrics validation
Fabiana Clemente authored and fabclmnt committed Oct 28, 2024
1 parent 5bbd589 commit a02ae09
Showing 3 changed files with 48 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/ydata_profiling/profile_report.py
@@ -199,7 +199,7 @@ def __initialize_dataframe(
     ) -> Optional[Union[pd.DataFrame, sDataFrame]]:

         logger.info_def_report(
-            dataframe=type(df), timeseries=report_config.vars.timeseries.active
+            df=df, timeseries=report_config.vars.timeseries.active,
         )

         if (
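Note: the call site now hands the logger the dataframe itself rather than `type(df)`, so the logger can inspect its shape before reducing it to a "pandas"/"spark" label. A minimal sketch of why the object is needed (the `log_shape` helper is hypothetical, not part of the library):

    import pandas as pd

    def log_shape(df) -> None:
        # The object exposes its shape; the bare type alone would not.
        print(type(df).__name__, len(df.columns), len(df))

    log_shape(pd.DataFrame({"a": [1, 2, 3]}))  # prints: DataFrame 1 3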
35 changes: 34 additions & 1 deletion src/ydata_profiling/utils/common.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 from typing import Mapping

+import pandas as pd
 import requests

 from ydata_profiling.version import __version__
@@ -98,7 +99,12 @@ def convert_timestamp_to_datetime(timestamp: int) -> datetime:
     return datetime(1970, 1, 1) + timedelta(seconds=int(timestamp))


-def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
+def analytics_features(dataframe: str,
+                       datatype: str,
+                       report_type: str,
+                       ncols: int,
+                       nrows: int,
+                       dbx: str) -> None:
     endpoint = "https://packages.ydata.ai/ydata-profiling?"
     package_version = __version__

@@ -120,9 +126,36 @@ def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
         f"&python_version={python_version}"
         f"&report_type={report_type}"
         f"&dataframe={dataframe}"
+        f"&ncols={ncols}"
+        f"&nrows={nrows}"
         f"&datatype={datatype}"
         f"&os={platform.system()}"
         f"&gpu={str(gpu_present)}"
+        f"&dbx={dbx}"
     )

     requests.get(request_message)
+
+def is_running_in_databricks():
+    mask = 'DATABRICKS_RUNTIME_VERSION' in os.environ
+    if mask:
+        return os.environ['DATABRICKS_RUNTIME_VERSION']
+    else:
+        return str(mask)
+
+def calculate_nrows(df):
+    """
+    Calculates the approximate number of rows of a Spark dataframe.
+    Returns: int, approximate number of rows
+    """
+    try:
+        n_partitions = df.rdd.getNumPartitions()
+
+        nrows = df.rdd.mapPartitionsWithIndex(
+            lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
+        ).collect()[0] * n_partitions
+    except Exception:
+        nrows = 0  # fall back to 0 when the count cannot be computed from the partitions
+
+    return nrows
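Note: `calculate_nrows` sidesteps a full `count()` by counting only the first partition and multiplying by the number of partitions, so the figure is an estimate that assumes roughly balanced partitions (skewed data will bias it). A standalone sketch of the same idea, assuming a local pyspark installation (variable names are illustrative):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.range(1_000_000).repartition(8)

    n_partitions = df.rdd.getNumPartitions()
    # Count rows in partition 0 only; every other partition contributes 0.
    first_partition_count = df.rdd.mapPartitionsWithIndex(
        lambda idx, part: [sum(1 for _ in part)] if idx == 0 else [0]
    ).collect()[0]
    print(first_partition_count * n_partitions)  # ~1_000_000 when balanced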
18 changes: 13 additions & 5 deletions src/ydata_profiling/utils/logger.py
@@ -6,28 +6,36 @@

 import pandas as pd

-from ydata_profiling.utils.common import analytics_features
+from ydata_profiling.utils.common import (calculate_nrows,
+                                          analytics_features,
+                                          is_running_in_databricks)


 class ProfilingLogger(logging.Logger):
     def __init__(self, name: str, level: int = logging.INFO):
         super().__init__(name, level)

-    def info_def_report(self, dataframe, timeseries: bool) -> None:  # noqa: ANN001
-        if isinstance(dataframe, pd.DataFrame):
+    def info_def_report(self, df, timeseries: bool) -> None:  # noqa: ANN001
+        ncols = len(df.columns) if df is not None else 0
+        if isinstance(df, pd.DataFrame):
             dataframe = "pandas"
             report_type = "regular"
-        elif dataframe is None:
+            nrows = len(df)
+        elif df is None:
             dataframe = "pandas"
             report_type = "compare"
+            nrows = 0
         else:
             dataframe = "spark"
             report_type = "regular"
+            nrows = calculate_nrows(df)

+        dbx = is_running_in_databricks()
         datatype = "timeseries" if timeseries else "tabular"

         analytics_features(
-            dataframe=dataframe, datatype=datatype, report_type=report_type
+            dataframe=dataframe, datatype=datatype, report_type=report_type,
+            nrows=nrows, ncols=ncols, dbx=dbx
         )

         super().info(
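Note: with these changes the analytics ping carries the dataframe flavor, its shape, and the Databricks runtime (`dbx` is the `DATABRICKS_RUNTIME_VERSION` value when present, otherwise `"False"`). A rough sketch of the query string assembled from the fields visible in this diff (values are illustrative, and the real request includes fields hidden by the collapsed hunk):

    # Hypothetical field values; the real ones are gathered at runtime.
    params = {
        "python_version": "3.10",
        "report_type": "regular",
        "dataframe": "pandas",
        "ncols": 12,
        "nrows": 10000,
        "datatype": "tabular",
        "os": "Linux",
        "gpu": "False",
        "dbx": "False",
    }
    query = "&".join(f"{k}={v}" for k, v in params.items())
    print("https://packages.ydata.ai/ydata-profiling?" + query)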
