From a02ae09c7a74ff851d28468cf263663320f9b65f Mon Sep 17 00:00:00 2001 From: Fabiana Clemente Date: Mon, 28 Oct 2024 17:20:10 +0000 Subject: [PATCH] chore: update metrics validation --- src/ydata_profiling/profile_report.py | 2 +- src/ydata_profiling/utils/common.py | 35 ++++++++++++++++++++++++++- src/ydata_profiling/utils/logger.py | 18 ++++++++++---- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 2f382f568..db3dc02de 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -199,7 +199,7 @@ def __initialize_dataframe( ) -> Optional[Union[pd.DataFrame, sDataFrame]]: logger.info_def_report( - dataframe=type(df), timeseries=report_config.vars.timeseries.active + df=df, timeseries=report_config.vars.timeseries.active, ) if ( diff --git a/src/ydata_profiling/utils/common.py b/src/ydata_profiling/utils/common.py index 0cd8d018d..68f039ea9 100644 --- a/src/ydata_profiling/utils/common.py +++ b/src/ydata_profiling/utils/common.py @@ -12,6 +12,7 @@ from pathlib import Path from typing import Mapping +import pandas as pd import requests from ydata_profiling.version import __version__ @@ -98,7 +99,12 @@ def convert_timestamp_to_datetime(timestamp: int) -> datetime: return datetime(1970, 1, 1) + timedelta(seconds=int(timestamp)) -def analytics_features(dataframe: str, datatype: str, report_type: str) -> None: +def analytics_features(dataframe: str, + datatype: str, + report_type: str, + ncols: int, + nrows:int, + dbx: str) -> None: endpoint = "https://packages.ydata.ai/ydata-profiling?" 
package_version = __version__ @@ -120,9 +126,36 @@ def analytics_features(dataframe: str, datatype: str, report_type: str) -> None: f"&python_version={python_version}" f"&report_type={report_type}" f"&dataframe={dataframe}" + f"&ncols={ncols}" + f"&nrows={nrows}" f"&datatype={datatype}" f"&os={platform.system()}" f"&gpu={str(gpu_present)}" + f"&dbx={dbx}" ) requests.get(request_message) + +def is_running_in_databricks(): + mask = 'DATABRICKS_RUNTIME_VERSION' in os.environ + if 'DATABRICKS_RUNTIME_VERSION' in os.environ: + return os.environ['DATABRICKS_RUNTIME_VERSION'] + else: + return str(mask) + +def calculate_nrows(df): + """ + Calculates the approx. number of rows spark dataframes + + Returns: int, approximate number of rows + """ + try: + n_partitions = df.rdd.getNumPartitions() + + nrows = df.rdd.mapPartitionsWithIndex( + lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0] + ).collect()[0] * n_partitions + except Exception: + nrows = 0 # returns 0 in case it was not possible to compute it from the partition + + return nrows diff --git a/src/ydata_profiling/utils/logger.py b/src/ydata_profiling/utils/logger.py index 18a51b1d3..b684aa6c4 100644 --- a/src/ydata_profiling/utils/logger.py +++ b/src/ydata_profiling/utils/logger.py @@ -6,28 +6,36 @@ import pandas as pd -from ydata_profiling.utils.common import analytics_features +from ydata_profiling.utils.common import (calculate_nrows, + analytics_features, + is_running_in_databricks) class ProfilingLogger(logging.Logger): def __init__(self, name: str, level: int = logging.INFO): super().__init__(name, level) - def info_def_report(self, dataframe, timeseries: bool) -> None: # noqa: ANN001 - if isinstance(dataframe, pd.DataFrame): + def info_def_report(self, df, timeseries: bool) -> None: # noqa: ANN001 + ncols = len(df.columns) if df is not None else 0 + if isinstance(df, pd.DataFrame): dataframe = "pandas" report_type = "regular" - elif dataframe is None: + nrows=len(df) + elif df is None: dataframe = "pandas" report_type =
"compare" + nrows=len(df) else: dataframe = "spark" report_type = "regular" + nrows=calculate_nrows(df) + dbx=is_running_in_databricks() datatype = "timeseries" if timeseries else "tabular" analytics_features( - dataframe=dataframe, datatype=datatype, report_type=report_type + dataframe=dataframe, datatype=datatype, report_type=report_type, + nrows=nrows, ncols=ncols, dbx=dbx ) super().info(