chore: update metrics validation
Fabiana Clemente authored and committed Oct 28, 2024
1 parent fc97cc6 commit b2f985c
Showing 3 changed files with 48 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/ydata_profiling/profile_report.py
@@ -199,7 +199,7 @@ def __initialize_dataframe(
     ) -> Optional[Union[pd.DataFrame, sDataFrame]]:

         logger.info_def_report(
-            dataframe=type(df), timeseries=report_config.vars.timeseries.active
+            df=df, timeseries=report_config.vars.timeseries.active,
         )

         if (
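For review context, a hedged sketch of what this one-line change means at the call site. The frame is a made-up example, and `logger` stands for the package's `ProfilingLogger` instance (defined in utils/logger.py, changed below):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    # Before: only the frame's type reached the logger.
    # logger.info_def_report(dataframe=type(df), timeseries=False)

    # After: the frame itself is forwarded, so info_def_report can derive
    # the row and column counts as well as the frame flavor.
    logger.info_def_report(df=df, timeseries=False)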
35 changes: 34 additions & 1 deletion src/ydata_profiling/utils/common.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 from typing import Mapping

+import pandas as pd
 import requests

 from ydata_profiling.version import __version__
@@ -98,7 +99,12 @@ def convert_timestamp_to_datetime(timestamp: int) -> datetime:
     return datetime(1970, 1, 1) + timedelta(seconds=int(timestamp))


-def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
+def analytics_features(dataframe: str,
+                       datatype: str,
+                       report_type: str,
+                       ncols: int,
+                       nrows: int,
+                       dbx: str) -> None:
     endpoint = "https://packages.ydata.ai/ydata-profiling?"
     package_version = __version__

@@ -120,9 +126,36 @@ def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
f"&python_version={python_version}"
f"&report_type={report_type}"
f"&dataframe={dataframe}"
f"&ncols={ncols}"
f"&nrows={nrows}"
f"&datatype={datatype}"
f"&os={platform.system()}"
f"&gpu={str(gpu_present)}"
f"&dbx={dbx}"
)

requests.get(request_message)

+def is_running_in_databricks() -> str:
+    """Return the Databricks runtime version when running on Databricks,
+    or the string "False" otherwise."""
+    if "DATABRICKS_RUNTIME_VERSION" in os.environ:
+        return os.environ["DATABRICKS_RUNTIME_VERSION"]
+    return str(False)

+def calculate_nrows(df):
+    """Estimate the number of rows of a Spark DataFrame.
+
+    Counts only the first partition and extrapolates by the number of
+    partitions, avoiding a full count over the dataset.
+
+    Returns: int, approximate number of rows (0 when it cannot be computed).
+    """
+    try:
+        n_partitions = df.rdd.getNumPartitions()
+
+        # Count the rows of partition 0 and scale by the partition count;
+        # exact only when partitions are evenly sized.
+        nrows = df.rdd.mapPartitionsWithIndex(
+            lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
+        ).collect()[0] * n_partitions
+    except Exception:
+        # 0 signals that the size could not be derived from the partitions.
+        nrows = 0
+
+    return nrows
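For context, a rough illustration of the query string the extended `analytics_features` assembles. All values are invented, and the leading parameters are collapsed in the diff above, so only the visible tail is shown:

    # Illustrative only; the three parameters added by this commit are
    # ncols, nrows and dbx.
    request_message = (
        "https://packages.ydata.ai/ydata-profiling?"
        "...&python_version=3.11"
        "&report_type=regular"
        "&dataframe=pandas"
        "&ncols=12"          # new: column count
        "&nrows=48210"       # new: (approximate) row count
        "&datatype=tabular"
        "&os=Linux"
        "&gpu=False"
        "&dbx=False"         # new: Databricks runtime version, or "False"
    )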
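And a hedged usage sketch for `calculate_nrows`, assuming a local pyspark installation. Because only the first partition is counted and scaled up, the result is exact only when partitions are evenly sized:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[2]").getOrCreate()
    df = spark.range(1000).repartition(4)  # four roughly equal partitions

    # first-partition count (~250) * 4 partitions ≈ 1000
    print(calculate_nrows(df))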
18 changes: 13 additions & 5 deletions src/ydata_profiling/utils/logger.py
@@ -6,28 +6,36 @@

 import pandas as pd

-from ydata_profiling.utils.common import analytics_features
+from ydata_profiling.utils.common import (
+    analytics_features,
+    calculate_nrows,
+    is_running_in_databricks,
+)


 class ProfilingLogger(logging.Logger):
     def __init__(self, name: str, level: int = logging.INFO):
         super().__init__(name, level)

-    def info_def_report(self, dataframe, timeseries: bool) -> None:  # noqa: ANN001
-        if isinstance(dataframe, pd.DataFrame):
+    def info_def_report(self, df, timeseries: bool) -> None:  # noqa: ANN001
+        # Guard the column count: df is None in the compare-report case.
+        ncols = len(df.columns) if df is not None else 0
+        if isinstance(df, pd.DataFrame):
             dataframe = "pandas"
             report_type = "regular"
+            nrows = len(df)
+        elif df is None:
             dataframe = "pandas"
             report_type = "compare"
+            nrows = 0  # the compared frames are not available here
         else:
             dataframe = "spark"
             report_type = "regular"
+            nrows = calculate_nrows(df)

+        dbx = is_running_in_databricks()
         datatype = "timeseries" if timeseries else "tabular"

         analytics_features(
-            dataframe=dataframe, datatype=datatype, report_type=report_type
+            dataframe=dataframe,
+            datatype=datatype,
+            report_type=report_type,
+            nrows=nrows,
+            ncols=ncols,
+            dbx=dbx,
         )

         super().info(
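To close the loop, a minimal end-to-end sketch of the updated logger path. The logger name is invented, and note that the call may issue the telemetry GET shown above:

    import pandas as pd
    from ydata_profiling.utils.logger import ProfilingLogger

    logger = ProfilingLogger(name="ydata_profiling_example")
    df = pd.DataFrame({"a": [1, 2, 3]})

    # Routes analytics_features(dataframe="pandas", datatype="tabular",
    # report_type="regular", nrows=3, ncols=1, dbx=...) under the hood.
    logger.info_def_report(df=df, timeseries=False)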
