From 816f1b7f49c58005d1fdba2b720f8ea32ca7f11b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 00:25:09 +0000 Subject: [PATCH 1/3] chore(actions): update sonarsource/sonarqube-scan-action action to v4.1.0 --- .github/workflows/sonarqube.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sonarqube.yaml b/.github/workflows/sonarqube.yaml index 8e8bfd9b2..4fac8c898 100644 --- a/.github/workflows/sonarqube.yaml +++ b/.github/workflows/sonarqube.yaml @@ -25,7 +25,7 @@ jobs: echo "sonar.projectKey=${{ github.event.repository.name }}" > sonar-project.properties - name: SonarQube Scan - uses: sonarsource/sonarqube-scan-action@v4.0.0 + uses: sonarsource/sonarqube-scan-action@v4.1.0 env: SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} From 1e8cb89ab86e69c22d84accb49729ccf377a8030 Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Wed, 4 Dec 2024 09:20:42 -0300 Subject: [PATCH 2/3] fix: typeset invalid dates errors (#1678) * fix: ignore invalid dates during conversion * fix: apply type conversion to user defined types * test: add unit test to invalid date type conversion * fix: add invalid dates to variable info * fix(linting): code formatting * test: update unit tests * fix: rename to_datetime method --- .../model/pandas/describe_date_pandas.py | 22 ++++++++++++++++++- .../model/pandas/summary_pandas.py | 1 + .../report/structure/variables/render_date.py | 10 +++++++++ tests/unit/test_describe.py | 16 ++++++++++++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index 1ff64a50f..72b25a697 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -11,6 +11,13 @@ series_handle_nulls, series_hashable, ) +from
ydata_profiling.model.typeset_relations import is_pandas_1 + + +def to_datetime(series: pd.Series) -> pd.Series: + if is_pandas_1(): + return pd.to_datetime(series, errors="coerce") + return pd.to_datetime(series, format="mixed", errors="coerce") @describe_date_1d.register @@ -29,6 +36,12 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. """ + og_series = series.dropna() + series = to_datetime(og_series) + invalid_values = og_series[series.isna()] + + series = series.dropna() + if summary["value_counts_without_nan"].empty: values = series.values summary.update( @@ -53,5 +66,12 @@ def pandas_describe_date_1d( if config.vars.num.chi_squared_threshold > 0.0: summary["chi_squared"] = chi_square(values) - summary.update(histogram_compute(config, values, summary["n_distinct"])) + summary.update(histogram_compute(config, values, series.nunique())) + summary.update( + { + "invalid_dates": invalid_values.nunique(), + "n_invalid_dates": len(invalid_values), + "p_invalid_dates": len(invalid_values) / summary["n"], + } + ) return config, values, summary diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index bbb401fd0..5d15b2d3c 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -44,6 +44,7 @@ def pandas_describe_1d( and series.name in typeset.type_schema ): vtype = typeset.type_schema[series.name] + elif config.infer_dtypes: # Infer variable types vtype = typeset.infer_type(series) diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index c75a80a5e..1f142daae 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: [ {"name": 
"Minimum", "value": fmt(summary["min"]), "alert": False}, {"name": "Maximum", "value": fmt(summary["max"]), "alert": False}, + { + "name": "Invalid dates", + "value": fmt(summary["n_invalid_dates"]), + "alert": False, + }, + { + "name": "Invalid dates (%)", + "value": fmt_percent(summary["p_invalid_dates"]), + "alert": False, + }, ], style=config.html.style, ) diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 0eb10b7b7..0918be08e 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -582,3 +582,19 @@ def test_describe_list(summarizer, typeset): with pytest.raises(NotImplementedError): describe(config, "", [1, 2, 3], summarizer, typeset) + + +def test_decribe_series_type_schema(config, summarizer): + "Test describe with invalid date types." + typeset = ProfilingTypeSet(config, type_schema={"date": "datetime"}) + data = { + "value": [1, 2, 3, 4], + "date": ["0001-01-01", "9999-12-31", "2022-10-03", "2022-10-04"], + } + df = pd.DataFrame(data) + result = describe(config, df, summarizer, typeset) + + assert result.variables["date"]["type"] == "DateTime" + assert result.variables["date"]["n_missing"] == 0 + assert result.variables["date"]["n_invalid_dates"] == 2 + assert result.variables["date"]["p_invalid_dates"] == 0.5 From f3bc959a5a9bf8c1b95db7ec767063fc4d5904d0 Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Fri, 6 Dec 2024 12:30:47 -0300 Subject: [PATCH 3/3] fix: type schema not checking for empty columns (#1679) * fix: type schema not checking for empty columns * fix: remove alerts unused parameters * fix: indicate user defined type on empty columns * fix(linting): code formatting --------- Co-authored-by: Azory YData Bot --- src/ydata_profiling/model/alerts.py | 4 ++-- .../model/pandas/summary_pandas.py | 24 ++++++++++++++----- .../structure/variables/render_generic.py | 2 +- tests/unit/test_typeset_default.py | 11 +++++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git 
a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 3c2c4fee9..09ae6c0a3 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -634,7 +634,7 @@ def supported_alerts(summary: dict) -> List[Alert]: return alerts -def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]: +def unsupported_alerts() -> List[Alert]: alerts: List[Alert] = [ UnsupportedAlert(), RejectedAlert(), @@ -657,7 +657,7 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List alerts += generic_alerts(description) if description["type"] == "Unsupported": - alerts += unsupported_alerts(description) + alerts += unsupported_alerts() else: alerts += supported_alerts(description) diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index 5d15b2d3c..68e019451 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -16,6 +16,14 @@ from ydata_profiling.utils.dataframe import sort_column_names +def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool: + return ( + isinstance(typeset, ProfilingTypeSet) + and typeset.type_schema + and series in typeset.type_schema + ) + + @describe_1d.register def pandas_describe_1d( config: Settings, @@ -38,11 +46,10 @@ def pandas_describe_1d( # Make sure pd.NA is not in the series series = series.fillna(np.nan) - if ( - isinstance(typeset, ProfilingTypeSet) - and typeset.type_schema - and series.name in typeset.type_schema - ): + has_cast_type = _is_cast_type_defined(typeset, series.name) + cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None + + if has_cast_type and not series.isna().all(): vtype = typeset.type_schema[series.name] elif config.infer_dtypes: @@ -55,7 +62,12 @@ def pandas_describe_1d( vtype = typeset.detect_type(series) typeset.type_schema[series.name] = vtype - return 
summarizer.summarize(config, series, dtype=vtype) + summary = summarizer.summarize(config, series, dtype=vtype) + # Cast type is only used on unsupported columns rendering pipeline + # to indicate the correct variable type when inference is not possible + summary["cast_type"] = cast_type + + return summary @get_series_descriptions.register diff --git a/src/ydata_profiling/report/structure/variables/render_generic.py b/src/ydata_profiling/report/structure/variables/render_generic.py index 0b2e00efb..0a8ce1e55 100644 --- a/src/ydata_profiling/report/structure/variables/render_generic.py +++ b/src/ydata_profiling/report/structure/variables/render_generic.py @@ -12,7 +12,7 @@ def render_generic(config: Settings, summary: dict) -> dict: info = VariableInfo( anchor_id=summary["varid"], alerts=summary["alerts"], - var_type="Unsupported", + var_type=summary["cast_type"] or "Unsupported", var_name=summary["varname"], description=summary["description"], style=config.html.style, diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py index 8d58aeb03..d93d61cb0 100644 --- a/tests/unit/test_typeset_default.py +++ b/tests/unit/test_typeset_default.py @@ -475,3 +475,14 @@ def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict): assert prof.typeset.type_schema[column] == prof.typeset._get_type( type_schema[column] ) + + +def test_type_schema_with_null_column(): + df = pd.DataFrame({"null_col": [None] * 100}) + prof = ProfileReport(df, type_schema={"null_col": "datetime"}) + description = prof.description_set + assert description.variables["null_col"]["type"] == "Unsupported" + + prof = ProfileReport(df, type_schema={"null_col": "numeric"}) + description = prof.description_set + assert description.variables["null_col"]["type"] == "Unsupported"