Skip to content

Commit

Permalink
Merge branch 'develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-ferdman authored Dec 9, 2024
2 parents d31fff5 + f3bc959 commit 9175022
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sonarqube.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
echo "sonar.projectKey=${{ github.event.repository.name }}" > sonar-project.properties
- name: SonarQube Scan
uses: sonarsource/sonarqube-scan-action@v4.0.0
uses: sonarsource/sonarqube-scan-action@v4.1.0
env:
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
4 changes: 2 additions & 2 deletions src/ydata_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def supported_alerts(summary: dict) -> List[Alert]:
return alerts


def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
def unsupported_alerts() -> List[Alert]:
alerts: List[Alert] = [
UnsupportedAlert(),
RejectedAlert(),
Expand All @@ -657,7 +657,7 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
alerts += generic_alerts(description)

if description["type"] == "Unsupported":
alerts += unsupported_alerts(description)
alerts += unsupported_alerts()
else:
alerts += supported_alerts(description)

Expand Down
22 changes: 21 additions & 1 deletion src/ydata_profiling/model/pandas/describe_date_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
series_handle_nulls,
series_hashable,
)
from ydata_profiling.model.typeset_relations import is_pandas_1


def to_datetime(series: pd.Series) -> pd.Series:
    """Convert ``series`` to datetimes, coercing unparseable entries.

    ``errors="coerce"`` is always passed to ``pd.to_datetime``.
    ``format="mixed"`` is added only when ``is_pandas_1()`` is false
    (presumably because pandas 1.x does not accept that option -- confirm
    against the supported pandas versions).

    Args:
        series: The series whose values should be parsed as datetimes.

    Returns:
        The result of ``pd.to_datetime`` for ``series``.
    """
    parse_options = {"errors": "coerce"}
    if not is_pandas_1():
        parse_options["format"] = "mixed"
    return pd.to_datetime(series, **parse_options)


@describe_date_1d.register
Expand All @@ -29,6 +36,12 @@ def pandas_describe_date_1d(
Returns:
A dict containing calculated series description values.
"""
og_series = series.dropna()
series = to_datetime(og_series)
invalid_values = og_series[series.isna()]

series = series.dropna()

if summary["value_counts_without_nan"].empty:
values = series.values
summary.update(
Expand All @@ -53,5 +66,12 @@ def pandas_describe_date_1d(
if config.vars.num.chi_squared_threshold > 0.0:
summary["chi_squared"] = chi_square(values)

summary.update(histogram_compute(config, values, summary["n_distinct"]))
summary.update(histogram_compute(config, values, series.nunique()))
summary.update(
{
"invalid_dates": invalid_values.nunique(),
"n_invalid_dates": len(invalid_values),
"p_invalid_dates": len(invalid_values) / summary["n"],
}
)
return config, values, summary
25 changes: 19 additions & 6 deletions src/ydata_profiling/model/pandas/summary_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
from ydata_profiling.utils.dataframe import sort_column_names


def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
    """Tell whether the user supplied an explicit type for a column.

    Args:
        typeset: The typeset in use. Only a ``ProfilingTypeSet`` is checked
            for a ``type_schema``; any other typeset yields ``False``.
        series: Name of the column to look up (the caller passes
            ``series.name``), not the series itself.

    Returns:
        ``True`` when ``typeset`` is a ``ProfilingTypeSet`` with a non-empty
        ``type_schema`` that contains ``series``; ``False`` otherwise.
    """
    # A bare ``and`` chain returns its first falsy operand, which here could be
    # ``None`` or an empty schema rather than a boolean. Wrap it in ``bool`` so
    # the function honours its ``-> bool`` annotation.
    return bool(
        isinstance(typeset, ProfilingTypeSet)
        and typeset.type_schema
        and series in typeset.type_schema
    )


@describe_1d.register
def pandas_describe_1d(
config: Settings,
Expand All @@ -38,12 +46,12 @@ def pandas_describe_1d(
# Make sure pd.NA is not in the series
series = series.fillna(np.nan)

if (
isinstance(typeset, ProfilingTypeSet)
and typeset.type_schema
and series.name in typeset.type_schema
):
has_cast_type = _is_cast_type_defined(typeset, series.name)
cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None

if has_cast_type and not series.isna().all():
vtype = typeset.type_schema[series.name]

elif config.infer_dtypes:
# Infer variable types
vtype = typeset.infer_type(series)
Expand All @@ -54,7 +62,12 @@ def pandas_describe_1d(
vtype = typeset.detect_type(series)

typeset.type_schema[series.name] = vtype
return summarizer.summarize(config, series, dtype=vtype)
summary = summarizer.summarize(config, series, dtype=vtype)
# The cast type is only used by the rendering pipeline for unsupported columns,
# to show the user-requested variable type when inference is not possible
summary["cast_type"] = cast_type

return summary


@get_series_descriptions.register
Expand Down
10 changes: 10 additions & 0 deletions src/ydata_profiling/report/structure/variables/render_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
[
{"name": "Minimum", "value": fmt(summary["min"]), "alert": False},
{"name": "Maximum", "value": fmt(summary["max"]), "alert": False},
{
"name": "Invalid dates",
"value": fmt(summary["n_invalid_dates"]),
"alert": False,
},
{
"name": "Invalid dates (%)",
"value": fmt_percent(summary["p_invalid_dates"]),
"alert": False,
},
],
style=config.html.style,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def render_generic(config: Settings, summary: dict) -> dict:
info = VariableInfo(
anchor_id=summary["varid"],
alerts=summary["alerts"],
var_type="Unsupported",
var_type=summary["cast_type"] or "Unsupported",
var_name=summary["varname"],
description=summary["description"],
style=config.html.style,
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,19 @@ def test_describe_list(summarizer, typeset):

with pytest.raises(NotImplementedError):
describe(config, "", [1, 2, 3], summarizer, typeset)


def test_decribe_series_type_schema(config, summarizer):
    """Describe a schema-typed datetime column that holds invalid dates.

    The ``date`` column is forced to ``datetime`` through ``type_schema``.
    Two of its four values are expected to be reported as invalid dates
    (presumably the first two, which look out of range for pandas
    timestamps) while none is counted as missing.
    """
    typeset = ProfilingTypeSet(config, type_schema={"date": "datetime"})
    frame = pd.DataFrame(
        {
            "value": [1, 2, 3, 4],
            "date": ["0001-01-01", "9999-12-31", "2022-10-03", "2022-10-04"],
        }
    )

    description = describe(config, frame, summarizer, typeset)
    date_summary = description.variables["date"]

    assert date_summary["type"] == "DateTime"
    assert date_summary["n_missing"] == 0
    assert date_summary["n_invalid_dates"] == 2
    assert date_summary["p_invalid_dates"] == 0.5
11 changes: 11 additions & 0 deletions tests/unit/test_typeset_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,14 @@ def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict):
assert prof.typeset.type_schema[column] == prof.typeset._get_type(
type_schema[column]
)


def test_type_schema_with_null_column():
    """An all-null column stays ``Unsupported`` whatever type the schema requests."""
    df = pd.DataFrame({"null_col": [None] * 100})

    # The same expectation holds for every requested cast type.
    for requested_type in ("datetime", "numeric"):
        report = ProfileReport(df, type_schema={"null_col": requested_type})
        variables = report.description_set.variables
        assert variables["null_col"]["type"] == "Unsupported"

0 comments on commit 9175022

Please sign in to comment.