diff --git a/.travis.yml b/.travis.yml index ad48c9055..459d22b1b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,12 +28,14 @@ env: - TEST=issue PANDAS="<1" - TEST=console PANDAS="<1" - TEST=examples PANDAS="<1" - - TEST=unit PANDAS=">=1" - - TEST=issue PANDAS=">=1" - - TEST=console PANDAS=">=1" - - TEST=examples PANDAS=">=1" - - TEST=lint PANDAS=">=1" - - TEST=typing PANDAS=">=1" + - TEST=unit PANDAS="==1.0.5" + - TEST=issue PANDAS="==1.0.5" + - TEST=unit PANDAS=">=1.1" + - TEST=issue PANDAS=">=1.1" + - TEST=console PANDAS=">=1.1" + - TEST=examples PANDAS=">=1.1" + - TEST=lint PANDAS=">=1.1" + - TEST=typing PANDAS=">=1.1" before_install: - pip install --upgrade pip setuptools wheel diff --git a/README.md b/README.md index e2360a668..f486db83b 100644 --- a/README.md +++ b/README.md @@ -27,32 +27,16 @@ For each column the following statistics - if relevant for the column type - are ## Announcements -### Version v2.8.0 released +### Version v2.9.0 released -News for users working with image datasets: ``pandas-profiling`` now has build-in supports for Files and Images. -Moreover, the text analysis features have also been reworked, providing more informative statistics. +The release candidate for v2.9.0 was already out for a while, now v2.9.0 is finally released. See the changelog below to know what has changed. -For a better feel, have a look at the [examples](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/examples.html#showcasing-specific-features) section in the docs or read the changelog for a complete view of the changes. +### Spark backend in progress -### Version v2.7.0 released +We can happily announce that we're working on a Spark backend for generating profile reports. +Stay tuned. -#### Performance - -There were several performance regressions pointed out to me recently when comparing 1.4.1 to 2.6.0. -To that end, we benchmarked the code and found several minor features introducing disproportionate computational complexity. 
-Version 2.7.0 optimizes these, giving significant performance improvements! -Moreover, the default configuration is tweaked for towards the needs of the average user. - -#### Phased builds and lazy loading - -A report is built in phases, which allows for new exciting features such as caching, only re-rendering partial reports and lazily computing the report. -Moreover, the progress bar provides more information on the building phase and step. - -#### Documentation - -This version introduces [more elaborate documentation](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/index.html) powered by Sphinx. The previously used pdoc3 has been adequate initially, however misses functionality and extensibility. Several recurring topics are now documented, for instance the configuration parameters are documented and there are pages on big datasets, sensitive data, integrations and resources. - -#### Support `pandas-profiling` +### Support `pandas-profiling` The development of ``pandas-profiling`` relies completely on contributions. If you find value in the package, we welcome you to support the project through [GitHub Sponsors](https://github.com/sponsors/sbrugman)! 
@@ -60,18 +44,17 @@ It's extra exciting that GitHub **matches your contribution** for the first year Find more information here: - - [Changelog v2.7.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-7-0) - - [Changelog v2.8.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-8-0) + - [Changelog v2.9.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-9-0) - [Sponsor the project on GitHub](https://github.com/sponsors/sbrugman) - *May 7, 2020 💘* + *September 2, 2020 💘* --- _Contents:_ **[Examples](#examples)** | **[Installation](#installation)** | **[Documentation](#documentation)** | **[Large datasets](#large-datasets)** | **[Command line usage](#command-line-usage)** | -**[Advanced usage](#advanced-usage)** | +**[Advanced usage](#advanced-usage)** | **[Support](#supporting-open-source)** | **[Types](#types)** | **[How to contribute](#contributing)** | **[Editor Integration](#editor-integration)** | **[Dependencies](#dependencies)** @@ -97,7 +80,7 @@ Specific features: * [Orange prices](https://pandas-profiling.github.io/pandas-profiling/examples/master/features/united_report.html) and [Coal prices](https://pandas-profiling.github.io/pandas-profiling/examples/master/features/flatly_report.html) (showcases report themes) Tutorials: -* [Tutorial: report structure using Kaggle data (advanced)](https://pandas-profiling.github.io/pandas-profiling/examples/master/tutorials/modify_report_structure.ipynb) (modify the report's structure) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/pandas-profiling/pandas-profiling/blob/master/examples/kaggle/modify_report_structure.ipynb) 
[![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/pandas-profiling/pandas-profiling/master?filepath=examples%2Fkaggle%2Fmodify_report_structure.ipynb) +* [Tutorial: report structure using Kaggle data (advanced)](https://pandas-profiling.github.io/pandas-profiling/examples/master/tutorials/modify_report_structure.ipynb) (modify the report's structure) [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/pandas-profiling/pandas-profiling/blob/master/examples/tutorials/modify_report_structure.ipynb) [![Binder](https://camo.githubusercontent.com/483bae47a175c24dfbfc57390edd8b6982ac5fb3/68747470733a2f2f6d7962696e6465722e6f72672f62616467655f6c6f676f2e737667)](https://mybinder.org/v2/gh/pandas-profiling/pandas-profiling/master?filepath=examples%2Ftutorials%2Fmodify_report_structure.ipynb) ## Installation @@ -237,19 +220,36 @@ profile = df.profile_report(title='Pandas Profiling Report', plot={'histogram': profile.to_file("output.html") ``` +# Supporting open source + +Maintaining and developing the open-source code for pandas-profiling, with millions of downloads and thousands of users, would not be possible with support of our gracious sponsors. + +
+ + + + | ++ +[Lambda workstations](https://lambdalabs.com/), servers, laptops, and cloud services power engineers and researchers at Fortune 500 companies and 94% of the top 50 universities. [Lambda Cloud](https://lambdalabs.com/service/gpu-cloud) offers 4 & 8 GPU instances starting at $1.50 / hr. Pre-installed with TensorFlow, PyTorch, Ubuntu, CUDA, and cuDNN. + + | +
{{ message.column_name }}
has a high cardinality: {{ message.values['n_unique'] }} distinct values
\ No newline at end of file
+{{ message.column_name }}
has a high cardinality: {{ message.values['n_distinct'] }} distinct values
\ No newline at end of file
diff --git a/src/pandas_profiling/report/structure/overview.py b/src/pandas_profiling/report/structure/overview.py
index b8bb4677c..28cbc9def 100644
--- a/src/pandas_profiling/report/structure/overview.py
+++ b/src/pandas_profiling/report/structure/overview.py
@@ -1,6 +1,7 @@
from typing import Optional
from urllib.parse import quote
+from pandas_profiling.config import config
from pandas_profiling.model.messages import MessageType
from pandas_profiling.report.presentation.core import Container, Table, Warnings
@@ -11,17 +12,17 @@ def get_dataset_overview(summary):
{
"name": "Number of variables",
"value": summary["table"]["n_var"],
- "fmt": "fmt_numeric",
+ "fmt": "fmt_number",
},
{
"name": "Number of observations",
"value": summary["table"]["n"],
- "fmt": "fmt_numeric",
+ "fmt": "fmt_number",
},
{
"name": "Missing cells",
"value": summary["table"]["n_cells_missing"],
- "fmt": "fmt_numeric",
+ "fmt": "fmt_number",
},
{
"name": "Missing cells (%)",
@@ -31,7 +32,7 @@ def get_dataset_overview(summary):
{
"name": "Duplicate rows",
"value": summary["table"]["n_duplicates"],
- "fmt": "fmt_numeric",
+ "fmt": "fmt_number",
},
{
"name": "Duplicate rows (%)",
@@ -68,39 +69,50 @@ def get_dataset_overview(summary):
)
-def get_dataset_schema(metadata) -> Optional[Table]:
- if len(metadata) > 0 or any(len(value) > 0 for value in metadata.values()):
- about_dataset = []
- for key in ["description", "creator", "author", "url"]:
- if key in metadata and len(metadata[key]) > 0:
- about_dataset.append(
- {"name": key.capitalize(), "value": metadata[key], "fmt": "fmt"}
- )
-
- if "copyright_holder" in metadata and len(metadata["copyright_holder"]) > 0:
- if "copyright_year" not in metadata:
- about_dataset.append(
- {
- "name": "Copyright",
- "value": f"(c) {metadata['copyright_holder']}",
- "fmt": "fmt",
- }
- )
- else:
- about_dataset.append(
- {
- "name": "Copyright",
- "value": f"(c) {metadata['copyright_holder']} {metadata['copyright_year']}",
- "fmt": "fmt",
- }
- )
-
- if len(about_dataset) > 0:
- return Table(about_dataset, name="Dataset", anchor_id="metadata_dataset")
- return None
-
-
-def get_dataset_reproduction(summary: dict, metadata: dict):
+def get_dataset_schema(metadata) -> Optional[Container]:
+ about_dataset = []
+ for key in ["description", "creator", "author"]:
+ if key in metadata and len(metadata[key]) > 0:
+ about_dataset.append(
+ {"name": key.capitalize(), "value": metadata[key], "fmt": "fmt"}
+ )
+
+ if "url" in metadata:
+ about_dataset.append(
+ {
+ "name": "URL",
+ "value": f'{metadata["url"]}',
+ "fmt": "raw",
+ }
+ )
+
+ if "copyright_holder" in metadata and len(metadata["copyright_holder"]) > 0:
+ if "copyright_year" not in metadata:
+ about_dataset.append(
+ {
+ "name": "Copyright",
+ "value": f"(c) {metadata['copyright_holder']}",
+ "fmt": "fmt",
+ }
+ )
+ else:
+ about_dataset.append(
+ {
+ "name": "Copyright",
+ "value": f"(c) {metadata['copyright_holder']} {metadata['copyright_year']}",
+ "fmt": "fmt",
+ }
+ )
+
+ return Container(
+ [Table(about_dataset, name="Dataset", anchor_id="metadata_dataset")],
+ name="Dataset",
+ anchor_id="dataset",
+ sequence_type="grid",
+ )
+
+
+def get_dataset_reproduction(summary: dict):
version = summary["package"]["pandas_profiling_version"]
config = quote(summary["package"]["pandas_profiling_config"])
date_start = summary["analysis"]["date_start"]
@@ -124,18 +136,43 @@ def get_dataset_reproduction(summary: dict, metadata: dict):
},
],
name="Reproduction",
- anchor_id="metadata_reproduction",
+ anchor_id="overview_reproduction",
)
- dataset_table = get_dataset_schema(metadata)
+ return Container(
+ [reproduction_table],
+ name="Reproduction",
+ anchor_id="reproduction",
+ sequence_type="grid",
+ )
+
+
+def get_dataset_column_definitions(definitions: dict):
+ """Generate an overview section for the variable description
- items = []
- if dataset_table:
- items.append(dataset_table)
- items.append(reproduction_table)
+ Args:
+ definitions: the variable descriptions.
+
+ Returns:
+ A container object
+ """
+
+ variable_descriptions = [
+ Table(
+ [
+ {"name": column, "value": value, "fmt": "fmt"}
+ for column, value in definitions.items()
+ ],
+ name="Variable descriptions",
+ anchor_id="variable_definition_table",
+ )
+ ]
return Container(
- items, name="Metadata", anchor_id="metadata", sequence_type="grid",
+ variable_descriptions,
+ name="Variables",
+ anchor_id="variable_descriptions",
+ sequence_type="grid",
)
@@ -148,3 +185,39 @@ def get_dataset_warnings(warnings: list) -> Warnings:
]
)
return Warnings(warnings=warnings, name=f"Warnings ({count})", anchor_id="warnings")
+
+
+def get_dataset_items(summary: dict, warnings: list) -> list:
+ """Returns the dataset overview (at the top of the report)
+
+ Args:
+ summary: the calculated summary
+ warnings: the warnings
+
+ Returns:
+ A list with components for the dataset overview (overview, reproduction, warnings)
+ """
+
+ items = [get_dataset_overview(summary)]
+
+ metadata = {
+ key: config["dataset"][key].get(str) for key in config["dataset"].keys()
+ }
+
+ if len(metadata) > 0 and any(len(value) > 0 for value in metadata.values()):
+ items.append(get_dataset_schema(metadata))
+
+ column_details = {
+ key: config["variables"]["descriptions"][key].get(str)
+ for key in config["variables"]["descriptions"].keys()
+ }
+
+ if len(column_details) > 0:
+ items.append(get_dataset_column_definitions(column_details))
+
+ if warnings:
+ items.append(get_dataset_warnings(warnings))
+
+ items.append(get_dataset_reproduction(summary))
+
+ return items
diff --git a/src/pandas_profiling/report/structure/report.py b/src/pandas_profiling/report/structure/report.py
index d1be52661..cdf3fc1aa 100644
--- a/src/pandas_profiling/report/structure/report.py
+++ b/src/pandas_profiling/report/structure/report.py
@@ -33,11 +33,7 @@
from pandas_profiling.report.presentation.core.renderable import Renderable
from pandas_profiling.report.presentation.core.root import Root
from pandas_profiling.report.structure.correlations import get_correlation_items
-from pandas_profiling.report.structure.overview import (
- get_dataset_overview,
- get_dataset_reproduction,
- get_dataset_warnings,
-)
+from pandas_profiling.report.structure.overview import get_dataset_items
from pandas_profiling.report.structure.variables import (
render_boolean,
render_categorical,
@@ -126,12 +122,13 @@ def render_variables_section(dataframe_summary: dict) -> list:
}
descriptions = config["variables"]["descriptions"].get(dict)
+ show_description = config["show_variable_description"].get(bool)
template_variables = {
"varname": idx,
"varid": hash(idx),
"warnings": warnings,
- "description": descriptions.get(idx, ""),
+ "description": descriptions.get(idx, "") if show_description else "",
"warn_fields": warning_fields,
}
@@ -183,6 +180,23 @@ def get_duplicates_items(duplicates: pd.DataFrame):
return items
+def get_definition_items(definitions: pd.DataFrame):
+ """Create the list of duplicates items
+
+ Args:
+ definitions: DataFrame of column definitions
+
+ Returns:
+ List of column definitions to show in the interface.
+ """
+ items = []
+ if definitions is not None and len(definitions) > 0:
+ items.append(
+ Duplicate(duplicate=definitions, name="Columns", anchor_id="definitions",)
+ )
+ return items
+
+
def get_sample_items(sample: dict):
"""Create the list of sample items
@@ -245,36 +259,10 @@ def clean_name(name):
return titems
-def get_dataset_items(summary: dict, warnings: list) -> list:
- """Returns the dataset overview (at the top of the report)
-
- Args:
- summary: the calculated summary
- warnings: the warnings
-
- Returns:
- A list with components for the dataset overview (overview, reproduction, warnings)
- """
- metadata = {
- key: config["dataset"][key].get(str) for key in config["dataset"].keys()
- }
-
- items = [
- get_dataset_overview(summary),
- get_dataset_reproduction(summary, metadata),
- ]
-
- if warnings:
- items.append(get_dataset_warnings(warnings))
-
- return items
-
-
def get_report_structure(summary: dict) -> Renderable:
"""Generate a HTML report from summary statistics and a given sample.
Args:
- sample: A dict containing the samples to print.
summary: Statistics to use for the overview, variables, correlations and missing values.
Returns:
diff --git a/src/pandas_profiling/report/structure/variables/render_boolean.py b/src/pandas_profiling/report/structure/variables/render_boolean.py
index 0e7e73202..659eed0fe 100644
--- a/src/pandas_profiling/report/structure/variables/render_boolean.py
+++ b/src/pandas_profiling/report/structure/variables/render_boolean.py
@@ -32,16 +32,16 @@ def render_boolean(summary):
table = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
- "alert": "n_unique" in summary["warn_fields"],
+ "alert": "n_distinct" in summary["warn_fields"],
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
- "alert": "p_unique" in summary["warn_fields"],
+ "alert": "p_distinct" in summary["warn_fields"],
},
{
"name": "Missing",
diff --git a/src/pandas_profiling/report/structure/variables/render_categorical.py b/src/pandas_profiling/report/structure/variables/render_categorical.py
index 7de2b4bf2..320e41b6c 100644
--- a/src/pandas_profiling/report/structure/variables/render_categorical.py
+++ b/src/pandas_profiling/report/structure/variables/render_categorical.py
@@ -1,4 +1,5 @@
from pandas_profiling.config import config
+from pandas_profiling.report.formatters import help
from pandas_profiling.report.presentation.core import (
Container,
FrequencyTable,
@@ -12,6 +13,44 @@
from pandas_profiling.visualisation.plot import histogram, pie_plot
+def render_categorical_frequency(summary, varid, image_format):
+ frequency_table = Table(
+ [
+ {
+ "name": "Unique",
+ "value": f"{summary['n_unique']} {help('The number of unique values (all values that occur exactly once in the dataset).')}",
+ "fmt": "raw",
+ "alert": "n_unique" in summary["warn_fields"],
+ },
+ {
+ "name": "Unique (%)",
+ "value": summary["p_unique"],
+ "fmt": "fmt_percent",
+ "alert": "p_unique" in summary["warn_fields"],
+ },
+ ],
+ name="Unique",
+ anchor_id=f"{varid}uniquenessstats",
+ )
+
+ frequencies = Image(
+ histogram(*summary["histogram_frequencies"]),
+ image_format=image_format,
+ alt="frequencies histogram",
+ name="Frequencies histogram",
+ caption="Frequencies of value counts",
+ anchor_id=f"{varid}frequencies",
+ )
+
+ frequency_tab = Container(
+ [frequencies, frequency_table],
+ anchor_id=f"{varid}tbl",
+ name="Overview",
+ sequence_type="grid",
+ )
+ return frequency_tab
+
+
def render_categorical_length(summary, varid, image_format):
length_table = Table(
[
@@ -47,8 +86,9 @@ def render_categorical_length(summary, varid, image_format):
length = Image(
histogram(*summary["histogram_length"]),
image_format=image_format,
- alt="Scatter",
+ alt="length histogram",
name="Length",
+ caption="Histogram of lengths of the category",
anchor_id=f"{varid}length",
)
@@ -157,21 +197,21 @@ def render_categorical_unicode(summary, varid, redact):
"alert": False,
},
{
- "name": 'Unique unicode categories (?)',
- "value": summary["n_category"],
- "fmt": "fmt_numeric",
+ "name": "Unique unicode categories",
+ "value": f"{summary['n_category']} {help(title='Unicode categories (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_character_property#General_Category')}",
+ "fmt": "raw",
"alert": False,
},
{
- "name": 'Unique unicode scripts (?)',
- "value": summary["n_scripts"],
- "fmt": "fmt_numeric",
+ "name": "Unique unicode scripts",
+ "value": f"{summary['n_scripts']} {help(title='Unicode scripts (click for more information)', url='https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode')}",
+ "fmt": "raw",
"alert": False,
},
{
- "name": 'Unique unicode blocks (?)',
- "value": summary["n_block_alias"],
- "fmt": "fmt_numeric",
+ "name": "Unique unicode blocks",
+ "value": f"{summary['n_block_alias']} {help(title='Unicode blocks (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_block')}",
+ "fmt": "raw",
"alert": False,
},
],
@@ -245,16 +285,16 @@ def render_categorical(summary):
table = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
- "alert": "n_unique" in summary["warn_fields"],
+ "alert": "n_distinct" in summary["warn_fields"],
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
- "alert": "p_unique" in summary["warn_fields"],
+ "alert": "p_distinct" in summary["warn_fields"],
},
{
"name": "Missing",
@@ -288,19 +328,19 @@ def render_categorical(summary):
template_variables["top"] = Container([info, table, fqm], sequence_type="grid")
- # Bottom
- items = [
+ citems = [
FrequencyTable(
template_variables["freq_table_rows"],
name="Common Values",
anchor_id=f"{varid}common_values",
redact=redact,
- )
+ ),
+ render_categorical_frequency(summary, varid, image_format),
]
max_unique = config["plot"]["pie"]["max_unique"].get(int)
- if max_unique > 0 and summary["n_unique"] <= max_unique:
- items.append(
+ if max_unique > 0 and summary["n_distinct"] <= max_unique:
+ citems.append(
Image(
pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
image_format=image_format,
@@ -310,6 +350,16 @@ def render_categorical(summary):
)
)
+ # Bottom
+ items = [
+ Container(
+ citems,
+ name="Frequencies",
+ anchor_id=f"{varid}frequencies",
+ sequence_type="tabs",
+ ),
+ ]
+
check_length = config["vars"]["cat"]["length"].get(bool)
if check_length:
items.append(render_categorical_length(summary, varid, image_format))
diff --git a/src/pandas_profiling/report/structure/variables/render_complex.py b/src/pandas_profiling/report/structure/variables/render_complex.py
index 155a25ec7..5220d3257 100644
--- a/src/pandas_profiling/report/structure/variables/render_complex.py
+++ b/src/pandas_profiling/report/structure/variables/render_complex.py
@@ -25,8 +25,12 @@ def render_complex(summary):
table1 = Table(
[
- {"name": "Distinct count", "value": summary["n_unique"], "fmt": "fmt"},
- {"name": "Unique (%)", "value": summary["p_unique"], "fmt": "fmt_percent"},
+ {"name": "Distinct", "value": summary["n_distinct"], "fmt": "fmt"},
+ {
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
+ "fmt": "fmt_percent",
+ },
{"name": "Missing", "value": summary["n_missing"], "fmt": "fmt"},
{
"name": "Missing (%)",
diff --git a/src/pandas_profiling/report/structure/variables/render_count.py b/src/pandas_profiling/report/structure/variables/render_count.py
index b1759625c..97593c96b 100644
--- a/src/pandas_profiling/report/structure/variables/render_count.py
+++ b/src/pandas_profiling/report/structure/variables/render_count.py
@@ -28,14 +28,14 @@ def render_count(summary):
table1 = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
"alert": False,
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
"alert": False,
},
diff --git a/src/pandas_profiling/report/structure/variables/render_date.py b/src/pandas_profiling/report/structure/variables/render_date.py
index 8f98dcd0e..9f9f9c299 100644
--- a/src/pandas_profiling/report/structure/variables/render_date.py
+++ b/src/pandas_profiling/report/structure/variables/render_date.py
@@ -27,14 +27,14 @@ def render_date(summary):
table1 = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
"alert": False,
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
"alert": False,
},
diff --git a/src/pandas_profiling/report/structure/variables/render_real.py b/src/pandas_profiling/report/structure/variables/render_real.py
index a6494a90d..b1767e740 100644
--- a/src/pandas_profiling/report/structure/variables/render_real.py
+++ b/src/pandas_profiling/report/structure/variables/render_real.py
@@ -32,16 +32,16 @@ def render_real(summary):
table1 = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
- "alert": "n_unique" in summary["warn_fields"],
+ "alert": "n_distinct" in summary["warn_fields"],
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
- "alert": "p_unique" in summary["warn_fields"],
+ "alert": "p_distinct" in summary["warn_fields"],
},
{
"name": "Missing",
diff --git a/src/pandas_profiling/report/structure/variables/render_url.py b/src/pandas_profiling/report/structure/variables/render_url.py
index 6120c665c..760b78ab9 100644
--- a/src/pandas_profiling/report/structure/variables/render_url.py
+++ b/src/pandas_profiling/report/structure/variables/render_url.py
@@ -88,28 +88,28 @@ def render_url(summary):
table = Table(
[
{
- "name": "Distinct count",
- "value": summary["n_unique"],
+ "name": "Distinct",
+ "value": summary["n_distinct"],
"fmt": "fmt",
- "alert": False,
+ "alert": "n_distinct" in summary["warn_fields"],
},
{
- "name": "Unique (%)",
- "value": summary["p_unique"],
+ "name": "Distinct (%)",
+ "value": summary["p_distinct"],
"fmt": "fmt_percent",
- "alert": False,
+ "alert": "p_distinct" in summary["warn_fields"],
},
{
"name": "Missing",
"value": summary["n_missing"],
"fmt": "fmt",
- "alert": False,
+ "alert": "n_missing" in summary["warn_fields"],
},
{
"name": "Missing (%)",
"value": summary["p_missing"],
"fmt": "fmt_percent",
- "alert": False,
+ "alert": "p_missing" in summary["warn_fields"],
},
{
"name": "Memory size",
diff --git a/src/pandas_profiling/version.py b/src/pandas_profiling/version.py
index c89eaa3ef..26e6cae09 100644
--- a/src/pandas_profiling/version.py
+++ b/src/pandas_profiling/version.py
@@ -1,2 +1,2 @@
"""This file is auto-generated by setup.py, please do not alter."""
-__version__ = "2.9.0rc1"
+__version__ = "2.9.0"
diff --git a/src/pandas_profiling/visualisation/plot.py b/src/pandas_profiling/visualisation/plot.py
index fc28491a4..44d13e34f 100644
--- a/src/pandas_profiling/visualisation/plot.py
+++ b/src/pandas_profiling/visualisation/plot.py
@@ -1,4 +1,5 @@
"""Plot functions for the profiling report."""
+import copy
from typing import Optional, Union
import numpy as np
@@ -157,6 +158,7 @@ def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
cmap = plt.get_cmap(cmap_name)
if vmin == 0:
cmap = get_cmap_half(cmap)
+ cmap = copy.copy(cmap)
cmap.set_bad(cmap_bad)
labels = data.columns
@@ -265,11 +267,13 @@ def scatter_pairwise(series1, series2, x_label, y_label) -> str:
color = config["html"]["style"]["primary_color"].get(str)
scatter_threshold = config["plot"]["scatter_threshold"].get(int)
+ indices = (series1.notna()) & (series2.notna())
+
if len(series1) > scatter_threshold:
cmap = sns.light_palette(color, as_cmap=True)
- plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=cmap)
+ plt.hexbin(series1[indices], series2[indices], gridsize=15, cmap=cmap)
else:
- plt.scatter(series1.tolist(), series2.tolist(), color=color)
+ plt.scatter(series1[indices], series2[indices], color=color)
return plot_360_n0sc0pe(plt)
diff --git a/tests/issues/test_issue523.py b/tests/issues/test_issue523.py
new file mode 100644
index 000000000..3e087e9e9
--- /dev/null
+++ b/tests/issues/test_issue523.py
@@ -0,0 +1,34 @@
+"""
+Test for issue 523:
+https://github.com/pandas-profiling/pandas-profiling/issues/523
+"""
+import pandas as pd
+import pytest
+
+from pandas_profiling import ProfileReport
+
+
+@pytest.mark.skipif(
+ int(pd.__version__.split(".")[0]) < 1, reason="requires pandas 1 or higher"
+)
+def test_issue523():
+ # https://github.com/pandas-dev/pandas/issues/33803
+
+ data = [
+ 1871248,
+ 12522551,
+ 1489260,
+ 6657093,
+ pd.NA,
+ pd.NA,
+ pd.NA,
+ pd.NA,
+ pd.NA,
+ 1489260,
+ pd.NA,
+ 2468576,
+ ]
+ df = pd.DataFrame({"col": data}, dtype=pd.Int64Dtype())
+
+ profile_report = ProfileReport(df, title="Test Report")
+ assert len(profile_report.to_html()) > 0
diff --git a/tests/issues/test_issue537.py b/tests/issues/test_issue537.py
new file mode 100644
index 000000000..501d12357
--- /dev/null
+++ b/tests/issues/test_issue537.py
@@ -0,0 +1,126 @@
+"""
+Test for issue 537:
+https://github.com/pandas-profiling/pandas-profiling/issues/537
+
+ValueError: shape mismatch: value array of shape (136,) could not be broadcast to indexing result of shape (135,)
+
+Problem :
+ValueError is raised when running ProfileReport on large datasets and with multiprocessing on (pool_size >1).
+This is likely due to the series.fillna(np.nan, inplace=True) in summary.py seems to be performing multiple in-place
+mutations to the underlying DataFrame object through the passed series reference, resulting in some kind of race
+condition where two of the processes try to write to the DataFrame at the same time and the ValueError then occurs.
+ This is also why changing the pool_size to 1 fixes the issue, and why the error doesn't always occur -
+ you probably need enough data and threads to hit the race condition.
+
+Solution :
+Replace series.fillna(np.nan, inplace=True) with series = series.fillna(np.nan) , negating any side effects from mutation.
+
+
+"""
+
+import multiprocessing
+from gzip import decompress
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+import requests
+
+from pandas_profiling.model.summary import describe_1d
+
+
+def mock_multiprocess_1d(args) -> Tuple[str, dict]:
+ """Wrapper to process series in parallel.
+ copy of multiprocess_1d function in get_series_descriptions, summary.py
+
+ Args:
+ column: The name of the column.
+ series: The series values.
+
+ Returns:
+ A tuple with column and the series description.
+ """
+ column, series = args
+ return column, describe_1d(series)
+
+
+def test_multiprocessing_describe1d():
+ """
+    this test serves to get a large dataset, and ensure that even across parallelised describe_1d operations,
+ there is no ValueError raised. Previously, series.fillna(np.nan,inplace=True) was used instead of
+    series = series.fillna(np.nan) in model.summary.describe_1d, resulting in a race condition where the underlying
+ df was being mutated by two threads at the same time creating a ValueError. This test checks that this does not
+    occur again by running a parallelised describe_1d and testing if a ValueError is raised.
+
+ """
+
+ def download_and_process_data():
+ response = requests.get("https://ndownloader.figshare.com/files/5976042")
+ assert response.status_code == 200
+ file = decompress(response.content)
+ text = file.decode()
+ split_text = [i.split(",") for i in filter(lambda x: x, text.split("\n"))]
+ dt = [
+ ("duration", int),
+ ("protocol_type", "S4"),
+ ("service", "S11"),
+ ("flag", "S6"),
+ ("src_bytes", int),
+ ("dst_bytes", int),
+ ("land", int),
+ ("wrong_fragment", int),
+ ("urgent", int),
+ ("hot", int),
+ ("num_failed_logins", int),
+ ("logged_in", int),
+ ("num_compromised", int),
+ ("root_shell", int),
+ ("su_attempted", int),
+ ("num_root", int),
+ ("num_file_creations", int),
+ ("num_shells", int),
+ ("num_access_files", int),
+ ("num_outbound_cmds", int),
+ ("is_host_login", int),
+ ("is_guest_login", int),
+ ("count", int),
+ ("srv_count", int),
+ ("serror_rate", float),
+ ("srv_serror_rate", float),
+ ("rerror_rate", float),
+ ("srv_rerror_rate", float),
+ ("same_srv_rate", float),
+ ("diff_srv_rate", float),
+ ("srv_diff_host_rate", float),
+ ("dst_host_count", int),
+ ("dst_host_srv_count", int),
+ ("dst_host_same_srv_rate", float),
+ ("dst_host_diff_srv_rate", float),
+ ("dst_host_same_src_port_rate", float),
+ ("dst_host_srv_diff_host_rate", float),
+ ("dst_host_serror_rate", float),
+ ("dst_host_srv_serror_rate", float),
+ ("dst_host_rerror_rate", float),
+ ("dst_host_srv_rerror_rate", float),
+ ("labels", "S16"),
+ ]
+ DT = np.dtype(dt)
+ split_text = np.asarray(split_text, dtype=object)
+ for j in range(42):
+ split_text[:, j] = split_text[:, j].astype(DT[j])
+ df = pd.DataFrame(split_text)
+ return df
+
+ def run_multiprocess(df):
+ pool = multiprocessing.pool.ThreadPool(10)
+ args = [(column, series) for column, series in df.iteritems()]
+ results = pool.imap_unordered(mock_multiprocess_1d, args)
+ pool.close()
+ pool.join()
+ list(results)
+
+ try:
+ df = download_and_process_data()
+ run_multiprocess(df)
+ except ValueError:
+ raise Exception("myFunc() raised ValueError unexpectedly!")
diff --git a/tests/issues/test_issue545.py b/tests/issues/test_issue545.py
new file mode 100644
index 000000000..1dc97a121
--- /dev/null
+++ b/tests/issues/test_issue545.py
@@ -0,0 +1,29 @@
+"""
+Test for issue 545:
+https://github.com/pandas-profiling/pandas-profiling/issues/545
+"""
+
+import pandas as pd
+import pytest
+
+import pandas_profiling
+
+
+def pandas_version():  # version as an int tuple, e.g. (1, 1, 0), for ordered comparison
+ return tuple(int(s) for s in pd.__version__.split(".") if s.isdigit())  # skip non-numeric parts ("1.1.0rc1", ".dev0") which would make int() raise ValueError
+
+
+@pytest.mark.skipif(
+ pandas_version() <= (1, 1, 0), reason="requires pandas 1.1.1 or higher"
+)
+def test_issue545(get_data_file):  # regression test: profiling this pickled frame must render end-to-end
+ file_name = get_data_file(  # project fixture: downloads the file and returns a local path
+ "sample_eda_df.pkl",
+ "https://github.com/justinsola/justinsola.github.com/raw/master/files/sample_eda_df.pkl",
+ )
+
+ sample_eda_df = pd.read_pickle(str(file_name))  # NOTE(review): unpickling a downloaded file executes arbitrary code - acceptable only because the source is trusted
+ sample_profile = sample_eda_df.profile_report(
+ title="Sample Profiling Report", explorative=True, pool_size=1
+ )
+ assert len(sample_profile.to_html()) > 0  # report builds and renders without raising
diff --git a/tests/unit/test_dataset_schema.py b/tests/unit/test_dataset_schema.py
index 72932481d..42d357063 100644
--- a/tests/unit/test_dataset_schema.py
+++ b/tests/unit/test_dataset_schema.py
@@ -23,10 +23,11 @@ def test_dataset_schema():
assert "Dataset
" in html for key in metadata.keys(): - if not key.startswith("copyright_"): + if not key.startswith("copyright_") and not key == "url": assert f"Reproduction
" in html + assert 'Reproduction
" in html def test_dataset_schema_empty(): diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 32142b958..7928cbb61 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -14,29 +14,30 @@ testdata = [ # Unique values - (pd.Series([1, 2]), True, 1), + (pd.Series([1, 2]), True, 1, 1), # Unique values including nan - (pd.Series([np.nan]), None, None), + (pd.Series([np.nan]), None, None, None), # Unique values all nan - (pd.Series([1, 2, np.nan]), True, 1), + (pd.Series([1, 2, np.nan]), True, 1, 1), # Non unique values - (pd.Series([1, 2, 2]), False, 2 / 3), + (pd.Series([1, 2, 2]), False, 2 / 3, 1 / 3), # Non unique nan - (pd.Series([1, np.nan, np.nan]), True, 1), + (pd.Series([1, np.nan, np.nan]), True, 1, 1), # Non unique values including nan - (pd.Series([1, 2, 2, np.nan]), False, 2 / 3), + (pd.Series([1, 2, 2, np.nan]), False, 2 / 3, 1 / 3), # Non unique values including non unique nan - (pd.Series([1, 2, 2, np.nan, np.nan]), False, 2 / 3), + (pd.Series([1, 2, 2, np.nan, np.nan]), False, 2 / 3, 1 / 3), ] -@pytest.mark.parametrize("data,is_unique,p_unique", testdata) -def test_describe_unique(data, is_unique, p_unique): +@pytest.mark.parametrize("data,is_unique,p_distinct,p_unique", testdata) +def test_describe_unique(data, is_unique, p_distinct, p_unique): """Test the unique feature of 1D data""" desc_1d = describe_1d(data) if is_unique is not None: assert desc_1d["p_unique"] == p_unique, "Describe 1D p_unique incorrect" + assert desc_1d["p_distinct"] == p_distinct, "Describe 1D p_distinct incorrect" assert desc_1d["is_unique"] == is_unique, "Describe 1D should return unique" @@ -163,7 +164,7 @@ def expected_results(): "95%": check_is_NaN, "count": 9, "cv": check_is_NaN, - "distinct_count": 8, + "n_distinct": 8, "freq": 2, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -176,7 +177,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 0, "p_missing": 0.0, - "p_unique": 0.88888888, + 
"p_distinct": 0.88888888, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -196,7 +197,7 @@ def expected_results(): "n_infinite": 0, "p_infinite": 0, "cv": 1.771071190261633, - "distinct_count": 6, + "n_distinct": 6, "freq": check_is_NaN, "iqr": 24.5, "is_unique": False, @@ -207,7 +208,7 @@ def expected_results(): "min": -10.0, "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 6 / 8, + "p_distinct": 6 / 8, "n": 9, "n_zeros": 2, "p_zeros": 0.2222222222222222, @@ -229,7 +230,7 @@ def expected_results(): "n_infinite": 0, "p_infinite": 0, "cv": 2.2112992878833846, - "distinct_count": 8, + "n_distinct": 8, "freq": check_is_NaN, "iqr": 236.66299975000001, "is_unique": True, @@ -240,7 +241,7 @@ def expected_results(): "min": -3.1415926535000001, "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 1, + "p_distinct": 1, "n_zeros": 0, "p_zeros": 0.0, "range": 3125.1415926535001, @@ -259,7 +260,7 @@ def expected_results(): "95%": check_is_NaN, "count": 8, "cv": check_is_NaN, - "distinct_count": 6, + "n_distinct": 6, "freq": 3, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -272,7 +273,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 6 / 8, + "p_distinct": 6 / 8, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -290,7 +291,7 @@ def expected_results(): "95%": check_is_NaN, "count": 9, "cv": check_is_NaN, - "distinct_count": 1, + "n_distinct": 1, "freq": 9, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -303,7 +304,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 0, "p_missing": 0.0, - "p_unique": 0.1111111111111111, + "p_distinct": 0.1111111111111111, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -321,7 +322,7 @@ def expected_results(): "95%": check_is_NaN, "count": 9, "cv": check_is_NaN, - "distinct_count": 1, + "n_distinct": 1, "freq": 9, "histogram": 
check_is_NaN, "iqr": check_is_NaN, @@ -334,7 +335,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 0, "p_missing": 0.0, - "p_unique": 0.1111111111111111, + "p_distinct": 0.1111111111111111, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -352,7 +353,7 @@ def expected_results(): "95%": check_is_NaN, "count": 8, "cv": check_is_NaN, - "distinct_count": 5, + "n_distinct": 5, "freq": check_is_NaN, "iqr": check_is_NaN, "is_unique": False, @@ -363,7 +364,7 @@ def expected_results(): "min": datetime.datetime(1898, 1, 2), "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 5 / 8, + "p_distinct": 5 / 8, "p_zeros": check_is_NaN, "range": datetime.timedelta(45289, hours=13, minutes=57), "skewness": check_is_NaN, @@ -380,7 +381,7 @@ def expected_results(): "95%": check_is_NaN, "count": 9, "cv": check_is_NaN, - "distinct_count": 2, + "n_distinct": 2, "freq": 6, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -392,7 +393,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 0, "p_missing": 0, - "p_unique": 2 / 9, + "p_distinct": 2 / 9, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -410,7 +411,7 @@ def expected_results(): "95%": check_is_NaN, "count": 8, "cv": check_is_NaN, - "distinct_count": 2, + "n_distinct": 2, "freq": 5, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -422,7 +423,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 2 / 8, + "p_distinct": 2 / 8, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -440,7 +441,7 @@ def expected_results(): "95%": check_is_NaN, "count": 9, "cv": check_is_NaN, - "distinct_count": 2, + "n_distinct": 2, "freq": 5, "histogram": check_is_NaN, "iqr": check_is_NaN, @@ -452,7 +453,7 @@ def expected_results(): "mini_histogram": check_is_NaN, "n_missing": 0, "p_missing": 0, - "p_unique": 2 / 9, + "p_distinct": 2 / 9, 
"p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN, @@ -470,7 +471,7 @@ def expected_results(): "95%": check_is_NaN, "count": 8, "cv": check_is_NaN, - "distinct_count": 2, + "n_distinct": 2, "freq": 4, "iqr": check_is_NaN, "is_unique": False, @@ -480,7 +481,7 @@ def expected_results(): "min": check_is_NaN, "n_missing": 1, "p_missing": 0.11111111111111116, - "p_unique": 2 / 8, + "p_distinct": 2 / 8, "p_zeros": check_is_NaN, "range": check_is_NaN, "skewness": check_is_NaN,