diff --git a/buckaroo/analysis_management.py b/buckaroo/analysis_management.py index d4f421b4..f6aa7da7 100644 --- a/buckaroo/analysis_management.py +++ b/buckaroo/analysis_management.py @@ -65,6 +65,7 @@ class NonExistentSummaryRowException(Exception): 'float': [0.5] *10, 'int': [8] *10, 'negative': [-1] *10, + 'UInt32': pd.Series([5]*10, dtype='UInt32') }) class AnalsysisPipeline(object): @@ -87,7 +88,6 @@ def process_summary_facts_set(self): self.provided_summary_facts_set = set(all_provided) - #all is a special value that will dipslay every row if self.summary_stats_display and not self.summary_stats_display == "all": #verify that we have a way of computing all of the facts we are displaying @@ -106,7 +106,7 @@ def unit_test(self): not implemented yet. This helps interactive development by doing a smoke test on - each new iteration of summary stats function + each new iteration of summary stats function. """ try: diff --git a/buckaroo/auto_clean.py b/buckaroo/auto_clean.py index aeb8c83a..498eca83 100644 --- a/buckaroo/auto_clean.py +++ b/buckaroo/auto_clean.py @@ -56,21 +56,29 @@ def get_object_typing_metadata(ser): counts['datetime_error'] += 1 except (pd.core.tools.datetimes.DateParseError, ValueError, TypeError): counts['datetime_error'] += 1 - try: + + if isinstance(v, bool): + counts['bool'] += 1 + else: + counts['bool_error'] += 1 + if isinstance(v, str): + try: + int(v) + counts['int'] += 1 + except ValueError: + counts['int_error'] += 1 + try: + float(v) + counts['float'] += 1 + except ValueError: + counts['float_error'] += 1 + elif isinstance(v, float) or isinstance(v, int): int(v) counts['int'] += 1 - except ValueError: - counts['int_error'] += 1 - try: float(v) counts['float'] += 1 - except ValueError: - counts['float_error'] += 1 + - if isinstance(v, bool): - counts['bool'] += 1 - else: - counts['bool_error'] += 1 if len(ser) == 0: return counts diff --git a/tests/auto_clean_test.py b/tests/auto_clean_test.py index a4c9c2a0..8db2fbde 100644 --- a/tests/auto_clean_test.py +++ b/tests/auto_clean_test.py @@ -12,6 +12,8 @@ INT_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.75, 'int_error': 0.25, 'float': 0.75, 'float_error': 0.25, 'bool': 0.0, 'bool_error': 1.0} +FULL_INT_META = {'datetime': 0.0, 'datetime_error': 0, 'int': 1, 'int_error': 0, 'float': 0, 'float_error': 0, 'bool': 0.0, 'bool_error': 0, 'exact_type': 'UInt32', 'general_type':'int'} + FLOAT_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.25, 'int_error': 0.75, 'float': 0.75, 'float_error': 0.25, 'bool': 0.0, 'bool_error': 1.0} STRING_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.0, 'int_error': 1.0, 'float': 0.25, 'float_error': 0.75, 'bool': 0.0, 'bool_error': 1.0} @@ -61,6 +63,12 @@ def test_get_typing_metadata(): #there are still problems here, the code isn't properly distinguishing bools from ints and bools assert BOOL_META == ac.get_typing_metadata(pd.Series(['a', 'b', False, True, False])) + assert FULL_INT_META == ac.get_typing_metadata(pd.Series([5]*10, dtype='UInt32')) + + # what does the typing code do on "dtype" objects + ac.get_typing_metadata(pd.Series([pd.Series([5], dtype='UInt32').dtype]*3)) + + #only nans assert ac.get_typing_metadata(pd.Series([None])) == ONLY_NANS_META