Skip to content

Commit

Permalink
mend
Browse files Browse the repository at this point in the history
released 0.3.20
  • Loading branch information
paddymul committed Oct 5, 2023
1 parent 72d8d0b commit 2a0f12e
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
4 changes: 2 additions & 2 deletions buckaroo/analysis_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class NonExistentSummaryRowException(Exception):
'float': [0.5] *10,
'int': [8] *10,
'negative': [-1] *10,
'UInt32': pd.Series([5]*10, dtype='UInt32')
})

class AnalsysisPipeline(object):
Expand All @@ -87,7 +88,6 @@ def process_summary_facts_set(self):

self.provided_summary_facts_set = set(all_provided)


#"all" is a special value that will display every row
if self.summary_stats_display and not self.summary_stats_display == "all":
#verify that we have a way of computing all of the facts we are displaying
Expand All @@ -106,7 +106,7 @@ def unit_test(self):
not implemented yet.
This helps interactive development by doing a smoke test on
each new iteration of summary stats function
each new iteration of summary stats function.
"""
try:
Expand Down
28 changes: 18 additions & 10 deletions buckaroo/auto_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,29 @@ def get_object_typing_metadata(ser):
counts['datetime_error'] += 1
except (pd.core.tools.datetimes.DateParseError, ValueError, TypeError):
counts['datetime_error'] += 1
try:

if isinstance(v, bool):
counts['bool'] += 1
else:
counts['bool_error'] += 1
if isinstance(v, str):
try:
int(v)
counts['int'] += 1
except ValueError:
counts['int_error'] += 1
try:
float(v)
counts['float'] += 1
except ValueError:
counts['float_error'] += 1
elif isinstance(v, float) or isinstance(v, int):
int(v)
counts['int'] += 1
except ValueError:
counts['int_error'] += 1
try:
float(v)
counts['float'] += 1
except ValueError:
counts['float_error'] += 1


if isinstance(v, bool):
counts['bool'] += 1
else:
counts['bool_error'] += 1

if len(ser) == 0:
return counts
Expand Down
8 changes: 8 additions & 0 deletions tests/auto_clean_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

INT_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.75, 'int_error': 0.25, 'float': 0.75, 'float_error': 0.25, 'bool': 0.0, 'bool_error': 1.0}

FULL_INT_META = {'datetime': 0.0, 'datetime_error': 0, 'int': 1, 'int_error': 0, 'float': 0, 'float_error': 0, 'bool': 0.0, 'bool_error': 0, 'exact_type': 'UInt32', 'general_type':'int'}

FLOAT_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.25, 'int_error': 0.75, 'float': 0.75, 'float_error': 0.25, 'bool': 0.0, 'bool_error': 1.0}

STRING_META = {'datetime': 0.0, 'datetime_error': 1.0, 'int': 0.0, 'int_error': 1.0, 'float': 0.25, 'float_error': 0.75, 'bool': 0.0, 'bool_error': 1.0}
Expand Down Expand Up @@ -61,6 +63,12 @@ def test_get_typing_metadata():

#there are still problems here, the code isn't properly distinguishing bools from ints
assert BOOL_META == ac.get_typing_metadata(pd.Series(['a', 'b', False, True, False]))
assert FULL_INT_META == ac.get_typing_metadata(pd.Series([5]*10, dtype='UInt32'))

# what does the typing code do on "dtype" objects?
ac.get_typing_metadata(pd.Series([pd.Series([5], dtype='UInt32').dtype]*3))



#only nans
assert ac.get_typing_metadata(pd.Series([None])) == ONLY_NANS_META
Expand Down

0 comments on commit 2a0f12e

Please sign in to comment.