Skip to content

Commit

Permalink
Release 0.3.26 (#78)
Browse files Browse the repository at this point in the history
Release 0.3.26 fixes some bugs which prevented rendering and adds a good full tutorial

* Added example-notebooks/Full-Tour.ipynb a documented tour of buckaroo features.
* Added working google colab link to Full-Tour.ipynb (Fixes #17)
* Fixes Stacktrace on dataframes without a numeric column (#52)
* Fixes add_analysis reproduce code is duplicated (#51)
* Fixes Quiet=False sometimes fails (#50)
  • Loading branch information
paddymul authored Oct 26, 2023
1 parent c5a721a commit 02ac7b9
Show file tree
Hide file tree
Showing 12 changed files with 788 additions and 105 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ We all know how awkward it is to clean data in jupyter notebooks. Multiple cell

![Buckaroo Screenshot](https://raw.githubusercontent.com/paddymul/buckaroo-assets/main/quick-buckaroo.gif)

## Try it today

* [Buckaroo full tour](https://github.com/paddymul/buckaroo/blob/main/example-notebooks/Full-tour.ipynb) — a documented tour of Buckaroo's features [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/paddymul/buckaroo/blob/chore/colab-play/example-notebooks/Full-tour.ipynb)

## Installation
## Installation

If using JupyterLab, `buckaroo` requires JupyterLab version 3 or higher.
Expand Down
2 changes: 1 addition & 1 deletion buckaroo/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) Bloomberg.
# Distributed under the terms of the Modified BSD License.

__version__ = "0.3.25"
__version__ = "0.3.26"
8 changes: 8 additions & 0 deletions buckaroo/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,25 @@
from pandas.io.json import dumps as pdumps
import numpy as np
from buckaroo.pluggable_analysis_framework import ColAnalysis
import warnings

def probable_datetime(ser):
    """Heuristically decide whether *ser* likely holds datetime-like values.

    Samples up to 500 elements and attempts ``pd.to_datetime`` on them.
    Returns True only when conversion succeeds AND the max parsed value is
    on/after 1973-01-01 — this guards against plain integers being misread
    as nanosecond epochs near 1970, e.g.
    ``pd.to_datetime(1_00_000_000_000_000_000) == pd.to_datetime('1973-01-01')``.

    Parameters
    ----------
    ser : pd.Series
        Column to test; any dtype.

    Returns
    -------
    bool
    """
    s_ser = ser.sample(min(len(ser), 500))
    # Suppressing warnings here is deliberate: we are explicitly abusing
    # pd.to_datetime, which is noisy.  catch_warnings() restores the
    # caller's filters even on exception — unlike the previous
    # filterwarnings('ignore') / filterwarnings('default') pairing, which
    # leaked state and clobbered any user-configured warning filters.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            dt_ser = pd.to_datetime(s_ser)
            if dt_ser.max() < pd.to_datetime('1973-01-01'):
                return False
            return True
        except Exception:
            return False

def get_mode(ser):
Expand Down
130 changes: 90 additions & 40 deletions buckaroo/analysis_management.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sys
import traceback

import numpy as np
import pandas as pd
import traceback
from buckaroo.pluggable_analysis_framework import (
ColAnalysis, order_analysis, check_solvable, NotProvidedException)
from buckaroo.serialization_utils import pd_py_serialize, pick, d_update
Expand Down Expand Up @@ -28,13 +30,63 @@
'max_digits':None,
'histogram': []}

def reproduce_summary(ser_name, kls, summary_df):
summary_ser = summary_df[ser_name]


def get_df_name(df, level=0):
    """Walk up the call stack looking for a global variable bound to *df*.

    Returns the first variable name found whose value *is* ``df``
    (identity, not equality), or None when no frame within the first ~60
    stack levels has one.  Used so printed reproduction code can refer to
    the user's own dataframe variable by name.
    """
    if level == 0:
        # level 0 means "start here": this module's own globals.
        _globals = globals()
    elif level < 60:
        try:
            call_frame = sys._getframe(level)
            _globals = call_frame.f_globals
        except ValueError:
            return None  # we went too far up the stacktrace to a non-existent frame
    else:
        # give up after 60 levels rather than walking an arbitrarily deep stack
        return None

    name_possibs = [x for x in _globals.keys() if _globals[x] is df]
    if name_possibs:
        return name_possibs[0]
    else:
        # +2, not +1: each recursive call pushes one extra get_df_name frame
        # onto the stack, so advancing one caller frame costs two levels.
        return get_df_name(df, level + 2)

def safe_summary_df(base_summary_df, index_list):
    """Return *base_summary_df* rebuilt against exactly *index_list*.

    Not every index of a summary dataframe is guaranteed to exist —
    columns with no valid data never produce those rows.  Reconstructing
    the frame with the full desired index makes the missing rows exist
    explicitly, filled with NaN.
    """
    desired_index = index_list
    return pd.DataFrame(base_summary_df, index=desired_index)

def reproduce_summary(ser_name, kls, summary_df, err, operating_df_name):
    """Print a single line of code that reproduces a failed summary call.

    Parameters
    ----------
    ser_name : str
        Name of the series (column) whose summary computation failed.
    kls : ColAnalysis subclass
        The analysis class whose ``summary`` raised.
    summary_df : pd.DataFrame
        Summary stats computed so far; only ``kls.requires_summary`` rows
        are serialized into the snippet.
    err : Exception
        The original error, echoed as a trailing comment.
    operating_df_name : str
        Variable name of the dataframe, used verbatim in the printed code.
    """
    # safe_summary_df guards against requires_summary rows that were never
    # produced (no valid data) — they come back as NaN instead of KeyError.
    ssdf = safe_summary_df(summary_df, kls.requires_summary)
    summary_ser = ssdf[ser_name]
    minimal_summary_dict = pick(summary_ser, kls.requires_summary)
    sum_ser_repr = "pd.Series(%s)" % pd_py_serialize(minimal_summary_dict)

    # Emit exactly one snippet per failure.  The old extra print that
    # hard-coded PERVERSE_DF produced duplicated reproduce code with the
    # wrong dataframe name (issue #51).
    f = "{kls}.summary({df_name}['{ser_name}'], {summary_ser_repr}, {df_name}['{ser_name}']) # {err_msg}"
    print(f.format(
        kls=kls.cname(), df_name=operating_df_name, ser_name=ser_name,
        summary_ser_repr=sum_ser_repr, err_msg=err))

def output_reproduce_preamble():
    """Emit the fixed header lines that precede printed reproduction snippets."""
    header_lines = (
        "#Reproduction code",
        "#" + "-" * 80,
        "from buckaroo.analysis_management import PERVERSE_DF",
    )
    for header_line in header_lines:
        print(header_line)

def output_full_reproduce(errs, summary_df, df_name):
    """Print a reproduction snippet for every column error in *errs*.

    Parameters
    ----------
    errs : dict
        Maps series name -> (exception, ColAnalysis subclass), as
        collected by produce_summary_df.
    summary_df : pd.DataFrame
        The (possibly partial) summary stats frame built before failing.
    df_name : str
        Variable name of the dataframe, used verbatim in printed code.

    Raises
    ------
    Exception
        When called with an empty *errs* mapping — that is a caller bug.
    """
    if not errs:
        raise Exception("output_full_reproduce called with 0 len errs")

    try:
        for ser_name, (err, kls) in errs.items():
            reproduce_summary(ser_name, kls, summary_df, err, df_name)
    except Exception:
        # This is tricky diagnostic code that shouldn't error; if it does,
        # print the traceback rather than letting it mask the original
        # failure.
        traceback.print_exc()




def produce_summary_df(df, ordered_objs, df_name='test_df'):
Expand Down Expand Up @@ -68,7 +120,6 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
except Exception as e:
if not a_kls.quiet:
errs[ser_name] = e, a_kls
#traceback.print_exc()
continue
summary_col_dict[ser_name] = summary_ser

Expand All @@ -77,19 +128,6 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
BASE_COL_HINT.keys())
summary_df = pd.DataFrame(summary_col_dict)
table_hints = table_hint_col_dict
if errs:
for ser_name, err_kls in errs.items():
err, kls = err_kls
print("%r failed on %s with %r" % (kls, ser_name, err))

print("Reproduction code")
print("-" * 80)
print("from buckaroo.analysis_management import PERVERSE_DF")
for ser_name, err_kls in errs.items():
err, kls = err_kls
reproduce_summary(ser_name, kls, summary_df)
print("-" * 80)

return summary_df, table_hints, errs

class NonExistentSummaryRowException(Exception):
Expand Down Expand Up @@ -126,8 +164,6 @@ def verify_analysis_objects(self, analysis_objects):
self.ordered_a_objs = order_analysis(analysis_objects)
check_solvable(self.ordered_a_objs)
self.process_summary_facts_set()
if self.unit_test_objs:
self.unit_test()

def unit_test(self):
"""test a single, or each col_analysis object with a set of commonly difficult series.
Expand All @@ -140,17 +176,16 @@ def unit_test(self):
try:
output_df, table_hint_dict, errs = produce_summary_df(PERVERSE_DF, self.ordered_a_objs)
if len(errs) == 0:
return True
return False
except Exception as e:
print("analysis pipeline unit_test failed")
print(e)
return False
return True, []
else:
return False, errs
except KeyError:
pass


def process_df(self, input_df):
output_df, table_hint_dict, errs = produce_summary_df(input_df, self.ordered_a_objs)
return output_df, table_hint_dict
return output_df, table_hint_dict, errs

def add_analysis(self, new_aobj):
new_cname = new_aobj.cname()
Expand All @@ -161,24 +196,27 @@ def add_analysis(self, new_aobj):
new_aobj_set.append(aobj)
new_aobj_set.append(new_aobj)
self.verify_analysis_objects(new_aobj_set)
if not self.unit_test():
print("new analysis fails unit tests")
return False
return True
passed_unit_test, errs = self.unit_test()
if passed_unit_test is False:
return False, errs
return True, []


class DfStats(object):
'''
DfStats exists to handle inteligent downampling and applying the ColAnalysis functions
'''

def __init__(self, df, col_analysis_objs):
self.df = self.get_operating_df(df, force_full_eval=False)
def __init__(self, df_stats_df, col_analysis_objs, operating_df_name=None):
self.df = self.get_operating_df(df_stats_df, force_full_eval=False)
self.col_order = self.df.columns
self.ap = AnalsysisPipeline(col_analysis_objs)
self.sdf, self.table_hints = self.ap.process_df(self.df)

self.operating_df_name = operating_df_name

self.sdf, self.table_hints, errs = self.ap.process_df(self.df)
if errs:
output_full_reproduce(errs, self.sdf, operating_df_name)

def get_operating_df(self, df, force_full_eval):
rows = len(df)
cols = len(df.columns)
Expand All @@ -193,9 +231,21 @@ def get_operating_df(self, df, force_full_eval):
def presentation_sdf(self):
if self.ap.summary_stats_display == "all":
return self.sdf
return self.sdf.loc[self.ap.summary_stats_display]
return safe_summary_df(self.sdf, self.ap.summary_stats_display)

def add_analysis(self, a_obj):
self.ap.add_analysis(a_obj)
self.sdf, self.table_hints = self.ap.process_df(self.df)

passed_unit_tests, ut_errs = self.ap.add_analysis(a_obj)
self.sdf, self.table_hints, errs = self.ap.process_df(self.df)
if passed_unit_tests == False:
print("Unit tests failed")
if errs:
print("Errors on original dataframe")

if ut_errs or errs:
output_reproduce_preamble()
if ut_errs:
ut_summary_df, _unused_table_hint_dict, ut_errs2 = produce_summary_df(
PERVERSE_DF, self.ap.ordered_a_objs)
output_full_reproduce(ut_errs, ut_summary_df, "PERVERSE_DF")
if errs:
output_full_reproduce(errs, self.sdf, self.operating_df_name)
6 changes: 4 additions & 2 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from .down_sample import sample

from .analysis import (TypingStats, DefaultSummaryStats, ColDisplayHints)
from .analysis_management import DfStats
from .analysis_management import DfStats, get_df_name

from .serialization_utils import df_to_obj, EMPTY_DF_OBJ

Expand Down Expand Up @@ -104,6 +104,7 @@ def __init__(self, df,
self.postProcessingF = postProcessingF
self.processed_result = None
self.transformed_df = None
self.df_name = get_df_name(df)

self.setup_from_command_kls_list()
self.dfConfig = self.get_df_config(df, sampled, reorderdColumns, showCommands)
Expand All @@ -113,6 +114,7 @@ def __init__(self, df,

warnings.filterwarnings('default')


def run_autoclean(self, autoType):
if autoType:
# this will trigger the setting of self.typed_df
Expand Down Expand Up @@ -202,7 +204,7 @@ def get_working_df(self):
def set_typed_df(self, new_df):
self.typed_df = new_df
# stats need to be rerun each time
self.stats = DfStats(self.typed_df, [TypingStats, DefaultSummaryStats, ColDisplayHints])
self.stats = DfStats(self.typed_df, [TypingStats, DefaultSummaryStats, ColDisplayHints], self.df_name)
self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order)
self.update_based_on_df_config(3)

Expand Down
Loading

0 comments on commit 02ac7b9

Please sign in to comment.