Skip to content

Commit

Permalink
Release 0.3.26 (#78)
Browse files Browse the repository at this point in the history
Release 0.3.26 fixes some bugs which prevented rendering and adds a good full tutorial

* Added example-notebooks/Full-Tour.ipynb a documented tour of buckaroo features.
* Added working google colab link to Full-Tour.ipynb (Fixes #17)
* Fixes Stacktrace on dataframes without a numeric column (#52)
* Fixes add_analysis reproduce code is duplicated (#51)
* Fixes Quiet=False sometimes fails (#50)
  • Loading branch information
paddymul authored Oct 26, 2023
1 parent c5a721a commit 02ac7b9
Show file tree
Hide file tree
Showing 12 changed files with 788 additions and 105 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ We all know how awkward it is to clean data in jupyter notebooks. Multiple cell

![Buckaroo Screenshot](https://raw.githubusercontent.com/paddymul/buckaroo-assets/main/quick-buckaroo.gif)

## Try it today

* [Buckaroo full tour](https://github.com/paddymul/buckaroo/blob/main/example-notebooks/Full-tour.ipynb) — a documented tour of Buckaroo's features [![Open In Colab](https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/paddymul/buckaroo/blob/chore/colab-play/example-notebooks/Full-tour.ipynb)

## Installation
## Installation

If using JupyterLab, `buckaroo` requires JupyterLab version 3 or higher.
Expand Down
2 changes: 1 addition & 1 deletion buckaroo/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) Bloomberg.
# Distributed under the terms of the Modified BSD License.

__version__ = "0.3.25"
__version__ = "0.3.26"
8 changes: 8 additions & 0 deletions buckaroo/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,25 @@
from pandas.io.json import dumps as pdumps
import numpy as np
from buckaroo.pluggable_analysis_framework import ColAnalysis
import warnings

def probable_datetime(ser):
    """Heuristically decide whether *ser* likely holds datetime-like values.

    Samples up to 500 elements and attempts ``pd.to_datetime`` on them.
    Returns True only when conversion succeeds AND the max parsed value is
    on/after 1973-01-01 — this guards against plain integers being misread
    as nanosecond epochs near 1970, e.g.
    ``pd.to_datetime(1_00_000_000_000_000_000) == pd.to_datetime('1973-01-01')``.

    Parameters
    ----------
    ser : pd.Series
        Column to test; any dtype.

    Returns
    -------
    bool
    """
    s_ser = ser.sample(min(len(ser), 500))
    # Suppressing warnings here is deliberate: we are explicitly abusing
    # pd.to_datetime, which is noisy.  catch_warnings() restores the
    # caller's filters even on exception — unlike the previous
    # filterwarnings('ignore') / filterwarnings('default') pairing, which
    # leaked state and clobbered any user-configured warning filters.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            dt_ser = pd.to_datetime(s_ser)
            if dt_ser.max() < pd.to_datetime('1973-01-01'):
                return False
            return True
        except Exception:
            return False

def get_mode(ser):
Expand Down
130 changes: 90 additions & 40 deletions buckaroo/analysis_management.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sys
import traceback

import numpy as np
import pandas as pd
import traceback
from buckaroo.pluggable_analysis_framework import (
ColAnalysis, order_analysis, check_solvable, NotProvidedException)
from buckaroo.serialization_utils import pd_py_serialize, pick, d_update
Expand Down Expand Up @@ -28,13 +30,63 @@
'max_digits':None,
'histogram': []}

def reproduce_summary(ser_name, kls, summary_df):
summary_ser = summary_df[ser_name]


def get_df_name(df, level=0):
    """Walk up the call stack looking for a global variable bound to *df*.

    Returns the first variable name found whose value *is* ``df``
    (identity, not equality), or None when no frame within the first ~60
    stack levels has one.  Used so printed reproduction code can refer to
    the user's own dataframe variable by name.
    """
    if level == 0:
        # level 0 means "start here": this module's own globals.
        _globals = globals()
    elif level < 60:
        try:
            call_frame = sys._getframe(level)
            _globals = call_frame.f_globals
        except ValueError:
            return None  # we went too far up the stacktrace to a non-existent frame
    else:
        # give up after 60 levels rather than walking an arbitrarily deep stack
        return None

    name_possibs = [x for x in _globals.keys() if _globals[x] is df]
    if name_possibs:
        return name_possibs[0]
    else:
        # +2, not +1: each recursive call pushes one extra get_df_name frame
        # onto the stack, so advancing one caller frame costs two levels.
        return get_df_name(df, level + 2)

def safe_summary_df(base_summary_df, index_list):
    """Return *base_summary_df* rebuilt against exactly *index_list*.

    Not every index of a summary dataframe is guaranteed to exist —
    columns with no valid data never produce those rows.  Reconstructing
    the frame with the full desired index makes the missing rows exist
    explicitly, filled with NaN.
    """
    desired_index = index_list
    return pd.DataFrame(base_summary_df, index=desired_index)

def reproduce_summary(ser_name, kls, summary_df, err, operating_df_name):
    """Print a single line of code that reproduces a failed summary call.

    Parameters
    ----------
    ser_name : str
        Name of the series (column) whose summary computation failed.
    kls : ColAnalysis subclass
        The analysis class whose ``summary`` raised.
    summary_df : pd.DataFrame
        Summary stats computed so far; only ``kls.requires_summary`` rows
        are serialized into the snippet.
    err : Exception
        The original error, echoed as a trailing comment.
    operating_df_name : str
        Variable name of the dataframe, used verbatim in the printed code.
    """
    # safe_summary_df guards against requires_summary rows that were never
    # produced (no valid data) — they come back as NaN instead of KeyError.
    ssdf = safe_summary_df(summary_df, kls.requires_summary)
    summary_ser = ssdf[ser_name]
    minimal_summary_dict = pick(summary_ser, kls.requires_summary)
    sum_ser_repr = "pd.Series(%s)" % pd_py_serialize(minimal_summary_dict)

    # Emit exactly one snippet per failure.  The old extra print that
    # hard-coded PERVERSE_DF produced duplicated reproduce code with the
    # wrong dataframe name (issue #51).
    f = "{kls}.summary({df_name}['{ser_name}'], {summary_ser_repr}, {df_name}['{ser_name}']) # {err_msg}"
    print(f.format(
        kls=kls.cname(), df_name=operating_df_name, ser_name=ser_name,
        summary_ser_repr=sum_ser_repr, err_msg=err))

def output_reproduce_preamble():
    """Emit the fixed header lines that precede printed reproduction snippets."""
    header_lines = (
        "#Reproduction code",
        "#" + "-" * 80,
        "from buckaroo.analysis_management import PERVERSE_DF",
    )
    for header_line in header_lines:
        print(header_line)

def output_full_reproduce(errs, summary_df, df_name):
    """Print a reproduction snippet for every column error in *errs*.

    Parameters
    ----------
    errs : dict
        Maps series name -> (exception, ColAnalysis subclass), as
        collected by produce_summary_df.
    summary_df : pd.DataFrame
        The (possibly partial) summary stats frame built before failing.
    df_name : str
        Variable name of the dataframe, used verbatim in printed code.

    Raises
    ------
    Exception
        When called with an empty *errs* mapping — that is a caller bug.
    """
    if not errs:
        raise Exception("output_full_reproduce called with 0 len errs")

    try:
        for ser_name, (err, kls) in errs.items():
            reproduce_summary(ser_name, kls, summary_df, err, df_name)
    except Exception:
        # This is tricky diagnostic code that shouldn't error; if it does,
        # print the traceback rather than letting it mask the original
        # failure.
        traceback.print_exc()




def produce_summary_df(df, ordered_objs, df_name='test_df'):
Expand Down Expand Up @@ -68,7 +120,6 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
except Exception as e:
if not a_kls.quiet:
errs[ser_name] = e, a_kls
#traceback.print_exc()
continue
summary_col_dict[ser_name] = summary_ser

Expand All @@ -77,19 +128,6 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
BASE_COL_HINT.keys())
summary_df = pd.DataFrame(summary_col_dict)
table_hints = table_hint_col_dict
if errs:
for ser_name, err_kls in errs.items():
err, kls = err_kls
print("%r failed on %s with %r" % (kls, ser_name, err))

print("Reproduction code")
print("-" * 80)
print("from buckaroo.analysis_management import PERVERSE_DF")
for ser_name, err_kls in errs.items():
err, kls = err_kls
reproduce_summary(ser_name, kls, summary_df)
print("-" * 80)

return summary_df, table_hints, errs

class NonExistentSummaryRowException(Exception):
Expand Down Expand Up @@ -126,8 +164,6 @@ def verify_analysis_objects(self, analysis_objects):
self.ordered_a_objs = order_analysis(analysis_objects)
check_solvable(self.ordered_a_objs)
self.process_summary_facts_set()
if self.unit_test_objs:
self.unit_test()

def unit_test(self):
"""test a single, or each col_analysis object with a set of commonly difficult series.
Expand All @@ -140,17 +176,16 @@ def unit_test(self):
try:
output_df, table_hint_dict, errs = produce_summary_df(PERVERSE_DF, self.ordered_a_objs)
if len(errs) == 0:
return True
return False
except Exception as e:
print("analysis pipeline unit_test failed")
print(e)
return False
return True, []
else:
return False, errs
except KeyError:
pass


def process_df(self, input_df):
output_df, table_hint_dict, errs = produce_summary_df(input_df, self.ordered_a_objs)
return output_df, table_hint_dict
return output_df, table_hint_dict, errs

def add_analysis(self, new_aobj):
new_cname = new_aobj.cname()
Expand All @@ -161,24 +196,27 @@ def add_analysis(self, new_aobj):
new_aobj_set.append(aobj)
new_aobj_set.append(new_aobj)
self.verify_analysis_objects(new_aobj_set)
if not self.unit_test():
print("new analysis fails unit tests")
return False
return True
passed_unit_test, errs = self.unit_test()
if passed_unit_test is False:
return False, errs
return True, []


class DfStats(object):
'''
DfStats exists to handle inteligent downampling and applying the ColAnalysis functions
'''

def __init__(self, df, col_analysis_objs):
self.df = self.get_operating_df(df, force_full_eval=False)
def __init__(self, df_stats_df, col_analysis_objs, operating_df_name=None):
self.df = self.get_operating_df(df_stats_df, force_full_eval=False)
self.col_order = self.df.columns
self.ap = AnalsysisPipeline(col_analysis_objs)
self.sdf, self.table_hints = self.ap.process_df(self.df)

self.operating_df_name = operating_df_name

self.sdf, self.table_hints, errs = self.ap.process_df(self.df)
if errs:
output_full_reproduce(errs, self.sdf, operating_df_name)

def get_operating_df(self, df, force_full_eval):
rows = len(df)
cols = len(df.columns)
Expand All @@ -193,9 +231,21 @@ def get_operating_df(self, df, force_full_eval):
def presentation_sdf(self):
if self.ap.summary_stats_display == "all":
return self.sdf
return self.sdf.loc[self.ap.summary_stats_display]
return safe_summary_df(self.sdf, self.ap.summary_stats_display)

def add_analysis(self, a_obj):
self.ap.add_analysis(a_obj)
self.sdf, self.table_hints = self.ap.process_df(self.df)

passed_unit_tests, ut_errs = self.ap.add_analysis(a_obj)
self.sdf, self.table_hints, errs = self.ap.process_df(self.df)
if passed_unit_tests == False:
print("Unit tests failed")
if errs:
print("Errors on original dataframe")

if ut_errs or errs:
output_reproduce_preamble()
if ut_errs:
ut_summary_df, _unused_table_hint_dict, ut_errs2 = produce_summary_df(
PERVERSE_DF, self.ap.ordered_a_objs)
output_full_reproduce(ut_errs, ut_summary_df, "PERVERSE_DF")
if errs:
output_full_reproduce(errs, self.sdf, self.operating_df_name)
6 changes: 4 additions & 2 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from .down_sample import sample

from .analysis import (TypingStats, DefaultSummaryStats, ColDisplayHints)
from .analysis_management import DfStats
from .analysis_management import DfStats, get_df_name

from .serialization_utils import df_to_obj, EMPTY_DF_OBJ

Expand Down Expand Up @@ -104,6 +104,7 @@ def __init__(self, df,
self.postProcessingF = postProcessingF
self.processed_result = None
self.transformed_df = None
self.df_name = get_df_name(df)

self.setup_from_command_kls_list()
self.dfConfig = self.get_df_config(df, sampled, reorderdColumns, showCommands)
Expand All @@ -113,6 +114,7 @@ def __init__(self, df,

warnings.filterwarnings('default')


def run_autoclean(self, autoType):
if autoType:
# this will trigger the setting of self.typed_df
Expand Down Expand Up @@ -202,7 +204,7 @@ def get_working_df(self):
def set_typed_df(self, new_df):
self.typed_df = new_df
# stats need to be rerun each time
self.stats = DfStats(self.typed_df, [TypingStats, DefaultSummaryStats, ColDisplayHints])
self.stats = DfStats(self.typed_df, [TypingStats, DefaultSummaryStats, ColDisplayHints], self.df_name)
self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order)
self.update_based_on_df_config(3)

Expand Down
Loading

0 comments on commit 02ac7b9

Please sign in to comment.