Chore/fix nan display (#36)

* nulls/NA are now displayed as blank
paddymul · Oct 5, 2023 · e0f764f · e0f764f
1 parent f88fcb8
commit e0f764f
Show file tree

Hide file tree

Showing 7 changed files with 97 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -221,7 +221,9 @@ For a development installation:
 ```bash
 git clone https://github.com/paddymul/buckaroo.git
 cd buckaroo
-conda install ipywidgets=8 jupyterlab
+# We need to build against JupyterLab 3.6.5; JupyterLab 4.0 has different JS typings that conflict.
+# The resulting installable still works in JupyterLab 4.
+pip install build twine pytest jupyterlab==3.6.5
 pip install -ve .
 ```
 

diff --git a/buckaroo/analysis.py b/buckaroo/analysis.py
@@ -71,6 +71,7 @@ def summary(sampled_ser, summary_ser, ser):
 
         is_numeric = pd.api.types.is_numeric_dtype(ser)
         is_object = pd.api.types.is_object_dtype
+        is_bool = pd.api.types.is_bool_dtype(ser)
 
         base_d = dict(
             length=l,
@@ -82,15 +83,18 @@ def summary(sampled_ser, summary_ser, ser):
             unique_per=unique_count/l,
             nan_per=nan_count/l,
             mode=get_mode(ser),
-            min=(is_numeric and ser.dropna().min() or np.nan),
-            max=(is_numeric and ser.dropna().max() or np.nan),
-            mean=(is_numeric and ser.dropna().mean() or np.nan))
-        if is_numeric:
-            base_d['mean'] = ser.mean()
+            min=np.nan,
+            max=np.nan)
+        if is_numeric and not is_bool:
+            base_d.update({
+                'mean': ser.mean(),
+                'min': ser.dropna().min(),
+                'max': ser.dropna().max()})
         return base_d
 
-
 def int_digits(n):
+    if pd.isna(n):
+        return 1
     if np.isnan(n):
         return 1
     if n == 0:
@@ -207,10 +211,15 @@ def table_hints(sampled_ser, summary_ser, table_hint_col_dict):
         #     return dict(is_numeric=False)
         # if len(sampled_ser) == 0:
         #     return dict(is_numeric=False)
-        return dict(
+        base_dict = dict(
             is_numeric=is_numeric,
             is_integer=pd.api.types.is_integer_dtype(sampled_ser),
-            min_digits=(is_numeric and not is_bool and int_digits(summary_ser.loc['min'])) or 0,
-            max_digits=(is_numeric and not is_bool and int_digits(summary_ser.loc['max'])) or 0,
             histogram=histogram(sampled_ser, summary_ser['nan_per']))
 
+        if is_numeric and not is_bool:
+            base_dict.update({
+                'min_digits':int_digits(summary_ser.loc['min']),
+                'max_digits':int_digits(summary_ser.loc['max']),
+                })
+        return base_dict
+
diff --git a/buckaroo/analysis_management.py b/buckaroo/analysis_management.py
@@ -4,9 +4,20 @@
 from buckaroo.pluggable_analysis_framework import (
     ColAnalysis, order_analysis, check_solvable, NotProvidedException)
 
-
 FAST_SUMMARY_WHEN_GREATER = 1_000_000
 
+PERVERSE_DF = pd.DataFrame({
+    'all_nan': [np.nan] * 10,
+    'all_false': [False] * 10,
+    'all_True': [True] * 10,
+    'mixed_bool': np.concatenate([[True]*5, [False]*5]),
+    'mixed_float': np.concatenate([[0.5, np.nan, None], [6]*7]),
+    'float': [0.5] *10,
+    'int': [8] *10,
+    'negative': [-1] *10,
+    'UInt32': pd.Series([5]*10, dtype='UInt32'),
+    'UInt8None':pd.Series([None] * 10, dtype='UInt8')
+    })
 
 def produce_summary_df(df, ordered_objs, df_name='test_df'):
     """
@@ -49,25 +60,10 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
             print("%s.summary(test_ser.%s)" % (kls.__name__, ser_name))
     return pd.DataFrame(summary_col_dict), table_hint_col_dict, errs
 
-
-
 class NonExistentSummaryRowException(Exception):
     pass
 
 
-
-PERVERSE_DF = pd.DataFrame({
-    'all_nan': [np.nan] * 10,
-    'all_false': [False] * 10,
-    'all_True': [True] * 10,
-    'mixed_bool': np.concatenate([[True]*5, [False]*5]),
-    'mixed_float': np.concatenate([[0.5, np.nan, None], [6]*7]),
-    'float': [0.5] *10,
-    'int': [8] *10,
-    'negative': [-1] *10,
-    'UInt32': pd.Series([5]*10, dtype='UInt32')
-    })
-
 class AnalsysisPipeline(object):
     """
     manage the ordering of a set of col_analysis objects

diff --git a/js/components/DFViewer.tsx b/js/components/DFViewer.tsx
@@ -31,7 +31,11 @@ export function DFViewer(
     setActiveCol: () => null,
   }
 ) {
+
   const [agColsPure, agData] = dfToAgrid(df);
+  console.log("dfviewer df", df);
+  console.log("dfviewer agData", agData);
+
   const styledColumns = updateAtMatch(
     _.clone(agColsPure),
     activeCol || '___never',

diff --git a/js/components/gridUtils.ts b/js/components/gridUtils.ts
@@ -52,6 +52,18 @@ export const anyFormatter = (params: ValueFormatterParams): string => {
   }
   return val;
 };
+
+const getNumericFormatter = (totalWidth:number) => {
+  const formatter = new Intl.NumberFormat('en-US');
+  const numericFormatter = (params: ValueFormatterParams): string => {
+    const val = params.value;
+    if(val === null) {
+      return ''
+    }
+    return formatter.format(params.value).padStart(totalWidth, ' ');
+  }
+  return numericFormatter
+}
 /*
   console.log((new Intl.NumberFormat('en-US')).format(amount))
   console.log((new Intl.NumberFormat('en-US', {  maximumFractionDigits: 1})).format(number))
@@ -73,12 +85,7 @@ function getFormatter(hint: ColumnHint): ValueFormatterFunc<unknown> {
 
     if (hint.is_integer) {
       const totalWidth = commas + hint.max_digits;
-      return (params: ValueFormatterParams): string => {
-        console.log('params', params);
-
-        const formatter = new Intl.NumberFormat('en-US');
-        return formatter.format(params.value).padStart(totalWidth, ' ');
-      };
+      return getNumericFormatter(totalWidth);
     } else {
       /*
 
@@ -98,7 +105,9 @@ function getFormatter(hint: ColumnHint): ValueFormatterFunc<unknown> {
       };*/
       return (params: ValueFormatterParams): string => {
         //console.log("params", params)
-
+	if(params.value === null) {
+	  return '';
+	}
         return floatFormatter.format(params.value);
       };
     }

diff --git a/js/components/staticData.ts b/js/components/staticData.ts
@@ -33,7 +33,7 @@ export interface DFColumn {
   name: string;
   type: string;
 }
-export type DFDataRow = Record<string, string | number | boolean>;
+export type DFDataRow = Record<string, string | number | boolean | null>;
 
 export type DFData = DFDataRow[];
 
@@ -272,6 +272,9 @@ export const tableDf: DFWhole = {
   schema: {
     fields: [
       { name: 'index', type: 'integer' },
+      { name: 'nanNumeric', type: 'int' },
+      { name: 'nanObject', type: 'int' },
+      { name: 'nanFloat', type: 'float' },
       { name: 'end station name', type: 'string' },
       { name: 'tripduration', type: 'integer' },
       { name: 'start station name', type: 'string' },
@@ -287,34 +290,50 @@ export const tableDf: DFWhole = {
       tripduration: 471,
       'start station name': 'Catherine St & Monroe St',
       floatCol: '1.111',
+      nanNumeric: null,
+      nanObject: null,
+      nanFloat: null,
     },
     {
       index: 1,
       'end station name': 'South St & Whitehall St',
       tripduration: 1494,
       'start station name': '1 Ave & E 30 St',
       floatCol: '8.888',
+      nanNumeric: null,
+      nanObject: null,
+      nanFloat: null,
     },
     {
       index: 2,
       'end station name': 'E 59 St & Sutton Pl',
       tripduration: 464,
       'start station name': 'E 48 St & 3 Ave',
       floatCol: '9.999',
+      nanNumeric: null,
+      nanObject: null,
+      nanFloat: null,
     },
     {
       index: 3,
       'end station name': 'E 33 St & 5 Ave',
       tripduration: 373,
       'start station name': 'Pershing Square N',
       floatCol: '-10.1',
+      nanCol: null,
+      nanNumeric: null,
+      nanObject: null,
+      nanFloat: null,
     },
     {
       index: 4,
       'end station name': 'Hancock St & Bedford Ave',
       tripduration: 660,
       'start station name': 'Atlantic Ave & Fort Greene Pl',
       floatCol: '10.99',
+      nanNumeric: null,
+      nanObject: null,
+      nanFloat: null,
     },
   ],
   table_hints: {
@@ -345,5 +364,23 @@ export const tableDf: DFWhole = {
         { name: 'NA', cat_pop: 0.0 },
       ],
     },
+    nanNumeric: {
+      is_numeric: true,
+      is_integer: true,
+      min_digits: 1,
+      max_digits: 3,
+      histogram:histograms.num_histo
+    },
+    nanFloat: {
+      is_numeric: true,
+      is_integer: false,
+      min_digits: 1,
+      max_digits: 3,
+      histogram:histograms.num_histo
+    },
+    nanObject: {
+      is_numeric: false,
+    }
+
   },
 };
diff --git a/tests/basic_widget_test.py b/tests/basic_widget_test.py
@@ -1,15 +1,19 @@
 import pandas as pd
 from IPython.display import display
 from buckaroo.buckaroo_widget import BuckarooWidget
+from buckaroo.analysis_management import PERVERSE_DF
 
 
 simple_df = pd.DataFrame({'int_col':[1, 2, 3], 'str_col':['a', 'b', 'c']})
 
 def test_basic_instantiation():
-    df = simple_df
-    w = BuckarooWidget(df)
+    w = BuckarooWidget(simple_df)
     assert w.dfConfig['totalRows'] == 3
 
+def test_perverse_instantiation():
+    w = BuckarooWidget(PERVERSE_DF)
+    assert w.dfConfig['totalRows'] == 10
+
 def test_basic_display():
     df = simple_df
     w = BuckarooWidget(df)