Skip to content

Commit

Permalink
Chore/fix nan display (#36)
Browse files Browse the repository at this point in the history
* nulls/na are now displayed as blank
  • Loading branch information
paddymul authored Oct 5, 2023
1 parent f88fcb8 commit e0f764f
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 36 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,9 @@ For a development installation:
```bash
git clone https://github.com/paddymul/buckaroo.git
cd buckaroo
conda install ipywidgets=8 jupyterlab
# We need to build against JupyterLab 3.6.5; JupyterLab 4.0 has different JS typings that conflict.
# The resulting installable still works in JupyterLab 4.
pip install build twine pytest jupyterlab==3.6.5
pip install -ve .
```

Expand Down
27 changes: 18 additions & 9 deletions buckaroo/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def summary(sampled_ser, summary_ser, ser):

is_numeric = pd.api.types.is_numeric_dtype(ser)
is_object = pd.api.types.is_object_dtype
is_bool = pd.api.types.is_bool_dtype(ser)

base_d = dict(
length=l,
Expand All @@ -82,15 +83,18 @@ def summary(sampled_ser, summary_ser, ser):
unique_per=unique_count/l,
nan_per=nan_count/l,
mode=get_mode(ser),
min=(is_numeric and ser.dropna().min() or np.nan),
max=(is_numeric and ser.dropna().max() or np.nan),
mean=(is_numeric and ser.dropna().mean() or np.nan))
if is_numeric:
base_d['mean'] = ser.mean()
min=np.nan,
max=np.nan)
if is_numeric and not is_bool:
base_d.update({
'mean': ser.mean(),
'min': ser.dropna().min(),
'max': ser.dropna().max()})
return base_d


def int_digits(n):
if pd.isna(n):
return 1
if np.isnan(n):
return 1
if n == 0:
Expand Down Expand Up @@ -207,10 +211,15 @@ def table_hints(sampled_ser, summary_ser, table_hint_col_dict):
# return dict(is_numeric=False)
# if len(sampled_ser) == 0:
# return dict(is_numeric=False)
return dict(
base_dict = dict(
is_numeric=is_numeric,
is_integer=pd.api.types.is_integer_dtype(sampled_ser),
min_digits=(is_numeric and not is_bool and int_digits(summary_ser.loc['min'])) or 0,
max_digits=(is_numeric and not is_bool and int_digits(summary_ser.loc['max'])) or 0,
histogram=histogram(sampled_ser, summary_ser['nan_per']))

if is_numeric and not is_bool:
base_dict.update({
'min_digits':int_digits(summary_ser.loc['min']),
'max_digits':int_digits(summary_ser.loc['max']),
})
return base_dict

28 changes: 12 additions & 16 deletions buckaroo/analysis_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,20 @@
from buckaroo.pluggable_analysis_framework import (
ColAnalysis, order_analysis, check_solvable, NotProvidedException)


FAST_SUMMARY_WHEN_GREATER = 1_000_000

# Edge-case fixture: a DataFrame in which every column has exactly 10 entries
# and each column holds one awkward kind of data (all-missing, constant
# booleans, mixed missing markers, extension integer dtypes, ...).
PERVERSE_DF = pd.DataFrame({
# every value is NaN
'all_nan': [np.nan] * 10,
# constant boolean columns
'all_false': [False] * 10,
'all_True': [True] * 10,
# five True values followed by five False values
'mixed_bool': np.concatenate([[True]*5, [False]*5]),
# 0.5, NaN and None followed by seven 6s
# NOTE(review): the None in the first list presumably makes np.concatenate
# return an object array, so this column may not be float64 -- confirm.
'mixed_float': np.concatenate([[0.5, np.nan, None], [6]*7]),
# constant float / int / negative int columns
'float': [0.5] *10,
'int': [8] *10,
'negative': [-1] *10,
# pandas 'UInt32' extension dtype, fully populated
'UInt32': pd.Series([5]*10, dtype='UInt32'),
# pandas 'UInt8' extension dtype where every entry is None
'UInt8None':pd.Series([None] * 10, dtype='UInt8')
})

def produce_summary_df(df, ordered_objs, df_name='test_df'):
"""
Expand Down Expand Up @@ -49,25 +60,10 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
print("%s.summary(test_ser.%s)" % (kls.__name__, ser_name))
return pd.DataFrame(summary_col_dict), table_hint_col_dict, errs



# Exception subclass with no extra behavior of its own.
# NOTE(review): presumably raised when a requested summary row is missing --
# the raise sites are not visible here, confirm before relying on this.
class NonExistentSummaryRowException(Exception):
pass



# Edge-case fixture: a DataFrame in which every column has exactly 10 entries
# and each column holds one awkward kind of data.
PERVERSE_DF = pd.DataFrame({
# every value is NaN
'all_nan': [np.nan] * 10,
# constant boolean columns
'all_false': [False] * 10,
'all_True': [True] * 10,
# five True values followed by five False values
'mixed_bool': np.concatenate([[True]*5, [False]*5]),
# 0.5, NaN and None followed by seven 6s
# NOTE(review): the None presumably forces an object array -- confirm dtype.
'mixed_float': np.concatenate([[0.5, np.nan, None], [6]*7]),
# constant float / int / negative int columns
'float': [0.5] *10,
'int': [8] *10,
'negative': [-1] *10,
# pandas 'UInt32' extension dtype, fully populated
'UInt32': pd.Series([5]*10, dtype='UInt32')
})

class AnalsysisPipeline(object):
"""
manage the ordering of a set of col_analysis objects
Expand Down
4 changes: 4 additions & 0 deletions js/components/DFViewer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ export function DFViewer(
setActiveCol: () => null,
}
) {

const [agColsPure, agData] = dfToAgrid(df);
console.log("dfviewer df", df);
console.log("dfviewer agData", agData);

const styledColumns = updateAtMatch(
_.clone(agColsPure),
activeCol || '___never',
Expand Down
23 changes: 16 additions & 7 deletions js/components/gridUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ export const anyFormatter = (params: ValueFormatterParams): string => {
}
return val;
};

/**
 * Build a value formatter for numeric grid cells.
 *
 * The returned function formats `params.value` with en-US grouping
 * separators and left-pads the result with spaces to `totalWidth`
 * characters. Missing values (`null` or `undefined`) are rendered as an
 * empty string instead of being passed to the number formatter.
 *
 * @param totalWidth minimum width, in characters, of the formatted string
 * @returns a formatter taking `ValueFormatterParams` and returning a string
 */
const getNumericFormatter = (totalWidth: number) => {
  // One Intl.NumberFormat instance is shared by every call of the returned
  // formatter rather than being rebuilt per cell.
  const formatter = new Intl.NumberFormat('en-US');
  const numericFormatter = (params: ValueFormatterParams): string => {
    const val = params.value;
    // A row that lacks this column yields undefined, which
    // Intl.NumberFormat would otherwise render as "NaN".
    if (val === null || val === undefined) {
      return '';
    }
    return formatter.format(val).padStart(totalWidth, ' ');
  };
  return numericFormatter;
};
/*
console.log((new Intl.NumberFormat('en-US')).format(amount))
console.log((new Intl.NumberFormat('en-US', { maximumFractionDigits: 1})).format(number))
Expand All @@ -73,12 +85,7 @@ function getFormatter(hint: ColumnHint): ValueFormatterFunc<unknown> {

if (hint.is_integer) {
const totalWidth = commas + hint.max_digits;
return (params: ValueFormatterParams): string => {
console.log('params', params);

const formatter = new Intl.NumberFormat('en-US');
return formatter.format(params.value).padStart(totalWidth, ' ');
};
return getNumericFormatter(totalWidth);
} else {
/*
Expand All @@ -98,7 +105,9 @@ function getFormatter(hint: ColumnHint): ValueFormatterFunc<unknown> {
};*/
return (params: ValueFormatterParams): string => {
//console.log("params", params)

if(params.value === null) {
return '';
}
return floatFormatter.format(params.value);
};
}
Expand Down
39 changes: 38 additions & 1 deletion js/components/staticData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export interface DFColumn {
name: string;
type: string;
}
export type DFDataRow = Record<string, string | number | boolean>;
export type DFDataRow = Record<string, string | number | boolean | null>;

export type DFData = DFDataRow[];

Expand Down Expand Up @@ -272,6 +272,9 @@ export const tableDf: DFWhole = {
schema: {
fields: [
{ name: 'index', type: 'integer' },
{ name: 'nanNumeric', type: 'int' },
{ name: 'nanObject', type: 'int' },
{ name: 'nanFloat', type: 'float' },
{ name: 'end station name', type: 'string' },
{ name: 'tripduration', type: 'integer' },
{ name: 'start station name', type: 'string' },
Expand All @@ -287,34 +290,50 @@ export const tableDf: DFWhole = {
tripduration: 471,
'start station name': 'Catherine St & Monroe St',
floatCol: '1.111',
nanNumeric: null,
nanObject: null,
nanFloat: null,
},
{
index: 1,
'end station name': 'South St & Whitehall St',
tripduration: 1494,
'start station name': '1 Ave & E 30 St',
floatCol: '8.888',
nanNumeric: null,
nanObject: null,
nanFloat: null,
},
{
index: 2,
'end station name': 'E 59 St & Sutton Pl',
tripduration: 464,
'start station name': 'E 48 St & 3 Ave',
floatCol: '9.999',
nanNumeric: null,
nanObject: null,
nanFloat: null,
},
{
index: 3,
'end station name': 'E 33 St & 5 Ave',
tripduration: 373,
'start station name': 'Pershing Square N',
floatCol: '-10.1',
nanCol: null,
nanNumeric: null,
nanObject: null,
nanFloat: null,
},
{
index: 4,
'end station name': 'Hancock St & Bedford Ave',
tripduration: 660,
'start station name': 'Atlantic Ave & Fort Greene Pl',
floatCol: '10.99',
nanNumeric: null,
nanObject: null,
nanFloat: null,
},
],
table_hints: {
Expand Down Expand Up @@ -345,5 +364,23 @@ export const tableDf: DFWhole = {
{ name: 'NA', cat_pop: 0.0 },
],
},
nanNumeric: {
is_numeric: true,
is_integer: true,
min_digits: 1,
max_digits: 3,
histogram:histograms.num_histo
},
nanFloat: {
is_numeric: true,
is_integer: false,
min_digits: 1,
max_digits: 3,
histogram:histograms.num_histo
},
nanObject: {
is_numeric: false,
}

},
};
8 changes: 6 additions & 2 deletions tests/basic_widget_test.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import pandas as pd
from IPython.display import display
from buckaroo.buckaroo_widget import BuckarooWidget
from buckaroo.analysis_management import PERVERSE_DF


simple_df = pd.DataFrame({'int_col':[1, 2, 3], 'str_col':['a', 'b', 'c']})

def test_basic_instantiation():
    """A widget built from the 3-row ``simple_df`` reports three total rows."""
    # Construct the widget once, straight from the shared fixture; a second
    # construction through a local alias only repeated the same work.
    w = BuckarooWidget(simple_df)
    assert w.dfConfig['totalRows'] == 3

def test_perverse_instantiation():
    """BuckarooWidget can be built from PERVERSE_DF and reports its 10 rows."""
    widget = BuckarooWidget(PERVERSE_DF)
    total_rows = widget.dfConfig['totalRows']
    assert total_rows == 10

def test_basic_display():
df = simple_df
w = BuckarooWidget(df)
Expand Down

0 comments on commit e0f764f

Please sign in to comment.