diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso index c2de5b654654..45bf6d4e14e1 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso @@ -403,7 +403,7 @@ type DB_Table result = self.updated_columns (new_columns.map _.as_internal) Warning.attach (Deprecated.Warning "Standard.Database.DB_Table.DB_Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result - ## ALIAS select_blank_fields, select_missing_columns, select_na + ## ALIAS select_blank_fields, select_missing_columns, select_na, filter_blank_columns GROUP Standard.Base.Selections ICON select_column @@ -2559,7 +2559,7 @@ type DB_Table _ = [columns, shrink_types, error_on_missing_columns, on_problems] Error.throw (Unsupported_Database_Operation.Error "auto_cast") - ## ALIAS drop_missing_rows, dropna + ## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows GROUP Standard.Base.Selections ICON preparation Remove rows which are all blank or containing blank values. diff --git a/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso b/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso index 3e73680448cb..bf1d4c07a3dd 100644 --- a/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso +++ b/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso @@ -327,4 +327,4 @@ type Snowflake_Connection Converts this value to a JSON serializable object. to_js_object : JS_Object to_js_object self = - JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector]] + JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector], ["get_child_node_action", "query"]] diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso index 663a1236f9ce..9c301e2a2c65 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso @@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso polyglot java import org.enso.base.Time_Utils polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator polyglot java import org.enso.table.data.column.operation.CountNothing +polyglot java import org.enso.table.data.column.operation.CountUntrimmed polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation @@ -2213,6 +2214,19 @@ type Column count_nothing : Integer count_nothing self = CountNothing.apply self.java_column + ## PRIVATE + Counts the number of text values with leading or trailing whitespace. + Used for data quality indicator in Table Viz. + count_untrimmed : Integer -> Integer | Nothing + count_untrimmed self sample_size:Integer=Column.default_sample_size = + if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else + CountUntrimmed.apply self.java_column sample_size + + ## PRIVATE + Default size for sampling data quality indicators. 
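+       Mirrors `CountUntrimmed.DEFAULT_SAMPLE_SIZE` (10000 rows in this change);
+       `count_untrimmed` uses it as the default `sample_size`.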
+ default_sample_size -> Integer = + CountUntrimmed.DEFAULT_SAMPLE_SIZE + ## GROUP Standard.Base.Metadata ICON metadata Returns the number of non-null items in this column. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso index 7a96f752b843..eb458835c857 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso @@ -690,7 +690,7 @@ type Table result = Table.new new_columns Warning.attach (Deprecated.Warning "Standard.Table.Table.Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result - ## ALIAS select_blank_fields, select_missing_columns, select_na + ## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows GROUP Standard.Base.Selections ICON select_column @@ -730,7 +730,7 @@ type Table new_columns = self.columns_helper.select_blank_columns_helper when treat_nans_as_blank Table.new new_columns - ## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na + ## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na, filter_blank_columns GROUP Standard.Base.Selections ICON select_column diff --git a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso index 121819081711..1c36bc89a3e1 100644 --- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso +++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso @@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error _ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column" _ : Column -> prepare_visualization x.to_table max_rows _ : Table -> - dataframe = x.take max_rows all_rows_count = x.row_count - make_json_for_table dataframe all_rows_count True False + make_json_for_table x max_rows all_rows_count True False _ : DB_Column -> prepare_visualization x.to_table max_rows _ : DB_Table -> dataframe = x.read (..First max_rows) all_rows_count = x.row_count - make_json_for_table dataframe all_rows_count True True + make_json_for_table dataframe max_rows all_rows_count True True _ : Function -> pairs = [['_display_text_', '[Function '+x.to_text+']']] value = JS_Object.from_pairs pairs @@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error Column Limit max_columns = 250 -## PRIVATE -whitespace_count : Column -> Integer | Nothing -whitespace_count col = - find_whitespace col = - filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace)) - filtered.length - if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing - ## PRIVATE Render Error to JSON make_json_for_error : Any -> JS_Object @@ -187,22 +178,24 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" = to display. - all_rows_count: the number of all rows in the underlying data, useful if only a fragment is displayed. 
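+   - max_rows: the maximum number of rows of data to include in the output.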
-make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object -make_json_for_table dataframe all_rows_count include_index_col is_db_table = - get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) [] - columns = dataframe.columns - header = ["header", columns.map .name] - value_type = ["value_type", columns.map .value_type] - data = ["data", columns.map get_vector] - all_rows = ["all_rows_count", all_rows_count] - has_index_col = ["has_index_col", include_index_col] - links = ["get_child_node_action", "get_row"] - number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing - number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c - nothing_p = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]] - whitespace_p = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value",number_of_whitespace]] - data_quality_metrics = [nothing_p, whitespace_p] - pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] +make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object +make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table = + act_max = if max_rows < all_rows_count then max_rows else all_rows_count + get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) [] + columns = dataframe.columns + header = ["header", columns.map .name] + value_type = ["value_type", columns.map .value_type] + data = ["data", columns.map get_vector] + all_rows = ["all_rows_count", all_rows_count] + has_index_col = ["has_index_col", include_index_col] + links = ["get_child_node_action", "get_row"] + data_quality_metrics = if is_db_table then [] else + number_nothing = JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", columns.map .count_nothing]] + number_untrimmed = case all_rows_count > Column.default_sample_size of + False -> JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]] + True -> JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map .count_untrimmed]] + [number_nothing, number_untrimmed] + pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] JS_Object.from_pairs pairs ## PRIVATE diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index b525ce753bf8..1c6da9b4f901 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -35,6 +35,30 @@ public static String substring(String string, int from, int to) { return string.substring(from, to); } + /** + * Checks if the string has leading or trailing whitespace. 
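+   *
+   * <p>For example, {@code has_leading_trailing_whitespace(" abc")} and
+   * {@code has_leading_trailing_whitespace("abc ")} return {@code true}, while
+   * {@code has_leading_trailing_whitespace("abc")} returns {@code false}.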
+ * + * @param s the string to check + * @return whether the string has leading or trailing whitespace + */ + public static boolean has_leading_trailing_whitespace(String s) { + if (s == null || s.isEmpty()) { + return false; + } + + var leading = Text_Utils.take_prefix(s, 1); + if (leading != null && is_all_whitespace(leading)) { + return true; + } + + var trailing = Text_Utils.take_suffix(s, 1); + if (trailing != null && is_all_whitespace(trailing)) { + return true; + } + + return false; + } + /** * Returns a new string containing characters starting at the given UTF-16 index. * diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java new file mode 100644 index 000000000000..9e0f5923a015 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -0,0 +1,66 @@ +package org.enso.table.data.column.operation; + +import java.util.Random; +import org.enso.base.Text_Utils; +import org.enso.table.data.column.storage.ColumnStorage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.data.table.Column; +import org.graalvm.polyglot.Context; + +public class CountUntrimmed { + // Default seed for random number generation (no specific reason for this value, just stability on + // result). + private static final long RANDOM_SEED = 677280131; + + // Default sample size for counting untrimmed cells. + public static final long DEFAULT_SAMPLE_SIZE = 10000; + + /** Counts the number of cells in the columns with leading or trailing whitespace. */ + public static Long apply(Column column, long sampleSize) throws InterruptedException { + ColumnStorage storage = column.getStorage(); + return applyToStorage(storage, sampleSize); + } + + /** Counts the number of cells in the given storage with leading or trailing whitespace. */ + public static Long applyToStorage(ColumnStorage storage, long sampleSize) + throws InterruptedException { + return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) + ? stringStorage.cachedUntrimmedCount() + : (Long) compute(storage, sampleSize, Context.getCurrent()); + } + + /** Internal method performing the calculation on a storage. 
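+   * If {@code sampleSize} is smaller than the storage size, a fixed-seed random sample is taken
+   * and the sampled count is scaled up to an estimate for the whole storage; otherwise every
+   * value is checked exactly.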
*/ + public static long compute(ColumnStorage storage, long sampleSize, Context context) { + long size = storage.getSize(); + + long count = 0; + if (sampleSize < size) { + var rng = new Random(RANDOM_SEED); + for (int i = 0; i < sampleSize; i++) { + long idx = rng.nextInt(Math.toIntExact(size)); + var val = storage.getItemAsObject(idx); + if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { + count++; + } + + if (context != null) { + context.safepoint(); + } + } + count = Math.min(size, (long) Math.ceil((double) count / sampleSize * size)); + } else { + for (long i = 0; i < storage.getSize(); i++) { + var val = storage.getItemAsObject(i); + if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { + count++; + } + + if (context != null) { + context.safepoint(); + } + } + } + + return count; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 5994a921f3c5..b7268a8f431f 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -1,8 +1,12 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; import org.enso.base.CompareException; import org.enso.base.Text_Utils; +import org.enso.table.data.column.operation.CountUntrimmed; import org.enso.table.data.column.operation.map.BinaryMapOperation; import org.enso.table.data.column.operation.map.MapOperationProblemAggregator; import org.enso.table.data.column.operation.map.MapOperationStorage; @@ -15,11 +19,14 @@ import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.TextType; import org.graalvm.polyglot.Context; +import org.slf4j.Logger; /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { + private static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(StringStorage.class); private final TextType type; + private Future untrimmedCount; /** * @param data the underlying data @@ -29,6 +36,10 @@ public final class StringStorage extends SpecializedStorage { public StringStorage(String[] data, int size, TextType type) { super(data, size, buildOps()); this.type = type; + + untrimmedCount = + CompletableFuture.supplyAsync( + () -> CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, null)); } @Override @@ -46,6 +57,29 @@ public TextType getType() { return type; } + /** + * Counts the number of cells in the columns with whitespace. If the calculation fails then it + * returns null. + * + * @return the number of cells with whitespace + */ + public Long cachedUntrimmedCount() throws InterruptedException { + if (untrimmedCount.isCancelled()) { + // Need to recompute the value, as was cancelled. 
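+      // Recompute synchronously on the calling thread, passing the current polyglot Context so
+      // that the computation can reach safepoints.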
+ untrimmedCount = + CompletableFuture.completedFuture( + CountUntrimmed.compute( + this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); + } + + try { + return untrimmedCount.get(); + } catch (ExecutionException e) { + LOGGER.error("Failed to compute untrimmed count", e); + return null; + } + } + private static MapOperationStorage> buildOps() { MapOperationStorage> t = ObjectStorage.buildObjectOps(); t.add( diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 2d89360443b2..ac775485d1b8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -303,8 +303,18 @@ private static MapOperationStorage buildOps() { @Override public Storage slice(int offset, int limit) { int newSize = Math.min(size - offset, limit); - long[] newData = new long[newSize]; - System.arraycopy(data, offset, newData, 0, newSize); + long[] newData; + + // Special case if slice is after the actual data + if (offset >= data.length) { + newData = new long[0]; + } else { + // Can only copy as much as there is data + int newDataSize = Math.min(data.length - offset, newSize); + newData = new long[newDataSize]; + System.arraycopy(data, offset, newData, 0, newDataSize); + } + BitSet newMask = isNothing.get(offset, offset + limit); return new DoubleStorage(newData, newSize, newMask); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java index a30332d64c0b..9af9a246beb8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java @@ -210,8 +210,18 @@ public long[] getRawData() { @Override public LongStorage slice(int offset, int limit) { int newSize = Math.min(size - offset, limit); - long[] newData = new long[newSize]; - System.arraycopy(data, offset, newData, 0, newSize); + long[] newData; + + // Special case if slice is after the actual data + if (offset >= data.length) { + newData = new long[0]; + } else { + // Can only copy as much as there is data + int newDataSize = Math.min(data.length - offset, newSize); + newData = new long[newDataSize]; + System.arraycopy(data, offset, newData, 0, newDataSize); + } + BitSet newMask = isNothing.get(offset, offset + limit); return new LongStorage(newData, newSize, newMask, type); } diff --git a/test/Visualization_Tests/src/Table_Spec.enso b/test/Visualization_Tests/src/Table_Spec.enso index 5743cc8c7fb1..3d5de6016179 100644 --- a/test/Visualization_Tests/src/Table_Spec.enso +++ b/test/Visualization_Tests/src/Table_Spec.enso @@ -47,16 +47,17 @@ type Foo_Link to_js_object self = JS_Object.from_pairs [["x", self.x], ["links", ["a", "b", "c"]]] add_specs suite_builder = - make_json header data all_rows value_type has_index_col get_child_node number_of_nothing number_of_whitespace = + make_json header data all_rows value_type has_index_col get_child_node number_of_nothing=Nothing number_of_whitespace=Nothing number_of_whitespace_sampled=Nothing = p_header = ["header", header] p_data = ["data", data] p_all_rows = ["all_rows_count", all_rows] p_value_type = ["value_type", value_type] p_has_index_col = ["has_index_col", 
has_index_col] p_get_child_node = ["get_child_node_action", get_child_node] - p_number_of_nothing = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]] - p_number_of_whitespace = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value", number_of_whitespace]] - data_quality_metrics = [p_number_of_nothing, p_number_of_whitespace] + p_number_of_nothing = if number_of_nothing.is_nothing then [] else [JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", number_of_nothing]]] + p_number_of_whitespace = if number_of_whitespace.is_nothing then [] else [JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", number_of_whitespace]]] + p_number_of_whitespace_sampled = if number_of_whitespace_sampled.is_nothing then [] else [JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", number_of_whitespace_sampled]]] + data_quality_metrics = p_number_of_nothing + p_number_of_whitespace + p_number_of_whitespace_sampled pairs = [p_header, p_value_type, p_data, p_all_rows, p_has_index_col, p_get_child_node, ["data_quality_metrics", data_quality_metrics], ["type", "Table"]] JS_Object.from_pairs pairs . to_text @@ -67,19 +68,19 @@ add_specs suite_builder = vis = Visualization.prepare_visualization data.t 1 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] - json = make_json header=["A", "B", "C"] data=[['a'], [2], [3]] all_rows=3 value_type=[value_type_char, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json = make_json header=["A", "B", "C"] data=[['a'], [2], [3]] all_rows=3 value_type=[value_type_char, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" vis . should_equal json group_builder.specify "should visualize database columns" <| vis = Visualization.prepare_visualization (data.t.at "A") 2 value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] value_type_float = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Float"], ["display_text", "Float (64 bits)"], ["bits", 64]] - json = make_json header=["A"] data=[['a', 'a']] all_rows=3 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json = make_json header=["A"] data=[['a', 'a']] all_rows=3 value_type=[value_type_char] has_index_col=True get_child_node="get_row" vis . should_equal json g = data.t.aggregate ["A", "B"] [Aggregate_Column.Average "C"] . at "Average C" vis2 = Visualization.prepare_visualization g 1 - json2 = make_json header=["Average C"] data=[[4.0]] all_rows=2 value_type=[value_type_float] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json2 = make_json header=["Average C"] data=[[4.0]] all_rows=2 value_type=[value_type_float] has_index_col=True get_child_node="get_row" vis2 . 
should_equal json2 group_builder.specify "should visualize dataframe tables" <| @@ -144,6 +145,18 @@ add_specs suite_builder = json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] vis . should_equal json + group_builder.specify "should indicate number of leading/trailing whitespace with sample if more than 10000 rows" <| + space_data = 0.up_to 11000 . map i-> case i % 4 of + 0 -> 'hello' + 1 -> ' leading space' + 2 -> 'trailing space ' + 3 -> ' leading and trailing space ' + space_table = Table.new [["A", space_data]] + vis = Visualization.prepare_visualization space_table 1000 + value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] + json = make_json header=["A"] data=[space_data.take 1000] all_rows=11000 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace_sampled=[8266] + vis . should_equal json + main filter=Nothing = suite = Test.build suite_builder-> add_specs suite_builder
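Editor's note: as a companion to the CountUntrimmed changes above, the following is a minimal, standalone sketch of the same sample-then-extrapolate idea — count untrimmed strings in a fixed-seed random sample and scale the result up to the full size, as `CountUntrimmed.compute` does. It uses only the JDK; the class name `UntrimmedCountSketch`, its methods, and the simplified whitespace check are illustrative assumptions, not part of the patch.

import java.util.Random;

/** Standalone sketch (not part of the patch) of the sample-then-extrapolate counting strategy. */
public final class UntrimmedCountSketch {
  // Fixed seed so repeated runs over the same data give the same estimate.
  private static final long RANDOM_SEED = 677280131;

  /** Simplified check: does the value start or end with a whitespace character? */
  static boolean hasLeadingTrailingWhitespace(String s) {
    return s != null
        && !s.isEmpty()
        && (Character.isWhitespace(s.charAt(0))
            || Character.isWhitespace(s.charAt(s.length() - 1)));
  }

  /** Counts untrimmed values exactly for small inputs, or estimates from a random sample. */
  static long estimateUntrimmed(String[] data, int sampleSize) {
    if (data.length <= sampleSize) {
      long count = 0;
      for (String s : data) {
        if (hasLeadingTrailingWhitespace(s)) count++;
      }
      return count;
    }
    Random rng = new Random(RANDOM_SEED);
    long sampled = 0;
    for (int i = 0; i < sampleSize; i++) {
      if (hasLeadingTrailingWhitespace(data[rng.nextInt(data.length)])) sampled++;
    }
    // Scale the sampled count up to the full row count, never exceeding it.
    return Math.min(data.length, (long) Math.ceil((double) sampled / sampleSize * data.length));
  }

  public static void main(String[] args) {
    String[] data = {"hello", " leading", "trailing ", " both ", "clean"};
    // Only 5 values, so the count is exact: prints 3.
    System.out.println(estimateUntrimmed(data, 10_000));
  }
}

Using a fixed seed keeps the reported estimate stable across repeated renderings of the same table, which is the reason the patch hard-codes RANDOM_SEED rather than drawing a fresh Random each time.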