From d0951838504c4825c9fe637a23865bef620ae642 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Wed, 20 Nov 2024 16:31:46 +0000 Subject: [PATCH 01/15] Use memoized in built untrimmed count. --- .../Standard/Table/0.0.0-dev/src/Column.enso | 9 +++ .../0.0.0-dev/src/Table/Visualization.enso | 24 +++----- .../main/java/org/enso/base/Text_Utils.java | 58 +++++++++++++++++++ .../data/column/operation/CountUntrimmed.java | 39 +++++++++++++ .../data/column/storage/StringStorage.java | 16 +++++ 5 files changed, 130 insertions(+), 16 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso index 663a1236f9ce..128c13fc30a1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso @@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso polyglot java import org.enso.base.Time_Utils polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator polyglot java import org.enso.table.data.column.operation.CountNothing +polyglot java import org.enso.table.data.column.operation.CountUntrimmed polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation @@ -2213,6 +2214,14 @@ type Column count_nothing : Integer count_nothing self = CountNothing.apply self.java_column + ## PRIVATE + Counts the number of text values with leading or trailing whitespace. + Used for data quality indicator in Table Viz. + count_untrimmed : Integer | Nothing + count_untrimmed self = + if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else + CountUntrimmed.apply self.java_column + ## GROUP Standard.Base.Metadata ICON metadata Returns the number of non-null items in this column. diff --git a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso index 121819081711..9e91314852f9 100644 --- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso +++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso @@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error _ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column" _ : Column -> prepare_visualization x.to_table max_rows _ : Table -> - dataframe = x.take max_rows all_rows_count = x.row_count - make_json_for_table dataframe all_rows_count True False + make_json_for_table x max_rows all_rows_count True False _ : DB_Column -> prepare_visualization x.to_table max_rows _ : DB_Table -> dataframe = x.read (..First max_rows) all_rows_count = x.row_count - make_json_for_table dataframe all_rows_count True True + make_json_for_table dataframe max_rows all_rows_count True True _ : Function -> pairs = [['_display_text_', '[Function '+x.to_text+']']] value = JS_Object.from_pairs pairs @@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error Column Limit max_columns = 250 -## PRIVATE -whitespace_count : Column -> Integer | Nothing -whitespace_count col = - find_whitespace col = - filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace)) - filtered.length - if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing - ## PRIVATE Render Error to JSON make_json_for_error : Any -> JS_Object @@ -187,9 +178,10 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" = to display. - all_rows_count: the number of all rows in the underlying data, useful if only a fragment is displayed. -make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object -make_json_for_table dataframe all_rows_count include_index_col is_db_table = - get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) [] +make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object +make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table = + act_max = if max_rows < all_rows_count then max_rows else all_rows_count + get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) [] columns = dataframe.columns header = ["header", columns.map .name] value_type = ["value_type", columns.map .value_type] @@ -197,8 +189,8 @@ make_json_for_table dataframe all_rows_count include_index_col is_db_table = all_rows = ["all_rows_count", all_rows_count] has_index_col = ["has_index_col", include_index_col] links = ["get_child_node_action", "get_row"] - number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing - number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c + number_of_nothing = if is_db_table then Nothing else columns.map .count_nothing + number_of_whitespace= if is_db_table then Nothing else columns.map .count_untrimmed nothing_p = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]] whitespace_p = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value",number_of_whitespace]] data_quality_metrics = [nothing_p, whitespace_p] diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index b525ce753bf8..945c431b3c7f 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -35,6 +35,64 @@ public static String substring(String string, int from, int to) { return string.substring(from, to); } + /** + * Gets the first Grapheme cluster in the string. + * + * @param string the string to substring + * @return the first grapheme cluster in the string or null if the string is empty. + */ + public static String first_cluster(String string) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(string); + int start = breakIterator.first(); + if (start == -1) { + return null; + } + int end = breakIterator.next(); + return string.substring(start, end); + } + + /** + * Gets the last Grapheme cluster in the string. + * + * @param string the string to substring + * @return the last grapheme cluster in the string or null if the string is empty. + */ + public static String last_cluster(String string) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(string); + int start = breakIterator.last(); + if (start == -1) { + return null; + } + int end = breakIterator.previous(); + return string.substring(end, start); + } + + /** + * Checks if the string has leading or trailing whitespace. + * + * @param s the string to check + * @return whether the string has leading or trailing whitespace + */ + public static boolean has_leading_trailing_whitespace(String s) { + if (s == null && s.isEmpty()) { + return false; + } + + var leading = Text_Utils.first_cluster(s); + if (leading != null && is_all_whitespace(leading)) { + return true; + } + + var trailing = Text_Utils.last_cluster(s); + if (trailing != null && is_all_whitespace(trailing)) { + return true; + } + + return false; + } + /** * Returns a new string containing characters starting at the given UTF-16 index. * diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java new file mode 100644 index 000000000000..da0c419f19d8 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -0,0 +1,39 @@ +package org.enso.table.data.column.operation; + +import org.enso.base.Text_Utils; +import org.enso.table.data.column.storage.ColumnStorage; +import org.enso.table.data.column.storage.StringStorage; +import org.enso.table.data.table.Column; +import org.graalvm.polyglot.Context; + +public class CountUntrimmed { + /** Counts the number of cells in the columns with leading or trailing whitespace. */ + public static long apply(Column column) { + ColumnStorage storage = column.getStorage(); + return applyToStorage(storage); + } + + /** Counts the number of cells in the given storage with leading or trailing whitespace. */ + public static long applyToStorage(ColumnStorage storage) { + if (storage instanceof StringStorage stringStorage) { + return stringStorage.countLeadingTrailingWhitespace(); + } + return compute(storage); + } + + /** Internal method performing the calculation on a storage. */ + public static long compute(ColumnStorage storage) { + Context context = Context.getCurrent(); + long count = 0; + for (long i = 0; i < storage.getSize(); i++) { + var val = storage.getItemAsObject(i); + if (val instanceof String str) { + if (Text_Utils.has_leading_trailing_whitespace(str)) { + count++; + } + } + context.safepoint(); + } + return count; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 5994a921f3c5..3f503f751293 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -3,6 +3,7 @@ import java.util.BitSet; import org.enso.base.CompareException; import org.enso.base.Text_Utils; +import org.enso.table.data.column.operation.CountUntrimmed; import org.enso.table.data.column.operation.map.BinaryMapOperation; import org.enso.table.data.column.operation.map.MapOperationProblemAggregator; import org.enso.table.data.column.operation.map.MapOperationStorage; @@ -20,6 +21,7 @@ public final class StringStorage extends SpecializedStorage { private final TextType type; + private long _countLeadingTrailingWhitespace = -1; /** * @param data the underlying data @@ -46,6 +48,20 @@ public TextType getType() { return type; } + /** + * Counts the number of cells in the columns with whitespace. + * Memoized into the storage for performance. + * @return the number of cells with whitespace + */ + public Long countLeadingTrailingWhitespace() { + if (_countLeadingTrailingWhitespace >= 0) { + return _countLeadingTrailingWhitespace; + } + + _countLeadingTrailingWhitespace = CountUntrimmed.compute(this); + return _countLeadingTrailingWhitespace; + } + private static MapOperationStorage> buildOps() { MapOperationStorage> t = ObjectStorage.buildObjectOps(); t.add( From 4281c5185ba50ae2550881577ce83818fc6c7dc3 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 21 Nov 2024 11:01:07 +0000 Subject: [PATCH 02/15] Logging --- .../data/column/operation/CountUntrimmed.java | 9 ++++++++- .../table/data/column/storage/StringStorage.java | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index da0c419f19d8..4caa5fb2471b 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -5,8 +5,12 @@ import org.enso.table.data.column.storage.StringStorage; import org.enso.table.data.table.Column; import org.graalvm.polyglot.Context; +import org.slf4j.LoggerFactory; +import org.slf4j.Logger; public class CountUntrimmed { + private static final Logger LOGGER = LoggerFactory.getLogger(CountUntrimmed.class); + /** Counts the number of cells in the columns with leading or trailing whitespace. */ public static long apply(Column column) { ColumnStorage storage = column.getStorage(); @@ -16,8 +20,11 @@ public static long apply(Column column) { /** Counts the number of cells in the given storage with leading or trailing whitespace. */ public static long applyToStorage(ColumnStorage storage) { if (storage instanceof StringStorage stringStorage) { + LOGGER.warn("Using memoized implementation for StringStorage"); return stringStorage.countLeadingTrailingWhitespace(); } + + LOGGER.warn("Using fall back implementation for ColumnStorage"); return compute(storage); } @@ -29,7 +36,7 @@ public static long compute(ColumnStorage storage) { var val = storage.getItemAsObject(i); if (val instanceof String str) { if (Text_Utils.has_leading_trailing_whitespace(str)) { - count++; + count += 100; } } context.safepoint(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 3f503f751293..a88dddc01833 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -1,6 +1,12 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; +import java.util.logging.Level; +import java.util.logging.Logger; + import org.enso.base.CompareException; import org.enso.base.Text_Utils; import org.enso.table.data.column.operation.CountUntrimmed; @@ -16,9 +22,12 @@ import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.TextType; import org.graalvm.polyglot.Context; +import org.slf4j.LoggerFactory; /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { + private static final Executor EXECUTOR = Executors.newSingleThreadExecutor(); + private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(StringStorage.class); private final TextType type; private long _countLeadingTrailingWhitespace = -1; @@ -31,6 +40,10 @@ public final class StringStorage extends SpecializedStorage { public StringStorage(String[] data, int size, TextType type) { super(data, size, buildOps()); this.type = type; + + CompletableFuture.runAsync( + this::countLeadingTrailingWhitespace, + EXECUTOR); } @Override @@ -55,10 +68,12 @@ public TextType getType() { */ public Long countLeadingTrailingWhitespace() { if (_countLeadingTrailingWhitespace >= 0) { + LOGGER.warn("Using memoized implementation for StringStorage"); return _countLeadingTrailingWhitespace; } _countLeadingTrailingWhitespace = CountUntrimmed.compute(this); + LOGGER.warn("Counted leading and trailing whitespace in the column " + this.size); return _countLeadingTrailingWhitespace; } From ff532a20b17b161afdb599960812da2720c014af Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Tue, 26 Nov 2024 09:29:35 +0000 Subject: [PATCH 03/15] Fix issue with DB_Table visualization. --- .../Visualization/0.0.0-dev/src/Table/Visualization.enso | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso index 9e91314852f9..2b65eff9a7d0 100644 --- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso +++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso @@ -189,11 +189,10 @@ make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_ta all_rows = ["all_rows_count", all_rows_count] has_index_col = ["has_index_col", include_index_col] links = ["get_child_node_action", "get_row"] - number_of_nothing = if is_db_table then Nothing else columns.map .count_nothing - number_of_whitespace= if is_db_table then Nothing else columns.map .count_untrimmed - nothing_p = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]] - whitespace_p = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value",number_of_whitespace]] - data_quality_metrics = [nothing_p, whitespace_p] + data_quality_metrics = if is_db_table then [] else + number_nothing = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", columns.map .count_nothing]] + number_untrimmed = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]] + [number_nothing, number_untrimmed] pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] JS_Object.from_pairs pairs From 7bf6c9ebbef11cfbac40ed2bfd1aa3b506d0d819 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 28 Nov 2024 16:24:40 +0000 Subject: [PATCH 04/15] Make Snowflake link double click work. --- .../Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso b/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso index 3e73680448cb..bf1d4c07a3dd 100644 --- a/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso +++ b/distribution/lib/Standard/Snowflake/0.0.0-dev/src/Snowflake_Connection.enso @@ -327,4 +327,4 @@ type Snowflake_Connection Converts this value to a JSON serializable object. to_js_object : JS_Object to_js_object self = - JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector]] + JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector], ["get_child_node_action", "query"]] From f2a7f39ce803452be4a7bb6c951aada64f682f70 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 28 Nov 2024 16:27:42 +0000 Subject: [PATCH 05/15] ALIAS for blank functions. --- .../lib/Standard/Database/0.0.0-dev/src/DB_Table.enso | 4 ++-- distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso index c2de5b654654..45bf6d4e14e1 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso @@ -403,7 +403,7 @@ type DB_Table result = self.updated_columns (new_columns.map _.as_internal) Warning.attach (Deprecated.Warning "Standard.Database.DB_Table.DB_Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result - ## ALIAS select_blank_fields, select_missing_columns, select_na + ## ALIAS select_blank_fields, select_missing_columns, select_na, filter_blank_columns GROUP Standard.Base.Selections ICON select_column @@ -2559,7 +2559,7 @@ type DB_Table _ = [columns, shrink_types, error_on_missing_columns, on_problems] Error.throw (Unsupported_Database_Operation.Error "auto_cast") - ## ALIAS drop_missing_rows, dropna + ## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows GROUP Standard.Base.Selections ICON preparation Remove rows which are all blank or containing blank values. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso index 7a96f752b843..eb458835c857 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso @@ -690,7 +690,7 @@ type Table result = Table.new new_columns Warning.attach (Deprecated.Warning "Standard.Table.Table.Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result - ## ALIAS select_blank_fields, select_missing_columns, select_na + ## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows GROUP Standard.Base.Selections ICON select_column @@ -730,7 +730,7 @@ type Table new_columns = self.columns_helper.select_blank_columns_helper when treat_nans_as_blank Table.new new_columns - ## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na + ## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na, filter_blank_columns GROUP Standard.Base.Selections ICON select_column From fe647dfc65c770e1e3f390cd91a88877274f25f7 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 09:27:20 +0000 Subject: [PATCH 06/15] Memoize nearly working... --- .../Standard/Table/0.0.0-dev/src/Column.enso | 6 +-- .../data/column/operation/CountUntrimmed.java | 41 +++++++++++++++---- .../data/column/storage/StringStorage.java | 18 ++++---- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso index 128c13fc30a1..aa74ed7b7c8f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso @@ -2217,10 +2217,10 @@ type Column ## PRIVATE Counts the number of text values with leading or trailing whitespace. Used for data quality indicator in Table Viz. - count_untrimmed : Integer | Nothing - count_untrimmed self = + count_untrimmed : Integer -> Integer | Nothing + count_untrimmed self sample_size:Integer=CountUntrimmed.DEFAULT_SAMPLE_SIZE = if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else - CountUntrimmed.apply self.java_column + CountUntrimmed.apply self.java_column sample_size ## GROUP Standard.Base.Metadata ICON metadata diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index 4caa5fb2471b..73485d1dfa7e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -1,6 +1,7 @@ package org.enso.table.data.column.operation; import org.enso.base.Text_Utils; +import org.enso.base.random.Random_Utils; import org.enso.table.data.column.storage.ColumnStorage; import org.enso.table.data.column.storage.StringStorage; import org.enso.table.data.table.Column; @@ -8,39 +9,61 @@ import org.slf4j.LoggerFactory; import org.slf4j.Logger; +import java.util.Random; +import java.util.random.RandomGenerator; + public class CountUntrimmed { private static final Logger LOGGER = LoggerFactory.getLogger(CountUntrimmed.class); + // Default seed for random number generation (no specific reason for this value, just stability on result). + private static final long RANDOM_SEED = 677280131; + + // Default sample size for counting untrimmed cells. + public static final long DEFAULT_SAMPLE_SIZE = 10000; + /** Counts the number of cells in the columns with leading or trailing whitespace. */ - public static long apply(Column column) { + public static long apply(Column column, long sampleSize) { ColumnStorage storage = column.getStorage(); - return applyToStorage(storage); + return applyToStorage(storage, sampleSize); } /** Counts the number of cells in the given storage with leading or trailing whitespace. */ - public static long applyToStorage(ColumnStorage storage) { - if (storage instanceof StringStorage stringStorage) { + public static long applyToStorage(ColumnStorage storage, long sampleSize) { + if (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) { LOGGER.warn("Using memoized implementation for StringStorage"); - return stringStorage.countLeadingTrailingWhitespace(); + return stringStorage.countUntrimmed(); } - LOGGER.warn("Using fall back implementation for ColumnStorage"); - return compute(storage); + LOGGER.warn("Using fallback implementation for ColumnStorage"); + return compute(storage, sampleSize); } /** Internal method performing the calculation on a storage. */ - public static long compute(ColumnStorage storage) { + public static long compute(ColumnStorage storage, long sampleSize) { + long size = storage.getSize(); + boolean sample = sampleSize < size; + Random rng = sample ? new Random(RANDOM_SEED) : null; + double sampleRate = sample ? (double) sampleSize / size : 1.0; + Context context = Context.getCurrent(); long count = 0; for (long i = 0; i < storage.getSize(); i++) { + if (sample && rng.nextDouble() > sampleRate) { + continue; + } + var val = storage.getItemAsObject(i); if (val instanceof String str) { if (Text_Utils.has_leading_trailing_whitespace(str)) { - count += 100; + count++; } } context.safepoint(); } + + if (sample) { + count = Math.min(size, (long) Math.ceil((double) count / sampleRate)); + } return count; } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index a88dddc01833..4d02167b7959 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -4,8 +4,6 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; import java.util.concurrent.Executors; -import java.util.logging.Level; -import java.util.logging.Logger; import org.enso.base.CompareException; import org.enso.base.Text_Utils; @@ -26,7 +24,6 @@ /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { - private static final Executor EXECUTOR = Executors.newSingleThreadExecutor(); private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(StringStorage.class); private final TextType type; @@ -42,8 +39,11 @@ public StringStorage(String[] data, int size, TextType type) { this.type = type; CompletableFuture.runAsync( - this::countLeadingTrailingWhitespace, - EXECUTOR); + () -> { + LOGGER.warn("Background counting untrimmed cells in the column " + this.size); + countUntrimmed(); + LOGGER.warn("Counted untrimmed " + this.size); + }); } @Override @@ -66,14 +66,14 @@ public TextType getType() { * Memoized into the storage for performance. * @return the number of cells with whitespace */ - public Long countLeadingTrailingWhitespace() { - if (_countLeadingTrailingWhitespace >= 0) { + public Long countUntrimmed() { + if (_countLeadingTrailingWhitespace != -1) { LOGGER.warn("Using memoized implementation for StringStorage"); return _countLeadingTrailingWhitespace; } - _countLeadingTrailingWhitespace = CountUntrimmed.compute(this); - LOGGER.warn("Counted leading and trailing whitespace in the column " + this.size); + _countLeadingTrailingWhitespace = CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); + LOGGER.warn("Counted untrimmed text in the column " + this.size); return _countLeadingTrailingWhitespace; } From c40a7a5a3e9bae418bd5691be6e12b8082643fb9 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 10:49:16 +0000 Subject: [PATCH 07/15] Remove logging. Note when sampled. --- .../0.0.0-dev/src/Table/Visualization.enso | 7 ++++--- .../table/data/column/operation/CountUntrimmed.java | 12 +++--------- .../table/data/column/storage/StringStorage.java | 11 +---------- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso index 2b65eff9a7d0..366632a0d262 100644 --- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso +++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso @@ -190,9 +190,10 @@ make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_ta has_index_col = ["has_index_col", include_index_col] links = ["get_child_node_action", "get_row"] data_quality_metrics = if is_db_table then [] else - number_nothing = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", columns.map .count_nothing]] - number_untrimmed = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]] - [number_nothing, number_untrimmed] + number_nothing = JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", columns.map .count_nothing]] + number_untrimmed = JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map c-> if c.length > 10000 then Nothing else c.count_untrimmed]] + number_untrimmed_sampled = JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map c-> if c.length <= 10000 then Nothing else c.count_untrimmed]] + [number_nothing, number_untrimmed, number_untrimmed_sampled] pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] JS_Object.from_pairs pairs diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index 73485d1dfa7e..2ac1f8861c90 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -13,8 +13,6 @@ import java.util.random.RandomGenerator; public class CountUntrimmed { - private static final Logger LOGGER = LoggerFactory.getLogger(CountUntrimmed.class); - // Default seed for random number generation (no specific reason for this value, just stability on result). private static final long RANDOM_SEED = 677280131; @@ -29,13 +27,9 @@ public static long apply(Column column, long sampleSize) { /** Counts the number of cells in the given storage with leading or trailing whitespace. */ public static long applyToStorage(ColumnStorage storage, long sampleSize) { - if (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) { - LOGGER.warn("Using memoized implementation for StringStorage"); - return stringStorage.countUntrimmed(); - } - - LOGGER.warn("Using fallback implementation for ColumnStorage"); - return compute(storage, sampleSize); + return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) + ? stringStorage.countUntrimmed() + : compute(storage, sampleSize); } /** Internal method performing the calculation on a storage. */ diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 4d02167b7959..b633977bbfa6 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -24,8 +24,6 @@ /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { - private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(StringStorage.class); - private final TextType type; private long _countLeadingTrailingWhitespace = -1; @@ -38,12 +36,7 @@ public StringStorage(String[] data, int size, TextType type) { super(data, size, buildOps()); this.type = type; - CompletableFuture.runAsync( - () -> { - LOGGER.warn("Background counting untrimmed cells in the column " + this.size); - countUntrimmed(); - LOGGER.warn("Counted untrimmed " + this.size); - }); + CompletableFuture.runAsync(this::countUntrimmed); } @Override @@ -68,12 +61,10 @@ public TextType getType() { */ public Long countUntrimmed() { if (_countLeadingTrailingWhitespace != -1) { - LOGGER.warn("Using memoized implementation for StringStorage"); return _countLeadingTrailingWhitespace; } _countLeadingTrailingWhitespace = CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); - LOGGER.warn("Counted untrimmed text in the column " + this.size); return _countLeadingTrailingWhitespace; } From fd575c9458017aa903e303c4276f90983c2d04f7 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 11:10:11 +0000 Subject: [PATCH 08/15] Cope with slicing null ending Long/Double columns. --- .../data/column/operation/CountUntrimmed.java | 10 +++------- .../table/data/column/storage/StringStorage.java | 12 +++++------- .../data/column/storage/numeric/DoubleStorage.java | 14 ++++++++++++-- .../data/column/storage/numeric/LongStorage.java | 14 ++++++++++++-- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index 2ac1f8861c90..62ac2e170b27 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -1,19 +1,15 @@ package org.enso.table.data.column.operation; +import java.util.Random; import org.enso.base.Text_Utils; -import org.enso.base.random.Random_Utils; import org.enso.table.data.column.storage.ColumnStorage; import org.enso.table.data.column.storage.StringStorage; import org.enso.table.data.table.Column; import org.graalvm.polyglot.Context; -import org.slf4j.LoggerFactory; -import org.slf4j.Logger; - -import java.util.Random; -import java.util.random.RandomGenerator; public class CountUntrimmed { - // Default seed for random number generation (no specific reason for this value, just stability on result). + // Default seed for random number generation (no specific reason for this value, just stability on + // result). private static final long RANDOM_SEED = 677280131; // Default sample size for counting untrimmed cells. diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index b633977bbfa6..f4c9f8f1e6c2 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -2,9 +2,6 @@ import java.util.BitSet; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.Executor; -import java.util.concurrent.Executors; - import org.enso.base.CompareException; import org.enso.base.Text_Utils; import org.enso.table.data.column.operation.CountUntrimmed; @@ -20,7 +17,6 @@ import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.TextType; import org.graalvm.polyglot.Context; -import org.slf4j.LoggerFactory; /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { @@ -55,8 +51,9 @@ public TextType getType() { } /** - * Counts the number of cells in the columns with whitespace. - * Memoized into the storage for performance. + * Counts the number of cells in the columns with whitespace. Memoized into the storage for + * performance. + * * @return the number of cells with whitespace */ public Long countUntrimmed() { @@ -64,7 +61,8 @@ public Long countUntrimmed() { return _countLeadingTrailingWhitespace; } - _countLeadingTrailingWhitespace = CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); + _countLeadingTrailingWhitespace = + CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); return _countLeadingTrailingWhitespace; } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 2d89360443b2..ac775485d1b8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -303,8 +303,18 @@ private static MapOperationStorage buildOps() { @Override public Storage slice(int offset, int limit) { int newSize = Math.min(size - offset, limit); - long[] newData = new long[newSize]; - System.arraycopy(data, offset, newData, 0, newSize); + long[] newData; + + // Special case if slice is after the actual data + if (offset >= data.length) { + newData = new long[0]; + } else { + // Can only copy as much as there is data + int newDataSize = Math.min(data.length - offset, newSize); + newData = new long[newDataSize]; + System.arraycopy(data, offset, newData, 0, newDataSize); + } + BitSet newMask = isNothing.get(offset, offset + limit); return new DoubleStorage(newData, newSize, newMask); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java index a30332d64c0b..9af9a246beb8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java @@ -210,8 +210,18 @@ public long[] getRawData() { @Override public LongStorage slice(int offset, int limit) { int newSize = Math.min(size - offset, limit); - long[] newData = new long[newSize]; - System.arraycopy(data, offset, newData, 0, newSize); + long[] newData; + + // Special case if slice is after the actual data + if (offset >= data.length) { + newData = new long[0]; + } else { + // Can only copy as much as there is data + int newDataSize = Math.min(data.length - offset, newSize); + newData = new long[newDataSize]; + System.arraycopy(data, offset, newData, 0, newDataSize); + } + BitSet newMask = isNothing.get(offset, offset + limit); return new LongStorage(newData, newSize, newMask, type); } From d972d29d767b92b8719bdcaab40240ae26fb717a Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 12:13:12 +0000 Subject: [PATCH 09/15] Fix viz tests --- test/Visualization_Tests/src/Table_Spec.enso | 23 ++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/test/Visualization_Tests/src/Table_Spec.enso b/test/Visualization_Tests/src/Table_Spec.enso index 5743cc8c7fb1..12140b6b93b2 100644 --- a/test/Visualization_Tests/src/Table_Spec.enso +++ b/test/Visualization_Tests/src/Table_Spec.enso @@ -47,16 +47,17 @@ type Foo_Link to_js_object self = JS_Object.from_pairs [["x", self.x], ["links", ["a", "b", "c"]]] add_specs suite_builder = - make_json header data all_rows value_type has_index_col get_child_node number_of_nothing number_of_whitespace = + make_json header data all_rows value_type has_index_col get_child_node number_of_nothing=Nothing number_of_whitespace=Nothing number_of_whitespace_sampled=Nothing = p_header = ["header", header] p_data = ["data", data] p_all_rows = ["all_rows_count", all_rows] p_value_type = ["value_type", value_type] p_has_index_col = ["has_index_col", has_index_col] p_get_child_node = ["get_child_node_action", get_child_node] - p_number_of_nothing = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]] - p_number_of_whitespace = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value", number_of_whitespace]] - data_quality_metrics = [p_number_of_nothing, p_number_of_whitespace] + p_number_of_nothing = if number_of_nothing.is_nothing then [] else [JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", number_of_nothing]]] + p_number_of_whitespace = if number_of_whitespace.is_nothing then [] else [JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", number_of_whitespace]]] + p_number_of_whitespace_sampled = if number_of_whitespace_sampled.is_nothing then [] else [JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", number_of_whitespace_sampled]]] + data_quality_metrics = p_number_of_nothing + p_number_of_whitespace + p_number_of_whitespace_sampled pairs = [p_header, p_value_type, p_data, p_all_rows, p_has_index_col, p_get_child_node, ["data_quality_metrics", data_quality_metrics], ["type", "Table"]] JS_Object.from_pairs pairs . to_text @@ -67,31 +68,31 @@ add_specs suite_builder = vis = Visualization.prepare_visualization data.t 1 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] - json = make_json header=["A", "B", "C"] data=[['a'], [2], [3]] all_rows=3 value_type=[value_type_char, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json = make_json header=["A", "B", "C"] data=[['a'], [2], [3]] all_rows=3 value_type=[value_type_char, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" vis . should_equal json group_builder.specify "should visualize database columns" <| vis = Visualization.prepare_visualization (data.t.at "A") 2 value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] value_type_float = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Float"], ["display_text", "Float (64 bits)"], ["bits", 64]] - json = make_json header=["A"] data=[['a', 'a']] all_rows=3 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json = make_json header=["A"] data=[['a', 'a']] all_rows=3 value_type=[value_type_char] has_index_col=True get_child_node="get_row" vis . should_equal json g = data.t.aggregate ["A", "B"] [Aggregate_Column.Average "C"] . at "Average C" vis2 = Visualization.prepare_visualization g 1 - json2 = make_json header=["Average C"] data=[[4.0]] all_rows=2 value_type=[value_type_float] has_index_col=True get_child_node="get_row" number_of_nothing=Nothing number_of_whitespace=Nothing + json2 = make_json header=["Average C"] data=[[4.0]] all_rows=2 value_type=[value_type_float] has_index_col=True get_child_node="get_row" vis2 . should_equal json2 group_builder.specify "should visualize dataframe tables" <| vis = Visualization.prepare_visualization data.t2 1 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[[1], [4], [7]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0,0,0] number_of_whitespace=[Nothing, Nothing, Nothing] + json = make_json header=["A", "B", "C"] data=[[1], [4], [7]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0,0,0] number_of_whitespace=[Nothing, Nothing, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] vis . should_equal json group_builder.specify "should visualize dataframe columns" <| vis = Visualization.prepare_visualization (data.t2.at "A") 2 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A"] data=[[1, 2]] all_rows=3 value_type=[value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] + json = make_json header=["A"] data=[[1, 2]] all_rows=3 value_type=[value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] number_of_whitespace_sampled=[Nothing] vis . should_equal json group_builder.specify "should handle Vectors" <| @@ -134,14 +135,14 @@ add_specs suite_builder = group_builder.specify "should indicate number of Nothing/Nulls" <| vis = Visualization.prepare_visualization data.t3_with_nulls 3 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[[1,Nothing,3],[4,Nothing,Nothing],[7,Nothing,Nothing]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[1, 2, 2] number_of_whitespace=[Nothing, Nothing, Nothing] + json = make_json header=["A", "B", "C"] data=[[1,Nothing,3],[4,Nothing,Nothing],[7,Nothing,Nothing]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[1, 2, 2] number_of_whitespace=[Nothing, Nothing, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] vis . should_equal json group_builder.specify "should indicate number of leading/trailing whitespace" <| vis = Visualization.prepare_visualization data.t4_with_space 3 value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] + json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] vis . should_equal json main filter=Nothing = From dc7b143b89711734856043cf06782196ea2ef99f Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 12:25:12 +0000 Subject: [PATCH 10/15] Test for sampled data quality. --- test/Visualization_Tests/src/Table_Spec.enso | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/Visualization_Tests/src/Table_Spec.enso b/test/Visualization_Tests/src/Table_Spec.enso index 12140b6b93b2..8ef061632461 100644 --- a/test/Visualization_Tests/src/Table_Spec.enso +++ b/test/Visualization_Tests/src/Table_Spec.enso @@ -145,6 +145,18 @@ add_specs suite_builder = json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] vis . should_equal json + group_builder.specify "should indicate number of leading/trailing whitespace with sample if more than 10000 rows" <| + space_data = 0.up_to 11000 . map i-> case i % 4 of + 0 -> 'hello' + 1 -> ' leading space' + 2 -> 'trailing space ' + 3 -> ' leading and trailing space ' + space_table = Table.new [["A", space_data]] + vis = Visualization.prepare_visualization space_table 1000 + value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] + json = make_json header=["A"] data=[space_data.take 1000] all_rows=11000 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] number_of_whitespace_sampled=[8236] + vis . should_equal json + main filter=Nothing = suite = Test.build suite_builder-> add_specs suite_builder From 55b9a8e753a7666c4a64dae5b489926e95f8bab6 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 12:31:10 +0000 Subject: [PATCH 11/15] Remove unnecessary new methods. --- .../main/java/org/enso/base/Text_Utils.java | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 945c431b3c7f..ceb031c9cadd 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -35,40 +35,6 @@ public static String substring(String string, int from, int to) { return string.substring(from, to); } - /** - * Gets the first Grapheme cluster in the string. - * - * @param string the string to substring - * @return the first grapheme cluster in the string or null if the string is empty. - */ - public static String first_cluster(String string) { - BreakIterator breakIterator = BreakIterator.getCharacterInstance(); - breakIterator.setText(string); - int start = breakIterator.first(); - if (start == -1) { - return null; - } - int end = breakIterator.next(); - return string.substring(start, end); - } - - /** - * Gets the last Grapheme cluster in the string. - * - * @param string the string to substring - * @return the last grapheme cluster in the string or null if the string is empty. - */ - public static String last_cluster(String string) { - BreakIterator breakIterator = BreakIterator.getCharacterInstance(); - breakIterator.setText(string); - int start = breakIterator.last(); - if (start == -1) { - return null; - } - int end = breakIterator.previous(); - return string.substring(end, start); - } - /** * Checks if the string has leading or trailing whitespace. * @@ -80,12 +46,12 @@ public static boolean has_leading_trailing_whitespace(String s) { return false; } - var leading = Text_Utils.first_cluster(s); + var leading = Text_Utils.take_prefix(s, 1); if (leading != null && is_all_whitespace(leading)) { return true; } - var trailing = Text_Utils.last_cluster(s); + var trailing = Text_Utils.take_suffix(s, 1); if (trailing != null && is_all_whitespace(trailing)) { return true; } From 7059f3954abff981bbc2f29f633f7302b3ca45cd Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 12:34:18 +0000 Subject: [PATCH 12/15] Remove unnecessary new methods. --- .../enso/table/data/column/storage/StringStorage.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index f4c9f8f1e6c2..2581fd839998 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -21,7 +21,7 @@ /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { private final TextType type; - private long _countLeadingTrailingWhitespace = -1; + private long countUntrimmed = -1; /** * @param data the underlying data @@ -57,13 +57,13 @@ public TextType getType() { * @return the number of cells with whitespace */ public Long countUntrimmed() { - if (_countLeadingTrailingWhitespace != -1) { - return _countLeadingTrailingWhitespace; + if (countUntrimmed != -1) { + return countUntrimmed; } - _countLeadingTrailingWhitespace = + countUntrimmed = CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); - return _countLeadingTrailingWhitespace; + return countUntrimmed; } private static MapOperationStorage> buildOps() { From 0d059be391233da26cf37919027d0e7e720c6270 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 14:05:23 +0000 Subject: [PATCH 13/15] Use Future for memorized value. PR comments. --- .../Standard/Table/0.0.0-dev/src/Column.enso | 7 ++- .../0.0.0-dev/src/Table/Visualization.enso | 25 ++++++----- .../data/column/operation/CountUntrimmed.java | 45 +++++++++++-------- .../data/column/storage/StringStorage.java | 30 ++++++++----- test/Visualization_Tests/src/Table_Spec.enso | 10 ++--- 5 files changed, 70 insertions(+), 47 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso index aa74ed7b7c8f..9c301e2a2c65 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso @@ -2218,10 +2218,15 @@ type Column Counts the number of text values with leading or trailing whitespace. Used for data quality indicator in Table Viz. count_untrimmed : Integer -> Integer | Nothing - count_untrimmed self sample_size:Integer=CountUntrimmed.DEFAULT_SAMPLE_SIZE = + count_untrimmed self sample_size:Integer=Column.default_sample_size = if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else CountUntrimmed.apply self.java_column sample_size + ## PRIVATE + Default size for sampling data quality indicators. + default_sample_size -> Integer = + CountUntrimmed.DEFAULT_SAMPLE_SIZE + ## GROUP Standard.Base.Metadata ICON metadata Returns the number of non-null items in this column. diff --git a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso index 366632a0d262..1c36bc89a3e1 100644 --- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso +++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso @@ -181,20 +181,21 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" = make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table = act_max = if max_rows < all_rows_count then max_rows else all_rows_count - get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) [] - columns = dataframe.columns - header = ["header", columns.map .name] - value_type = ["value_type", columns.map .value_type] - data = ["data", columns.map get_vector] - all_rows = ["all_rows_count", all_rows_count] - has_index_col = ["has_index_col", include_index_col] - links = ["get_child_node_action", "get_row"] + get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) [] + columns = dataframe.columns + header = ["header", columns.map .name] + value_type = ["value_type", columns.map .value_type] + data = ["data", columns.map get_vector] + all_rows = ["all_rows_count", all_rows_count] + has_index_col = ["has_index_col", include_index_col] + links = ["get_child_node_action", "get_row"] data_quality_metrics = if is_db_table then [] else number_nothing = JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", columns.map .count_nothing]] - number_untrimmed = JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map c-> if c.length > 10000 then Nothing else c.count_untrimmed]] - number_untrimmed_sampled = JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map c-> if c.length <= 10000 then Nothing else c.count_untrimmed]] - [number_nothing, number_untrimmed, number_untrimmed_sampled] - pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] + number_untrimmed = case all_rows_count > Column.default_sample_size of + False -> JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]] + True -> JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map .count_untrimmed]] + [number_nothing, number_untrimmed] + pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]] JS_Object.from_pairs pairs ## PRIVATE diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index 62ac2e170b27..570a64b302de 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -16,44 +16,51 @@ public class CountUntrimmed { public static final long DEFAULT_SAMPLE_SIZE = 10000; /** Counts the number of cells in the columns with leading or trailing whitespace. */ - public static long apply(Column column, long sampleSize) { + public static Long apply(Column column, long sampleSize) throws InterruptedException { ColumnStorage storage = column.getStorage(); return applyToStorage(storage, sampleSize); } /** Counts the number of cells in the given storage with leading or trailing whitespace. */ - public static long applyToStorage(ColumnStorage storage, long sampleSize) { + public static Long applyToStorage(ColumnStorage storage, long sampleSize) + throws InterruptedException { return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) ? stringStorage.countUntrimmed() - : compute(storage, sampleSize); + : (Long) compute(storage, sampleSize, Context.getCurrent()); } /** Internal method performing the calculation on a storage. */ - public static long compute(ColumnStorage storage, long sampleSize) { + public static long compute(ColumnStorage storage, long sampleSize, Context context) { long size = storage.getSize(); - boolean sample = sampleSize < size; - Random rng = sample ? new Random(RANDOM_SEED) : null; - double sampleRate = sample ? (double) sampleSize / size : 1.0; - Context context = Context.getCurrent(); long count = 0; - for (long i = 0; i < storage.getSize(); i++) { - if (sample && rng.nextDouble() > sampleRate) { - continue; - } + if (sampleSize < size) { + var rng = new Random(RANDOM_SEED); + for (int i = 0; i < sampleSize; i++) { + long idx = rng.nextInt((int) size); + var val = storage.getItemAsObject(idx); + if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { + count++; + } - var val = storage.getItemAsObject(i); - if (val instanceof String str) { - if (Text_Utils.has_leading_trailing_whitespace(str)) { + if (context != null) { + context.safepoint(); + } + } + count = Math.min(size, (long) Math.ceil((double) count / sampleSize * size)); + } else { + for (long i = 0; i < storage.getSize(); i++) { + var val = storage.getItemAsObject(i); + if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { count++; } + + if (context != null) { + context.safepoint(); + } } - context.safepoint(); } - if (sample) { - count = Math.min(size, (long) Math.ceil((double) count / sampleRate)); - } return count; } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 2581fd839998..9ea92c0f6973 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -2,6 +2,8 @@ import java.util.BitSet; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; import org.enso.base.CompareException; import org.enso.base.Text_Utils; import org.enso.table.data.column.operation.CountUntrimmed; @@ -21,7 +23,7 @@ /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { private final TextType type; - private long countUntrimmed = -1; + private Future countUntrimmed; /** * @param data the underlying data @@ -32,7 +34,9 @@ public StringStorage(String[] data, int size, TextType type) { super(data, size, buildOps()); this.type = type; - CompletableFuture.runAsync(this::countUntrimmed); + countUntrimmed = + CompletableFuture.supplyAsync( + () -> CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, null)); } @Override @@ -51,19 +55,25 @@ public TextType getType() { } /** - * Counts the number of cells in the columns with whitespace. Memoized into the storage for - * performance. + * Counts the number of cells in the columns with whitespace. If the calculation fails then it + * returns null. * * @return the number of cells with whitespace */ - public Long countUntrimmed() { - if (countUntrimmed != -1) { - return countUntrimmed; + public Long countUntrimmed() throws InterruptedException { + if (countUntrimmed.isCancelled()) { + // Need to recompute the value, as was cancelled. + countUntrimmed = CompletableFuture.supplyAsync( + () -> + CountUntrimmed.compute( + this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); } - countUntrimmed = - CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE); - return countUntrimmed; + try { + return countUntrimmed.get(); + } catch (ExecutionException e) { + return null; + } } private static MapOperationStorage> buildOps() { diff --git a/test/Visualization_Tests/src/Table_Spec.enso b/test/Visualization_Tests/src/Table_Spec.enso index 8ef061632461..3d5de6016179 100644 --- a/test/Visualization_Tests/src/Table_Spec.enso +++ b/test/Visualization_Tests/src/Table_Spec.enso @@ -86,13 +86,13 @@ add_specs suite_builder = group_builder.specify "should visualize dataframe tables" <| vis = Visualization.prepare_visualization data.t2 1 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[[1], [4], [7]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0,0,0] number_of_whitespace=[Nothing, Nothing, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] + json = make_json header=["A", "B", "C"] data=[[1], [4], [7]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0,0,0] number_of_whitespace=[Nothing, Nothing, Nothing] vis . should_equal json group_builder.specify "should visualize dataframe columns" <| vis = Visualization.prepare_visualization (data.t2.at "A") 2 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A"] data=[[1, 2]] all_rows=3 value_type=[value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] number_of_whitespace_sampled=[Nothing] + json = make_json header=["A"] data=[[1, 2]] all_rows=3 value_type=[value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] vis . should_equal json group_builder.specify "should handle Vectors" <| @@ -135,14 +135,14 @@ add_specs suite_builder = group_builder.specify "should indicate number of Nothing/Nulls" <| vis = Visualization.prepare_visualization data.t3_with_nulls 3 value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[[1,Nothing,3],[4,Nothing,Nothing],[7,Nothing,Nothing]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[1, 2, 2] number_of_whitespace=[Nothing, Nothing, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] + json = make_json header=["A", "B", "C"] data=[[1,Nothing,3],[4,Nothing,Nothing],[7,Nothing,Nothing]] all_rows=3 value_type=[value_type_int, value_type_int, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[1, 2, 2] number_of_whitespace=[Nothing, Nothing, Nothing] vis . should_equal json group_builder.specify "should indicate number of leading/trailing whitespace" <| vis = Visualization.prepare_visualization data.t4_with_space 3 value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] value_type_int = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Integer"], ["display_text", "Integer (64 bits)"], ["bits", 64]] - json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] number_of_whitespace_sampled=[Nothing, Nothing, Nothing] + json = make_json header=["A", "B", "C"] data=[['hello', ' leading space', 'trailing space '],['a', 'b', 'c'],[7, 8, 9]] all_rows=3 value_type=[value_type_char, value_type_char, value_type_int] has_index_col=True get_child_node="get_row" number_of_nothing=[0, 0, 0] number_of_whitespace=[2, 0, Nothing] vis . should_equal json group_builder.specify "should indicate number of leading/trailing whitespace with sample if more than 10000 rows" <| @@ -154,7 +154,7 @@ add_specs suite_builder = space_table = Table.new [["A", space_data]] vis = Visualization.prepare_visualization space_table 1000 value_type_char = JS_Object.from_pairs [["type", "Value_Type"], ["constructor", "Char"], ["display_text", "Char (variable length, max_size=unlimited)"], ["size", Nothing], ["variable_length", True]] - json = make_json header=["A"] data=[space_data.take 1000] all_rows=11000 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace=[Nothing] number_of_whitespace_sampled=[8236] + json = make_json header=["A"] data=[space_data.take 1000] all_rows=11000 value_type=[value_type_char] has_index_col=True get_child_node="get_row" number_of_nothing=[0] number_of_whitespace_sampled=[8266] vis . should_equal json main filter=Nothing = From 5a6b6412c4a6b9c41c45a74645feb29d5639d643 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 14:14:17 +0000 Subject: [PATCH 14/15] Java format. --- .../enso/table/data/column/storage/StringStorage.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 9ea92c0f6973..3c11e807ab99 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -63,10 +63,11 @@ public TextType getType() { public Long countUntrimmed() throws InterruptedException { if (countUntrimmed.isCancelled()) { // Need to recompute the value, as was cancelled. - countUntrimmed = CompletableFuture.supplyAsync( - () -> - CountUntrimmed.compute( - this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); + countUntrimmed = + CompletableFuture.supplyAsync( + () -> + CountUntrimmed.compute( + this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); } try { From e8af625e1922ca443d14546f66c358338c7ef014 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Fri, 29 Nov 2024 15:38:49 +0000 Subject: [PATCH 15/15] PR comments. --- .../main/java/org/enso/base/Text_Utils.java | 2 +- .../data/column/operation/CountUntrimmed.java | 4 ++-- .../data/column/storage/StringStorage.java | 23 +++++++++++-------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index ceb031c9cadd..1c6da9b4f901 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -42,7 +42,7 @@ public static String substring(String string, int from, int to) { * @return whether the string has leading or trailing whitespace */ public static boolean has_leading_trailing_whitespace(String s) { - if (s == null && s.isEmpty()) { + if (s == null || s.isEmpty()) { return false; } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java index 570a64b302de..9e0f5923a015 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java @@ -25,7 +25,7 @@ public static Long apply(Column column, long sampleSize) throws InterruptedExcep public static Long applyToStorage(ColumnStorage storage, long sampleSize) throws InterruptedException { return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) - ? stringStorage.countUntrimmed() + ? stringStorage.cachedUntrimmedCount() : (Long) compute(storage, sampleSize, Context.getCurrent()); } @@ -37,7 +37,7 @@ public static long compute(ColumnStorage storage, long sampleSize, Context conte if (sampleSize < size) { var rng = new Random(RANDOM_SEED); for (int i = 0; i < sampleSize; i++) { - long idx = rng.nextInt((int) size); + long idx = rng.nextInt(Math.toIntExact(size)); var val = storage.getItemAsObject(idx); if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) { count++; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 3c11e807ab99..b7268a8f431f 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -19,11 +19,14 @@ import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.TextType; import org.graalvm.polyglot.Context; +import org.slf4j.Logger; /** A column storing strings. */ public final class StringStorage extends SpecializedStorage { + private static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(StringStorage.class); + private final TextType type; - private Future countUntrimmed; + private Future untrimmedCount; /** * @param data the underlying data @@ -34,7 +37,7 @@ public StringStorage(String[] data, int size, TextType type) { super(data, size, buildOps()); this.type = type; - countUntrimmed = + untrimmedCount = CompletableFuture.supplyAsync( () -> CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, null)); } @@ -60,19 +63,19 @@ public TextType getType() { * * @return the number of cells with whitespace */ - public Long countUntrimmed() throws InterruptedException { - if (countUntrimmed.isCancelled()) { + public Long cachedUntrimmedCount() throws InterruptedException { + if (untrimmedCount.isCancelled()) { // Need to recompute the value, as was cancelled. - countUntrimmed = - CompletableFuture.supplyAsync( - () -> - CountUntrimmed.compute( - this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); + untrimmedCount = + CompletableFuture.completedFuture( + CountUntrimmed.compute( + this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent())); } try { - return countUntrimmed.get(); + return untrimmedCount.get(); } catch (ExecutionException e) { + LOGGER.error("Failed to compute untrimmed count", e); return null; } }