Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for Data Quality perf and issue other tweaks. #11711

Merged
merged 15 commits into from
Nov 29, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ type DB_Table
result = self.updated_columns (new_columns.map _.as_internal)
Warning.attach (Deprecated.Warning "Standard.Database.DB_Table.DB_Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result

## ALIAS select_blank_fields, select_missing_columns, select_na
## ALIAS select_blank_fields, select_missing_columns, select_na, filter_blank_columns
GROUP Standard.Base.Selections
ICON select_column

Expand Down Expand Up @@ -2559,7 +2559,7 @@ type DB_Table
_ = [columns, shrink_types, error_on_missing_columns, on_problems]
Error.throw (Unsupported_Database_Operation.Error "auto_cast")

## ALIAS drop_missing_rows, dropna
## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows
GROUP Standard.Base.Selections
ICON preparation
Remove rows which are all blank or containing blank values.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,4 +327,4 @@ type Snowflake_Connection
Converts this value to a JSON serializable object.
to_js_object : JS_Object
to_js_object self =
JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector]]
JS_Object.from_pairs [["type", "Snowflake_Connection"], ["links", self.tables.at "Name" . to_vector], ["get_child_node_action", "query"]]
14 changes: 14 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso
polyglot java import org.enso.base.Time_Utils
polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
polyglot java import org.enso.table.data.column.operation.CountNothing
polyglot java import org.enso.table.data.column.operation.CountUntrimmed
polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation
Expand Down Expand Up @@ -2213,6 +2214,19 @@ type Column
count_nothing : Integer
count_nothing self = CountNothing.apply self.java_column

## PRIVATE
Counts the number of text values with leading or trailing whitespace.
Used for data quality indicator in Table Viz.
count_untrimmed : Integer -> Integer | Nothing
count_untrimmed self sample_size:Integer=Column.default_sample_size =
if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else
CountUntrimmed.apply self.java_column sample_size

## PRIVATE
Default size for sampling data quality indicators.
default_sample_size -> Integer =
CountUntrimmed.DEFAULT_SAMPLE_SIZE

## GROUP Standard.Base.Metadata
ICON metadata
Returns the number of non-null items in this column.
Expand Down
4 changes: 2 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,7 @@ type Table
result = Table.new new_columns
Warning.attach (Deprecated.Warning "Standard.Table.Table.Table" "remove_columns_by_type" "Deprecated: use `remove_columns` with a `By_Type` instead.") result

## ALIAS select_blank_fields, select_missing_columns, select_na
## ALIAS drop_missing_rows, dropna, remove_blank_rows, remove_empty_rows, remove_missing_rows, filter_empty_rows, drop_empty_rows
GROUP Standard.Base.Selections
ICON select_column

Expand Down Expand Up @@ -730,7 +730,7 @@ type Table
new_columns = self.columns_helper.select_blank_columns_helper when treat_nans_as_blank
Table.new new_columns

## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na
## ALIAS drop_missing_columns, drop_na, select_blank_columns, select_blank_fields, select_missing_columns, select_na, filter_blank_columns
GROUP Standard.Base.Selections
ICON select_column

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
_ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column"
_ : Column -> prepare_visualization x.to_table max_rows
_ : Table ->
dataframe = x.take max_rows
all_rows_count = x.row_count
make_json_for_table dataframe all_rows_count True False
make_json_for_table x max_rows all_rows_count True False
_ : DB_Column -> prepare_visualization x.to_table max_rows
_ : DB_Table ->
dataframe = x.read (..First max_rows)
all_rows_count = x.row_count
make_json_for_table dataframe all_rows_count True True
make_json_for_table dataframe max_rows all_rows_count True True
_ : Function ->
pairs = [['_display_text_', '[Function '+x.to_text+']']]
value = JS_Object.from_pairs pairs
Expand All @@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
Column Limit
max_columns = 250

## PRIVATE
whitespace_count : Column -> Integer | Nothing
whitespace_count col =
find_whitespace col =
filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace))
filtered.length
if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing

## PRIVATE
Render Error to JSON
make_json_for_error : Any -> JS_Object
Expand Down Expand Up @@ -187,22 +178,24 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" =
to display.
- all_rows_count: the number of all rows in the underlying data, useful if
only a fragment is displayed.
make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object
make_json_for_table dataframe all_rows_count include_index_col is_db_table =
get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) []
columns = dataframe.columns
header = ["header", columns.map .name]
value_type = ["value_type", columns.map .value_type]
data = ["data", columns.map get_vector]
all_rows = ["all_rows_count", all_rows_count]
has_index_col = ["has_index_col", include_index_col]
links = ["get_child_node_action", "get_row"]
number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing
number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c
nothing_p = JS_Object.from_pairs [["name", "Number of nothings"], ["percentage_value", number_of_nothing]]
whitespace_p = JS_Object.from_pairs [["name", "Number of untrimmed whitespace"], ["percentage_value",number_of_whitespace]]
data_quality_metrics = [nothing_p, whitespace_p]
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]]
make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object
make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table =
act_max = if max_rows < all_rows_count then max_rows else all_rows_count
get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) []
columns = dataframe.columns
header = ["header", columns.map .name]
value_type = ["value_type", columns.map .value_type]
data = ["data", columns.map get_vector]
all_rows = ["all_rows_count", all_rows_count]
has_index_col = ["has_index_col", include_index_col]
links = ["get_child_node_action", "get_row"]
data_quality_metrics = if is_db_table then [] else
number_nothing = JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", columns.map .count_nothing]]
number_untrimmed = case all_rows_count > Column.default_sample_size of
False -> JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]]
True -> JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map .count_untrimmed]]
[number_nothing, number_untrimmed]
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"]]
JS_Object.from_pairs pairs

## PRIVATE
Expand Down
24 changes: 24 additions & 0 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,30 @@ public static String substring(String string, int from, int to) {
return string.substring(from, to);
}

/**
* Checks if the string has leading or trailing whitespace.
*
* @param s the string to check
* @return whether the string has leading or trailing whitespace
*/
public static boolean has_leading_trailing_whitespace(String s) {
if (s == null || s.isEmpty()) {
return false;
}

var leading = Text_Utils.take_prefix(s, 1);
if (leading != null && is_all_whitespace(leading)) {
return true;
}

var trailing = Text_Utils.take_suffix(s, 1);
if (trailing != null && is_all_whitespace(trailing)) {
return true;
}

return false;
}

/**
* Returns a new string containing characters starting at the given UTF-16 index.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.enso.table.data.column.operation;

import java.util.Random;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.ColumnStorage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.table.Column;
import org.graalvm.polyglot.Context;

public class CountUntrimmed {
// Default seed for random number generation (no specific reason for this value, just stability on
// result).
private static final long RANDOM_SEED = 677280131;

// Default sample size for counting untrimmed cells.
public static final long DEFAULT_SAMPLE_SIZE = 10000;
jdunkerley marked this conversation as resolved.
Show resolved Hide resolved

/** Counts the number of cells in the columns with leading or trailing whitespace. */
public static Long apply(Column column, long sampleSize) throws InterruptedException {
ColumnStorage storage = column.getStorage();
return applyToStorage(storage, sampleSize);
}

/** Counts the number of cells in the given storage with leading or trailing whitespace. */
public static Long applyToStorage(ColumnStorage storage, long sampleSize)
throws InterruptedException {
return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage)
? stringStorage.cachedUntrimmedCount()
: (Long) compute(storage, sampleSize, Context.getCurrent());
}

/** Internal method performing the calculation on a storage. */
public static long compute(ColumnStorage storage, long sampleSize, Context context) {
long size = storage.getSize();

long count = 0;
if (sampleSize < size) {
var rng = new Random(RANDOM_SEED);
for (int i = 0; i < sampleSize; i++) {
long idx = rng.nextInt(Math.toIntExact(size));
var val = storage.getItemAsObject(idx);
if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) {
count++;
}

if (context != null) {
context.safepoint();
}
}
count = Math.min(size, (long) Math.ceil((double) count / sampleSize * size));
} else {
for (long i = 0; i < storage.getSize(); i++) {
var val = storage.getItemAsObject(i);
if (val instanceof String str && Text_Utils.has_leading_trailing_whitespace(str)) {
count++;
}

if (context != null) {
context.safepoint();
}
}
}

return count;
}
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.enso.base.CompareException;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.operation.CountUntrimmed;
import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.operation.map.MapOperationStorage;
Expand All @@ -15,11 +19,14 @@
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.data.column.storage.type.TextType;
import org.graalvm.polyglot.Context;
import org.slf4j.Logger;

/** A column storing strings. */
public final class StringStorage extends SpecializedStorage<String> {
private static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(StringStorage.class);

private final TextType type;
private Future<Long> untrimmedCount;

/**
* @param data the underlying data
Expand All @@ -29,6 +36,10 @@ public final class StringStorage extends SpecializedStorage<String> {
public StringStorage(String[] data, int size, TextType type) {
super(data, size, buildOps());
this.type = type;

untrimmedCount =
CompletableFuture.supplyAsync(
() -> CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, null));
}

@Override
Expand All @@ -46,6 +57,29 @@ public TextType getType() {
return type;
}

/**
* Counts the number of cells in the columns with whitespace. If the calculation fails then it
* returns null.
*
* @return the number of cells with whitespace
*/
public Long cachedUntrimmedCount() throws InterruptedException {
if (untrimmedCount.isCancelled()) {
// Need to recompute the value, as was cancelled.
untrimmedCount =
CompletableFuture.completedFuture(
CountUntrimmed.compute(
this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, Context.getCurrent()));
}

try {
return untrimmedCount.get();
} catch (ExecutionException e) {
LOGGER.error("Failed to compute untrimmed count", e);
return null;
jdunkerley marked this conversation as resolved.
Show resolved Hide resolved
}
}

private static MapOperationStorage<String, SpecializedStorage<String>> buildOps() {
MapOperationStorage<String, SpecializedStorage<String>> t = ObjectStorage.buildObjectOps();
t.add(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,18 @@ private static MapOperationStorage<Double, DoubleStorage> buildOps() {
@Override
public Storage<Double> slice(int offset, int limit) {
int newSize = Math.min(size - offset, limit);
long[] newData = new long[newSize];
System.arraycopy(data, offset, newData, 0, newSize);
long[] newData;

// Special case if slice is after the actual data
if (offset >= data.length) {
newData = new long[0];
} else {
// Can only copy as much as there is data
int newDataSize = Math.min(data.length - offset, newSize);
newData = new long[newDataSize];
System.arraycopy(data, offset, newData, 0, newDataSize);
}

BitSet newMask = isNothing.get(offset, offset + limit);
return new DoubleStorage(newData, newSize, newMask);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,18 @@ public long[] getRawData() {
@Override
public LongStorage slice(int offset, int limit) {
int newSize = Math.min(size - offset, limit);
long[] newData = new long[newSize];
System.arraycopy(data, offset, newData, 0, newSize);
long[] newData;

// Special case if slice is after the actual data
if (offset >= data.length) {
newData = new long[0];
} else {
// Can only copy as much as there is data
int newDataSize = Math.min(data.length - offset, newSize);
newData = new long[newDataSize];
System.arraycopy(data, offset, newData, 0, newDataSize);
}

BitSet newMask = isNothing.get(offset, offset + limit);
return new LongStorage(newData, newSize, newMask, type);
}
Expand Down
Loading
Loading