Custom parsers / null literals should use pre-legalized names (issue #190) #223

Merged: 1 commit, Nov 13, 2024
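
A minimal sketch of how the fixed behavior is meant to be used. The column names, legalizer, null literal, and class name below are illustrative, not taken from this PR; the point is that putParserForName and putNullValueLiteralsForName are keyed by the header as it appears in the input, i.e. before the legalizer runs.

```java
import io.deephaven.csv.CsvSpecs;
import io.deephaven.csv.parsers.Parsers;
import java.util.Collections;

public class PreLegalizedNamesExample {
    public static CsvSpecs specs() {
        // Input headers are "Col 1" and "Col 2"; the legalizer replaces spaces with
        // underscores, so the resulting columns are named "Col_1" and "Col_2".
        return CsvSpecs.builder()
                .headerLegalizer(names -> {
                    for (int i = 0; i < names.length; ++i) {
                        names[i] = names[i].replace(' ', '_');
                    }
                    return names;
                })
                // With this change, these entries are keyed by the pre-legalized
                // names ("Col 1", "Col 2"), not by "Col_1"/"Col_2".
                .putParserForName("Col 1", Parsers.LONG)
                .putNullValueLiteralsForName("Col 2", Collections.singletonList("N/A"))
                .build();
    }
}
```
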
9 changes: 6 additions & 3 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -105,9 +105,12 @@ public interface Builder {
Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser);

/**
* An optional legalizer for column headers. The legalizer is a function that takes column names (as a
* {@code String[]}) names and returns legal column names (as a {@code String[]}). The legalizer function is
* permitted to reuse its input data structure. Defaults to {@code Function#identity()}.
* An optional legalizer for column headers. The legalizer is a function that takes the column names from the
* input (as a {@code String[]}) and returns "legal" column names (as a {@code String[]}). What constitutes
* "legal" is entirely up to the caller. For example, some applications cannot tolerate punctuation characters
* in column names and need to remove them. The CSV library itself has no limitations with regard to column
* names. The legalizer function is permitted to return the input array (perhaps with some elements modified) as
* the return value. Defaults to {@code Function#identity()}.
*/
Builder headerLegalizer(Function<String[], String[]> headerLegalizer);

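To make the updated contract concrete, here is a sketch of a legalizer of the kind the Javadoc describes: it maps punctuation to underscores and, as the Javadoc permits, modifies and returns the array it was given. The class name and the replacement rule are assumptions for illustration only; it would be installed via Builder.headerLegalizer.

```java
import java.util.function.Function;

public class PunctuationStrippingLegalizer {
    // The library itself accepts any column name; a caller whose downstream system
    // cannot tolerate punctuation can map it away here. Per the Javadoc above, the
    // function may modify and return its input array rather than allocating a new one.
    public static final Function<String[], String[]> LEGALIZER = names -> {
        for (int i = 0; i < names.length; ++i) {
            names[i] = names[i].replaceAll("[^A-Za-z0-9_]", "_");
        }
        return names;
    };
}
```
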
16 changes: 9 additions & 7 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -95,9 +95,7 @@ private static Result delimitedReadLogic(
headersTemp2 = headersTemp;
}
final int numOutputCols = headersTemp2.length;
final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2);

return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersTemp2, sinkFactory);
}

private static Result fixedReadLogic(
@@ -113,14 +111,17 @@ private static Result fixedReadLogic(

private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
int numInputCols, int numOutputCols,
String[] headersToUse, final SinkFactory sinkFactory)
String[] headersBeforeLegalization, final SinkFactory sinkFactory)
throws CsvReaderException {

final String[][] nullValueLiteralsToUse = new String[numOutputCols][];
for (int ii = 0; ii < numOutputCols; ++ii) {
nullValueLiteralsToUse[ii] =
calcNullValueLiteralsToUse(specs, headersToUse[ii], ii).toArray(new String[0]);
calcNullValueLiteralsToUse(specs, headersBeforeLegalization[ii], ii).toArray(new String[0]);
}

final String[] headersToUse = canonicalizeHeaders(specs, headersBeforeLegalization);

// Create a DenseStorageWriter for each column. The arrays are sized to "numInputCols" but only populated up to
// "numOutputCols". The remaining (numInputCols - numOutputCols) are set to null. The code in
// parseInputToDenseStorge knows that having a null DenseStorageWriter means that the column is all-empty and
@@ -156,7 +157,7 @@ private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber,
final ArrayList<Future<Object>> sinkFutures = new ArrayList<>();
try {
for (int ii = 0; ii < numOutputCols; ++ii) {
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii);
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersBeforeLegalization[ii], ii);

final int iiCopy = ii;
final Future<Object> fcb = ecs.submit(
@@ -230,7 +231,8 @@ private static List<String> calcNullValueLiteralsToUse(final CsvSpecs specs, fin
}

private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException {
final String[] legalized = specs.headerLegalizer().apply(headers);
// legalizer is allowed to mutate the input in-place, so we clone it before passing it.
final String[] legalized = specs.headerLegalizer().apply(headers.clone());
final Set<String> unique = new HashSet<>();
final List<String> repeats = new ArrayList<>();
final List<String> invalidNames = new ArrayList<>();
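
The clone above matters because the legalizer may mutate its argument in place, while commonReadLogic still needs the pre-legalized names for the later by-name parser lookup. A standalone sketch of that hazard (not library code; names are illustrative):

```java
import java.util.function.Function;

public class CloneBeforeLegalizeSketch {
    public static void main(String[] args) {
        String[] headers = {"Col1", "Col2"};

        // A legalizer that rewrites the array it receives, as the API allows.
        Function<String[], String[]> legalizer = in -> {
            for (int i = 0; i < in.length; ++i) {
                in[i] = "xyz" + in[i];
            }
            return in;
        };

        // Passing a clone keeps the original names available; passing "headers"
        // directly would leave it as {"xyzCol1", "xyzCol2"} and break any later
        // lookup keyed by the original header names.
        String[] legalized = legalizer.apply(headers.clone());

        System.out.println(headers[0]);   // still "Col1"
        System.out.println(legalized[0]); // "xyzCol1"
    }
}
```
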
65 changes: 65 additions & 0 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
@@ -41,6 +41,7 @@
import java.util.Collections;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

@@ -206,6 +207,70 @@ public void bug162() throws CsvReaderException {
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
}

/**
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/190">Deephaven CSV Issue #190</a>. That
* issue report misidentifies the root cause as having to do with reserved keywords. This is not correct because the
* library doesn't care whether a column header is a reserved keyword. The actual root cause is an interaction
* between the user-supplied "legalizer" and user-specified parsers or null literals that are specified by column
* names. Specifically the question is whether column names mentioned in {@link CsvSpecs.Builder#putParserForName}
* and {@link CsvSpecs.Builder#putNullValueLiteralsForName} should refer to the name that the column had *before* it
* was transformed by the legalizer, or *after*. The expected behavior is "before", but prior to this fix the
* library was doing the "after" behavior. This is a parameterized test that invokes the behavior for {delimited,
* fixed columns} x {without and with a legalizer}.
*/
@ParameterizedTest
@CsvSource({"false,false", "false,true", "true,false", "true,true"})
public void bug190(boolean hasFixedWidthColumns, boolean invokeLegalizer) throws CsvReaderException {
// +++ is the null value literal for Col1
// *** is the null value literal for Col2
// ??? is the null value literal for Col3

final String input;

if (!hasFixedWidthColumns) {
input = "Col1,Col2,Col3\n" +
"+++,20,30\n" +
"100,***,300\n" +
"1000,2000,???\n";
} else {
input = "Col1 Col2 Col3\n" +
"+++ 20 30\n" +
"100 *** 300\n" +
"1000 2000 ???\n";
}

final String[] expectedColumnNames = !invokeLegalizer ? new String[] {"Col1", "Col2", "Col3"}
: new String[] {"xyzCol1", "xyzCol2", "xyzCol3"};

final ColumnSet expected =
ColumnSet.of(
Column.ofValues(expectedColumnNames[0], Sentinels.NULL_LONG, (long) 100, (long) 1000),
Column.ofValues(expectedColumnNames[1], (double) 20, Sentinels.NULL_DOUBLE, (double) 2000),
Column.ofRefs(expectedColumnNames[2], "30", "300", null));

Function<String[], String[]> legalizer = in -> {
for (int i = 0; i != in.length; ++i) {
// e.g. transform Col1 to xyzCol1
in[i] = "xyz" + in[i];
}
return in;
};

CsvSpecs.Builder specsBase =
defaultCsvBuilder().hasFixedWidthColumns(hasFixedWidthColumns).parsers(Parsers.DEFAULT)
.putParserForName("Col1", Parsers.LONG).putParserForName("Col2", Parsers.DOUBLE)
.putParserForName("Col3", Parsers.STRING)
.putNullValueLiteralsForName("Col1", Collections.singletonList("+++"))
.putNullValueLiteralsForName("Col2", Collections.singletonList("***"))
.putNullValueLiteralsForName("Col3", Collections.singletonList("???"));

if (invokeLegalizer) {
specsBase = specsBase.headerLegalizer(legalizer);
}

invokeTest(specsBase.build(), input, expected);
}

@Test
public void validates() {
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "