From 8bd5ae16fe584637138dabfbc9f3461150b23e6f Mon Sep 17 00:00:00 2001
From: Corey Kosak
Date: Sun, 3 Nov 2024 20:03:56 -0500
Subject: [PATCH] Add fixed-width column support

---
 src/main/java/io/deephaven/csv/CsvSpecs.java  |  53 ++++
 .../io/deephaven/csv/reading/CsvReader.java   |  16 +-
 .../csv/reading/cells/FixedCellGrabber.java   | 114 ++++++++
 .../reading/headers/FixedHeaderFinder.java    | 184 ++++++++++++
 .../java/io/deephaven/csv/CsvReaderTest.java  | 257 +++++++++++++++++-
 5 files changed, 617 insertions(+), 7 deletions(-)
 create mode 100644 src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
 create mode 100644 src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java

diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java
index ffe76365..53d34224 100644
--- a/src/main/java/io/deephaven/csv/CsvSpecs.java
+++ b/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -117,6 +117,35 @@ public interface Builder {
          */
         Builder headerValidator(Predicate<String> headerValidator);
 
+        /**
+         * True if the input is organized into fixed width columns rather than delimited by a delimiter.
+         */
+        Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);
+
+        /**
+         * When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
+         * row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
+         * If the caller wants to specify them explicitly, they can use this method.
+         * @param fixedColumnWidths The caller-specified widths of the columns.
+         */
+        Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);
+
+        /**
+         * This setting controls what units fixed width columns are measured in.
+         * When true, fixed width columns are measured in Unicode code points.
+         * When false, fixed width columns are measured in UTF-16 units (aka Java chars).
+         * The difference arises when encountering characters outside the Unicode Basic Multilingual Plane.
+         * For example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes
+         * two Java chars to represent. Along these lines, the string 💔💔💔 would fit in a column of width 3
+         * when useUtf32CountingConvention is true, but would require a column width of at least 6 when it
+         * is false.
+         *
+         * The default setting of true is arguably more natural for users (the number of characters they see
+         * matches the visual width of the column). But some programs may want the value of false because they
+         * are counting Java chars.
+         */
+        Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);
+
         /**
          * Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
          * Typically used together with {@link Builder#numRows}. Defaults to 0.
@@ -340,6 +369,30 @@ public Predicate<String> headerValidator() {
         return c -> true;
     }
 
+    /**
+     * See {@link Builder#hasFixedWidthColumns}.
+     */
+    @Default
+    public boolean hasFixedWidthColumns() {
+        return false;
+    }
+
+    /**
+     * See {@link Builder#fixedColumnWidths}.
+     */
+    @Default
+    public List<Integer> fixedColumnWidths() {
+        return Collections.emptyList();
+    }
+
+    /**
+     * See {@link Builder#useUtf32CountingConvention}.
+     */
+    @Default
+    public boolean useUtf32CountingConvention() {
+        return true;
+    }
+
     /**
      * See {@link Builder#skipRows}.
      */
diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
index 9a9944aa..822ce4dd 100644
--- a/src/main/java/io/deephaven/csv/reading/CsvReader.java
+++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -7,7 +7,9 @@
 import io.deephaven.csv.parsers.Parser;
 import io.deephaven.csv.reading.cells.CellGrabber;
 import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
+import io.deephaven.csv.reading.cells.FixedCellGrabber;
 import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
+import io.deephaven.csv.reading.headers.FixedHeaderFinder;
 import io.deephaven.csv.sinks.Sink;
 import io.deephaven.csv.sinks.SinkFactory;
 import io.deephaven.csv.util.*;
@@ -63,7 +65,9 @@ private CsvReader() {}
      */
     public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
             throws CsvReaderException {
-        return delimitedReadLogic(specs, stream, sinkFactory);
+        return specs.hasFixedWidthColumns() ?
+                fixedReadLogic(specs, stream, sinkFactory) :
+                delimitedReadLogic(specs, stream, sinkFactory);
     }
 
     private static Result delimitedReadLogic(
@@ -97,6 +101,16 @@ private static Result delimitedReadLogic(
 
         return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
     }
+    private static Result fixedReadLogic(
+            final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException {
+        final CellGrabber lineGrabber = FixedCellGrabber.makeLineGrabber(stream);
+        final MutableObject<int[]> columnWidths = new MutableObject<>();
+        final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths);
+        final int numCols = headers.length;
+        final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(),
+                specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
+        return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory);
+    }
 
     private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber,
             byte[][] optionalFirstDataRow, int numInputCols, int numOutputCols,
diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
new file mode 100644
index 00000000..32225925
--- /dev/null
+++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -0,0 +1,114 @@
+package io.deephaven.csv.reading.cells;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+
+import java.io.InputStream;
+
+/**
+ * This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream,
+ * and then it breaks them into fixed-sized cells to return to the caller.
+ */
+public class FixedCellGrabber implements CellGrabber {
+    /**
+     * Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines.
+     * This is a somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber
+     * without rewriting it.
+     * @param stream The underlying stream.
+     * @return The "line grabber"
+     */
+    public static CellGrabber makeLineGrabber(InputStream stream) {
+        final byte illegalUtf8 = (byte)0xff;
+        return new DelimitedCellGrabber(stream, illegalUtf8, illegalUtf8, true, false);
+    }
+
+    private final CellGrabber lineGrabber;
+    private final int[] columnWidths;
+    private final boolean ignoreSurroundingSpaces;
+    private final boolean utf32CountingMode;
+    private final ByteSlice rowText;
+    private boolean needsUnderlyingRefresh;
+    private int colIndex;
+    private final MutableBoolean dummy1;
+    private final MutableInt dummy2;
+
+    /** Constructor. */
+    public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
+            boolean utf32CountingMode) {
+        this.lineGrabber = lineGrabber;
+        this.columnWidths = columnWidths;
+        this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
+        this.utf32CountingMode = utf32CountingMode;
+        this.rowText = new ByteSlice();
+        this.needsUnderlyingRefresh = true;
+        this.colIndex = 0;
+        this.dummy1 = new MutableBoolean();
+        this.dummy2 = new MutableInt();
+    }
+
+    @Override
+    public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput) throws CsvReaderException {
+        if (needsUnderlyingRefresh) {
+            // Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line.
+            lineGrabber.grabNext(rowText, dummy1, endOfInput);
+
+            if (endOfInput.booleanValue()) {
+                // Set dest to the empty string, and leave 'endOfInput' set to true.
+                dest.reset(rowText.data(), rowText.end(), rowText.end());
+                return;
+            }
+
+            needsUnderlyingRefresh = false;
+            colIndex = 0;
+        }
+
+        // There is data to return. Count off N characters. The final column gets all remaining characters.
+        final boolean lastCol = colIndex == columnWidths.length - 1;
+        final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex];
+        takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2);
+        ++colIndex;
+        needsUnderlyingRefresh = lastCol || dest.size() == 0;
+        lastInRow.setValue(needsUnderlyingRefresh);
+        endOfInput.setValue(false);
+
+        if (ignoreSurroundingSpaces) {
+            ReaderUtil.trimWhitespace(dest);
+        }
+    }
+
+    private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
+            boolean utf32CountingMode, MutableInt tempInt) {
+        final byte[] data = src.data();
+        final int cellBegin = src.begin();
+        int current = cellBegin;
+        while (numCharsToTake > 0) {
+            if (current == src.end()) {
+                break;
+            }
+            final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current,
+                    utf32CountingMode, tempInt);
+            if (numCharsToTake < tempInt.intValue()) {
+                // There is not enough space left in the field to store this character.
+                // This can happen if CsvSpecs is set for the UTF16 counting convention,
+                // there is one unit left in the field, and we encounter a character outside
+                // the Basic Multilingual Plane, which would require two units.
+                break;
+            }
+            numCharsToTake -= tempInt.intValue();
+            current += utf8Length;
+            if (current > src.end()) {
+                throw new RuntimeException("Data error: partial UTF-8 sequence found in input");
+            }
+        }
+        dest.reset(src.data(), cellBegin, current);
+        src.reset(src.data(), current, src.end());
+    }
+
+    @Override
+    public int physicalRowNum() {
+        return lineGrabber.physicalRowNum();
+    }
+}
diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
new file mode 100644
index 00000000..e9ad7b5e
--- /dev/null
+++ b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
@@ -0,0 +1,184 @@
+package io.deephaven.csv.reading.headers;
+
+import io.deephaven.csv.CsvSpecs;
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.reading.cells.CellGrabber;
+import io.deephaven.csv.tokenization.Tokenizer;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+import io.deephaven.csv.util.MutableObject;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class FixedHeaderFinder {
+    /**
+     * Determine which headers to use. The result comes from either the first row of the file or the user-specified
+     * overrides.
+     */
+    public static String[] determineHeadersToUse(
+            final CsvSpecs specs,
+            final CellGrabber lineGrabber,
+            MutableObject<int[]> columnWidthsResult)
+            throws CsvReaderException {
+        String[] headersToUse;
+        // Get user-specified column widths, if any. If not, this will be an array of length 0.
+        // UNITS: Unicode code points or UTF-16 units, as selected by specs.useUtf32CountingConvention()
+        int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray();
+        if (specs.hasHeaderRow()) {
+            long skipCount = specs.skipHeaderRows();
+            final ByteSlice headerRow = new ByteSlice();
+            MutableBoolean lastInRow = new MutableBoolean();
+            MutableBoolean endOfInput = new MutableBoolean();
+            while (true) {
+                lineGrabber.grabNext(headerRow, lastInRow, endOfInput);
+                if (endOfInput.booleanValue()) {
+                    throw new CsvReaderException(
+                            "Can't proceed because hasHeaderRow is set but input file is empty or shorter than skipHeaderRows");
+                }
+                if (skipCount == 0) {
+                    break;
+                }
+                --skipCount;
+            }
+            final byte paddingByte = (byte)specs.delimiter();
+            if (columnWidthsToUse.length == 0) {
+                // No caller-specified widths: infer them from the header row, in the same counting units.
+                columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
+            }
+
+            // extractHeaders converts the widths from counting units to byte widths before slicing the row.
+            headersToUse = extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
+        } else {
+            if (columnWidthsToUse.length == 0) {
+                throw new CsvReaderException("Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified");
+            }
+            headersToUse = ReaderUtil.makeSyntheticHeaders(columnWidthsToUse.length);
+        }
+
+        // Whether or not the input had headers, maybe override with client-specified headers.
+        if (specs.headers().size() != 0) {
+            if (specs.headers().size() != headersToUse.length) {
+                final String message = String.format("Library determined %d headers; caller overrode with %d headers",
+                        headersToUse.length, specs.headers().size());
+                throw new CsvReaderException(message);
+            }
+            headersToUse = specs.headers().toArray(new String[0]);
+        }
+
+        // Apply column specific overrides.
+        for (Map.Entry<Integer, String> entry : specs.headerForIndex().entrySet()) {
+            headersToUse[entry.getKey()] = entry.getValue();
+        }
+
+        // Hand the widths (still in counting units) back to the caller.
+        columnWidthsResult.setValue(columnWidthsToUse);
+        return headersToUse;
+    }
+
+    // Returns widths in counting units (Unicode code points or UTF-16 units, per useUtf32CountingConvention).
+    private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) {
+        // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line).
+        // If the start of the line is a delimiter, that is an error.
+        final List<Integer> columnWidths = new ArrayList<>();
+        final MutableInt charCountResult = new MutableInt();
+        boolean prevCharIsDelimiter = false;
+        final byte[] data = row.data();
+        int numChars = 0;
+        int currentIndex = row.begin();
+        while (true) {
+            if (currentIndex == row.end()) {
+                columnWidths.add(numChars);
+                return columnWidths.stream().mapToInt(Integer::intValue).toArray();
+            }
+            // If this character is not a delimiter, but the previous one was, then this is the start of a new column.
+            byte ch = data[currentIndex];
+            boolean thisCharIsDelimiter = ch == delimiterAsByte;
+            if (currentIndex == row.begin() && thisCharIsDelimiter) {
+                throw new IllegalArgumentException(
+                        String.format("Header row cannot start with the delimiter character '%c'", (char)delimiterAsByte));
+            }
+            if (!thisCharIsDelimiter && prevCharIsDelimiter) {
+                columnWidths.add(numChars);
+                numChars = 0;
+            }
+            prevCharIsDelimiter = thisCharIsDelimiter;
+            final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex,
+                    useUtf32CountingConvention, charCountResult);
+            currentIndex += utf8Length;
+            numChars += charCountResult.intValue();
+        }
+    }
+
+    // columnWidths are in counting units; each header is cut from the row and stripped of paddingByte.
+    private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
+            boolean utf32CountingMode) {
+        final int numCols = columnWidths.length;
+        if (numCols == 0) {
+            return new String[0];
+        }
+        final int[] byteWidths = new int[numCols];
+        final ByteSlice tempSlice = new ByteSlice();
+        final int excessBytes = charWidthsToByteWidths(row, columnWidths, utf32CountingMode, byteWidths);
+        // Our policy is that the last column gets any excess bytes that are in the row.
+        byteWidths[numCols - 1] += excessBytes;
+        final String[] result = new String[numCols];
+
+        int beginByte = row.begin();
+        for (int colNum = 0; colNum != numCols; ++colNum) {
+            final int proposedEndByte = beginByte + byteWidths[colNum];
+            final int actualEndByte = Math.min(proposedEndByte, row.end());
+            tempSlice.reset(row.data(), beginByte, actualEndByte);
+            tempSlice.trimPadding(paddingByte);
+            result[colNum] = tempSlice.toString();
+            beginByte = actualEndByte;
+        }
+        return result;
+    }
+
+    /**
+     * Converts column widths measured in counting units (Unicode code points or UTF-16 units, as selected by
+     * {@code utf32CountingMode}) into the corresponding widths in UTF-8 bytes for this particular row.
+     * A column ends when it is full, when the next character would not fit in what is left of it, or when
+     * the row runs out; in the last case that column and all later ones come out narrower than requested.
+     *
+     * @param row The row text.
+     * @param charWidths The requested column widths, in counting units.
+     * @param utf32CountingMode True to count Unicode code points; false to count UTF-16 units.
+     * @param byteWidths Receives the width of each column in bytes. Must have the same length as charWidths.
+     * @return The number of bytes at the end of the row that were not claimed by any column.
+     */
+    private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode,
+            int[] byteWidths) {
+        final int numCols = charWidths.length;
+        if (byteWidths.length != numCols) {
+            throw new IllegalArgumentException(String.format("Expected charWidths.length (%d) == byteWidths.length (%d)",
+                    charWidths.length, byteWidths.length));
+        }
+        final MutableInt charCountResult = new MutableInt();
+        final byte[] data = row.data();
+        int current = row.begin();
+        for (int colIndex = 0; colIndex != numCols; ++colIndex) {
+            final int start = current;
+            int charCount = 0;
+            // The header row can be shorter than caller-specified widths, so never read past the end of the row.
+            while (charCount < charWidths[colIndex] && current < row.end()) {
+                final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], row.end() - current,
+                        utf32CountingMode, charCountResult);
+                if (charCountResult.intValue() > charWidths[colIndex] - charCount) {
+                    // This can happen under the UTF-16 convention when one unit is left and the character
+                    // needs two. Leave it for the next column, as FixedCellGrabber does for data rows.
+                    break;
+                }
+                current += utf8Length;
+                charCount += charCountResult.intValue();
+            }
+            byteWidths[colIndex] = current - start;
+        }
+        // Excess bytes not claimed by any column
+        return row.end() - current;
+    }
+}
diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java
index 4f5e5867..7d852b1b 100644
--- a/src/test/java/io/deephaven/csv/CsvReaderTest.java
+++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java
@@ -26,6 +26,8 @@
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 
 import java.io.*;
 import java.lang.reflect.Array;
@@ -1853,12 +1855,6 @@ public void lotsOfDataDoesntChoke() throws CsvReaderException {
     public void colnumPassedThrough() throws CsvReaderException {
         final String input = "" + "Col1,Col2,Col3\n" + "1,2,3\n" + "4,5,6\n" + "7,8,9\n";
 
-        final ColumnSet expected =
-                ColumnSet.of(
-                        Column.ofValues("Col1", 1, 4, 7),
-                        Column.ofValues("Col2", 2, 5, 8),
-                        Column.ofValues("Col3", 3, 6, 9));
-
         final InputStream inputStream = toInputStream(input);
         final CsvSpecs specs = defaultCsvSpecs();
         final SinkFactory sinkFactory = makeBlackholeSinkFactory();
@@ -1873,6 +1869,255 @@ public void colnumPassedThrough() throws CsvReaderException {
         Assertions.assertThat(bh2Num).isEqualTo(2);
     }
 
+    /**
+     * Addresses issue 212. A user requested that the library
+     * be able to read files like this.
+     */
+    @Test
+    public void bug212() throws CsvReaderException {
+        final String input =
+                ""
+                        + "NAME            STATUS      AGE     LABELS\n"
+                        + "argo-events     Not Active  2y77d   app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events\n"
+                        + "argo-workflows  Active      2y77d   app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows\n"
+                        + "argocd          Active      5y18d   kubernetes.io/metadata.name=argocd\n"
+                        + "beta            Not Active  4y235d  kubernetes.io/metadata.name=beta\n";
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).build();
+
+        final ColumnSet expected = ColumnSet.of(
+                Column.ofRefs("NAME", "argo-events", "argo-workflows", "argocd", "beta"),
+                Column.ofRefs("STATUS", "Not Active", "Active", "Active", "Not Active"),
+                Column.ofRefs("AGE", "2y77d", "2y77d", "5y18d", "4y235d"),
+                Column.ofRefs("LABELS", "app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events",
+                        "app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows",
+                        "kubernetes.io/metadata.name=argocd",
+                        "kubernetes.io/metadata.name=beta"));
+
+        invokeTest(specs, input, expected);
+    }
+
+    @Test
+    public void simpleFixedColumnWidths() throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOG  Dividend 0.25    200\n"
+                        + "T     Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOG", "T", "Z"),
+                        Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"),
+                        Column.ofValues("Price", 0.25, 0.15, 0.18),
+                        Column.ofValues("SecurityId", 200, 300, 500));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * We allow data fields to fill the whole cell, without a padding character
+     * @throws CsvReaderException
+     */
+    @Test
+    public void fixedColumnWidthsFullCell() throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOGLEDividend!0.25    200\n"
+                        + "T     Dividend 0.15    300\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOGLE", "T"),
+                        Column.ofRefs("Type", "Dividend!", "Dividend"),
+                        Column.ofValues("Price", 0.25, 0.15),
+                        Column.ofValues("SecurityId", 200, 300));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * As usual, we allow rows to be short
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOG\n"
+                        + "T     Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n"
+                        + "QQQ   Coupon\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOG", "T", "Z", "QQQ"),
+                        Column.ofRefs("Type", null, "Dividend", "Dividend", "Coupon"),
+                        Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE),
+                        Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build();
+
+        if (allowMissingColumns) {
+            invokeTest(specs, input, expected);
+        } else {
+            Assertions.assertThatThrownBy(() -> invokeTest(specs, input, expected))
+                    .hasRootCauseMessage("Row 2 has too few columns (expected 4)");
+        }
+    }
+
+    /**
+     * All six Unicode characters ♡♥❥❦◑╳ are in the Basic Multilingual Plane and can all be represented
+     * with a single Java char. Therefore, they are counted the same with both counting conventions.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "♡♥❥❦◑╳Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "♡♥❥❦◑╳", "Z"),
+                        Column.ofRefs("Type", "Dividend", "Dividend"),
+                        Column.ofValues("Price", 0.15, 0.18),
+                        Column.ofValues("SecurityId", 300, 500));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented
+     * with two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller
+     * uses UTF-32 counting convention. They will not fit in the column if the caller uses the UTF-16 counting
+     * convention (because it takes 12 Java chars to express them).
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type\n"
+                        + "🥰😻🧡💓💕💖Dividend\n"
+                        + "Z     Dividend\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("Sym", "🥰😻🧡💓💕💖", "Z"),
+                    Column.ofRefs("Type", "Dividend", "Dividend"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("Sym", "🥰😻🧡", "Z"),
+                    Column.ofRefs("Type", "💓💕💖Dividend", "Dividend"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * Using Unicode characters as column headers. We give one column a header with characters from the BMP
+     * and one with characters outside the BMP and show how the behavior differs depending on the
+     * useUtf32CountingConvention flag.
+     * <p>
+     * The first header (🥰😻🧡) is three characters _outside_ the Basic Multilingual Plane, each represented
+     * with two Java chars; the second header (╔═╀═╗) is five characters inside it. Counting its trailing space,
+     * the first column is therefore four units wide under the UTF-32 counting convention and seven units wide
+     * under the UTF-16 counting convention, which changes where the data row is split.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvReaderException {
+        // In the UTF-32 counting convention, this is a column of width 4 (three Unicode characters plus the space)
+        // followed by a column of width 5. The first cell of the data would therefore be "abc", and the next cell
+        // would be "defgh".
+
+        // In the UTF-16 counting convention, this is a column of width 7 (six UTF-16 units plus the space)
+        // followed by a column of width 5. The first cell of the data would therefore be "abc def" and the next
+        // cell would be "gh".
+        final String input =
+                ""
+                        + "🥰😻🧡 ╔═╀═╗\n"
+                        + "abc defgh\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("🥰😻🧡", "abc"),
+                    Column.ofRefs("╔═╀═╗", "defgh"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("🥰😻🧡", "abc def"),
+                    Column.ofRefs("╔═╀═╗", "gh"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * If the library is configured for the UTF-16 counting convention, and there is only one unit of space left
+     * in the field, and the next character is a character outside the Basic Multilingual Plane that requires two units,
+     * the library will include that character in the next field rather than this one.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvReaderException {
+        // This test has a column of width 3 (two characters plus the space)
+        // followed by a column of width 2.
+        //
+        // In the UTF-32 counting convention, the first column will get "🥰😻 " and the second column will
+        // get "🧡💓". We turn off ignoreSurroundingSpaces to highlight how this is counted.
+        //
+        // In the UTF-16 counting convention, the first column will get 🥰 (because 🥰😻 uses characters
+        // outside the Basic Multilingual Plane and takes four units to represent, but the first field
+        // only has space for three). The next column will get "😻 🧡💓" (the rest of the row).
+        final String input =
+                ""
+                        + "C1 C2\n"
+                        + "🥰😻 🧡💓\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("C1", "🥰😻 "),
+                    Column.ofRefs("C2", "🧡💓"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("C1", "🥰"),
+                    Column.ofRefs("C2", "😻 🧡💓"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+
+
     private static final class RepeatingInputStream extends InputStream {
         private byte[] data;
         private final byte[] body;