From 8bd5ae16fe584637138dabfbc9f3461150b23e6f Mon Sep 17 00:00:00 2001
From: Corey Kosak <coreykosak@deephaven.io>
Date: Sun, 3 Nov 2024 20:03:56 -0500
Subject: [PATCH] Add fixed-width column support

---
 src/main/java/io/deephaven/csv/CsvSpecs.java  |  53 ++++
 .../io/deephaven/csv/reading/CsvReader.java   |  16 +-
 .../csv/reading/cells/FixedCellGrabber.java   | 114 ++++++++
 .../reading/headers/FixedHeaderFinder.java    | 175 ++++++++++++
 .../java/io/deephaven/csv/CsvReaderTest.java  | 257 +++++++++++++++++-
 5 files changed, 608 insertions(+), 7 deletions(-)
 create mode 100644 src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
 create mode 100644 src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java
index ffe76365..53d34224 100644
--- a/src/main/java/io/deephaven/csv/CsvSpecs.java
+++ b/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -117,6 +117,35 @@ public interface Builder {
          */
         Builder headerValidator(Predicate<String> headerValidator);
 
+        /**
+         * True if the input is organized into fixed width columns rather than delimited by a delimiter.
+         */
+        Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);
+
+        /**
+         * When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
+         * row (provided {@link #hasHeaderRow} is set), or uses the widths specified explicitly by the caller via this
+         * method. The widths are measured in the units selected by {@link #useUtf32CountingConvention}.
+         * @param fixedColumnWidths The caller-specified widths of the columns.
+         */
+        Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);
+
+        /**
+         * This setting controls what units fixed width columns are measured in.
+         * When true, fixed width columns are measured in Unicode code points.
+         * When false, fixed width columns are measured in UTF-16 units (aka Java chars).
+         * The difference arises when encountering characters outside the Unicode Basic Multilingual Plane.
+         * For example, 💔 (U+1F494) is one Unicode code point, but takes
+         * two Java chars to represent. Along these lines, the string 💔💔💔 would fit in a column of width 3
+         * when useUtf32CountingConvention is true, but would require a column width of at least 6 when
+         * useUtf32CountingConvention is false.
+         *
+         * The default setting of true is arguably more natural for users (the number of characters they see
+         * matches the visual width of the column). But some programs may want the value of false because they
+         * are counting Java chars.
+         */
+        Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);
+
         /**
          * Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
          * Typically used together with {@link Builder#numRows}. Defaults to 0.
@@ -340,6 +369,30 @@ public Predicate<String> headerValidator() {
         return c -> true;
     }
 
+    /**
+     * See {@link Builder#hasFixedWidthColumns}.
+     */
+    @Default
+    public boolean hasFixedWidthColumns() {
+        return false;
+    }
+
+    /**
+     * See {@link Builder#fixedColumnWidths}.
+     */
+    @Default
+    public List<Integer> fixedColumnWidths() {
+        return Collections.emptyList();
+    }
+
+    /**
+     * See {@link Builder#useUtf32CountingConvention}.
+     */
+    @Default
+    public boolean useUtf32CountingConvention() {
+        return true;
+    }
+
     /**
      * See {@link Builder#skipRows}.
      */
diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
index 9a9944aa..822ce4dd 100644
--- a/src/main/java/io/deephaven/csv/reading/CsvReader.java
+++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -7,7 +7,9 @@
 import io.deephaven.csv.parsers.Parser;
 import io.deephaven.csv.reading.cells.CellGrabber;
 import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
+import io.deephaven.csv.reading.cells.FixedCellGrabber;
 import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
+import io.deephaven.csv.reading.headers.FixedHeaderFinder;
 import io.deephaven.csv.sinks.Sink;
 import io.deephaven.csv.sinks.SinkFactory;
 import io.deephaven.csv.util.*;
@@ -63,7 +65,9 @@ private CsvReader() {}
      */
     public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
             throws CsvReaderException {
-        return delimitedReadLogic(specs, stream, sinkFactory);
+        return specs.hasFixedWidthColumns() ?
+                fixedReadLogic(specs, stream, sinkFactory) :
+                delimitedReadLogic(specs, stream, sinkFactory);
     }
 
     private static Result delimitedReadLogic(
@@ -97,6 +101,16 @@ private static Result delimitedReadLogic(
         return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
     }
 
+    /** Fixed-width counterpart of delimitedReadLogic: resolves headers and column widths, then reads the cells. */
+    private static Result fixedReadLogic(final CsvSpecs specs, final InputStream stream,
+            final SinkFactory sinkFactory) throws CsvReaderException {
+        final CellGrabber wholeLines = FixedCellGrabber.makeLineGrabber(stream);
+        final MutableObject<int[]> widthsHolder = new MutableObject<>();
+        final String[] columnNames = FixedHeaderFinder.determineHeadersToUse(specs, wholeLines, widthsHolder);
+        final CellGrabber cells = new FixedCellGrabber(wholeLines, widthsHolder.getValue(),
+                specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
+        return commonReadLogic(specs, cells, null, columnNames.length, columnNames.length, columnNames, sinkFactory);
+    }
 
     private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
             int numInputCols, int numOutputCols,
diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
new file mode 100644
index 00000000..32225925
--- /dev/null
+++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -0,0 +1,114 @@
+package io.deephaven.csv.reading.cells;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+
+import java.io.InputStream;
+
+/**
+ * This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream,
+ * and then it breaks them into fixed-sized cells to return to the caller.
+ */
+public class FixedCellGrabber implements CellGrabber {
+    /**
+     * Creates a degenerate CellGrabber with no usable delimiter or quote character, so that each "cell" it returns
+     * is a whole line. This is a quick-and-dirty way to reuse the buffering and newline logic of
+     * DelimitedCellGrabber rather than rewriting it.
+     * @param stream The underlying stream.
+     * @return The "line grabber"
+     */
+    public static CellGrabber makeLineGrabber(InputStream stream) {
+        final byte neverInValidUtf8 = (byte) 0xff;
+        return new DelimitedCellGrabber(stream, neverInValidUtf8, neverInValidUtf8, true, false);
+    }
+
+    private final CellGrabber lineGrabber;
+    private final int[] columnWidths;
+    private final boolean ignoreSurroundingSpaces;
+    private final boolean utf32CountingMode;
+    private final ByteSlice rowText;
+    private boolean needsUnderlyingRefresh;
+    private int colIndex;
+    private final MutableBoolean dummy1;
+    private final MutableInt dummy2;
+
+    /** Constructor. Each line supplied by {@code lineGrabber} is split into cells according to {@code columnWidths}. */
+    public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
+                            boolean utf32CountingMode) {
+        this.lineGrabber = lineGrabber;
+        this.columnWidths = columnWidths;
+        this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
+        this.utf32CountingMode = utf32CountingMode;
+        this.rowText = new ByteSlice();
+        this.needsUnderlyingRefresh = true;
+        this.colIndex = 0;
+        this.dummy1 = new MutableBoolean();
+        this.dummy2 = new MutableInt();
+    }
+
+    @Override
+    public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput) throws CsvReaderException {
+        if (needsUnderlyingRefresh) {
+            // Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line.
+            lineGrabber.grabNext(rowText, dummy1, endOfInput);
+            // (dummy1 receives the line grabber's own lastInRow flag, which is never read here.)
+            if (endOfInput.booleanValue()) {
+                // Set dest to the empty string, and leave 'endOfInput' set to true.
+                dest.reset(rowText.data(), rowText.end(), rowText.end());
+                return;
+            }
+
+            needsUnderlyingRefresh = false;
+            colIndex = 0;
+        }
+        // There is data to return. Count off N characters. The final column gets all remaining characters.
+        // The row ends after the final column, or early if this cell comes back empty (e.g. the line ran out).
+        final boolean lastCol  = colIndex == columnWidths.length - 1;
+        final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex];
+        takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2);
+        ++colIndex;
+        needsUnderlyingRefresh = lastCol || dest.size() == 0;
+        lastInRow.setValue(needsUnderlyingRefresh);
+        endOfInput.setValue(false);
+        // Trimming happens after the emptiness test above, so an all-space cell does not end the row early.
+        if (ignoreSurroundingSpaces) {
+            ReaderUtil.trimWhitespace(dest);
+        }
+    }
+
+    /**
+     * Moves up to {@code numCharsToTake} units (code points or UTF-16 units, per {@code utf32CountingMode}) from
+     * the front of {@code src} into {@code dest}. A character that does not fit in the remaining units stays in src.
+     * @throws CsvReaderException if the slice ends in the middle of a UTF-8 sequence.
+     */
+    private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
+                                                 boolean utf32CountingMode, MutableInt tempInt) throws CsvReaderException {
+        final byte[] data = src.data();
+        final int cellBegin = src.begin();
+        int current = cellBegin;
+        while (numCharsToTake > 0 && current != src.end()) {
+            final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current,
+                    utf32CountingMode, tempInt);
+            if (numCharsToTake < tempInt.intValue()) {
+                // Not enough units left in the field for this character: under the UTF-16 counting convention a
+                // character outside the Basic Multilingual Plane needs two units. Leave it for the next field.
+                break;
+            }
+            numCharsToTake -= tempInt.intValue();
+            current += utf8Length;
+            if (current > src.end()) {
+                throw new CsvReaderException("Data error: partial UTF-8 sequence found in input");
+            }
+        }
+        dest.reset(src.data(), cellBegin, current);
+        src.reset(src.data(), current, src.end());
+    }
+
+    @Override
+    public int physicalRowNum() {
+        return lineGrabber.physicalRowNum(); // Row numbering is delegated to the underlying line grabber.
+    }
+}
diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
new file mode 100644
index 00000000..e9ad7b5e
--- /dev/null
+++ b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
@@ -0,0 +1,175 @@
+package io.deephaven.csv.reading.headers;
+
+import io.deephaven.csv.CsvSpecs;
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.reading.cells.CellGrabber;
+import io.deephaven.csv.tokenization.Tokenizer;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+import io.deephaven.csv.util.MutableObject;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class FixedHeaderFinder {
+    /**
+     * Determine which headers and column widths to use. Headers come from the header row of the input or from
+     * the user-specified overrides; widths come from {@code specs.fixedColumnWidths()} or, if that is empty, from
+     * the layout of the header row. Widths are in the units chosen by {@code specs.useUtf32CountingConvention()}.
+     * @param columnWidthsResult Receives the column widths that were chosen.
+     * @throws CsvReaderException if headers or widths are missing, invalid, or inconsistent with each other.
+     */
+    public static String[] determineHeadersToUse(final CsvSpecs specs, final CellGrabber lineGrabber,
+            MutableObject<int[]> columnWidthsResult) throws CsvReaderException {
+        String[] headersToUse;
+        // Get user-specified column widths, if any. If not, this will be an array of length 0.
+        int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray();
+        if (specs.fixedColumnWidths().stream().anyMatch(width -> width < 1)) {
+            throw new CsvReaderException("Fixed column widths must be positive, but got " + specs.fixedColumnWidths());
+        }
+        if (specs.hasHeaderRow()) {
+            long skipCount = specs.skipHeaderRows();
+            final ByteSlice headerRow = new ByteSlice();
+            MutableBoolean lastInRow = new MutableBoolean();
+            MutableBoolean endOfInput = new MutableBoolean();
+            while (true) {
+                lineGrabber.grabNext(headerRow, lastInRow, endOfInput);
+                if (endOfInput.booleanValue()) {
+                    throw new CsvReaderException(
+                            "Can't proceed because hasHeaderRow is set but input file is empty or shorter than skipHeaderRows");
+                }
+                if (skipCount == 0) {
+                    break;
+                }
+                --skipCount;
+            }
+            final byte paddingByte = (byte)specs.delimiter();
+            if (columnWidthsToUse.length == 0) {
+                columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
+            }
+            headersToUse = extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
+        } else {
+            if (columnWidthsToUse.length == 0) {
+                throw new CsvReaderException("Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified");
+            }
+            headersToUse = ReaderUtil.makeSyntheticHeaders(columnWidthsToUse.length);
+        }
+
+        // Whether or not the input had headers, maybe override with client-specified headers.
+        if (specs.headers().size() != 0) {
+            if (specs.headers().size() != headersToUse.length) {
+                final String message = String.format("Library determined %d headers; caller overrode with %d headers",
+                        headersToUse.length, specs.headers().size());
+                throw new CsvReaderException(message);
+            }
+            headersToUse = specs.headers().toArray(new String[0]);
+        }
+
+        // Apply column specific overrides, reporting an out-of-range index instead of overrunning the array.
+        for (Map.Entry<Integer, String> entry : specs.headerForIndex().entrySet()) {
+            if (entry.getKey() < 0 || entry.getKey() >= headersToUse.length) {
+                throw new CsvReaderException("headerForIndex refers to nonexistent column index " + entry.getKey());
+            }
+            headersToUse[entry.getKey()] = entry.getValue();
+        }
+        columnWidthsResult.setValue(columnWidthsToUse);
+        return headersToUse;
+    }
+
+    /**
+     * Infers column widths from the header row. A column starts at a non-delimiter character that is preceded by
+     * a delimiter (or that begins the line); the final column extends to the end of the row.
+     * @return The column widths, in code points or UTF-16 units according to {@code useUtf32CountingConvention}.
+     * @throws CsvReaderException if the header row starts with the delimiter.
+     */
+    private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention)
+            throws CsvReaderException {
+        final byte[] data = row.data();
+        if (row.begin() != row.end() && data[row.begin()] == delimiterAsByte) {
+            throw new CsvReaderException(
+                    String.format("Header row cannot start with the delimiter character '%c'", (char)delimiterAsByte));
+        }
+        final List<Integer> columnWidths = new ArrayList<>();
+        final MutableInt charCountResult = new MutableInt();
+        boolean prevCharIsDelimiter = false;
+        int numChars = 0;
+        int currentIndex = row.begin();
+        while (currentIndex != row.end()) {
+            final byte ch = data[currentIndex];
+            final boolean thisCharIsDelimiter = ch == delimiterAsByte;
+            if (!thisCharIsDelimiter && prevCharIsDelimiter) {
+                columnWidths.add(numChars);
+                numChars = 0;
+            }
+            prevCharIsDelimiter = thisCharIsDelimiter;
+            currentIndex += ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex,
+                    useUtf32CountingConvention, charCountResult);
+            numChars += charCountResult.intValue();
+        }
+        columnWidths.add(numChars);
+        return columnWidths.stream().mapToInt(Integer::intValue).toArray();
+    }
+
+    /** Cuts the header row into one name per column (widths are in counting units) and trims the padding. */
+    private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
+                                           boolean utf32CountingMode) {
+        final String[] headers = new String[columnWidths.length];
+        if (headers.length == 0) {
+            return headers;
+        }
+
+        // Translate the widths into byte counts; by policy the last column also absorbs any excess bytes in the row.
+        final int[] widthsInBytes = new int[headers.length];
+        final int leftoverBytes = charWidthsToByteWidths(row, columnWidths, utf32CountingMode, widthsInBytes);
+        widthsInBytes[headers.length - 1] += leftoverBytes;
+
+        final ByteSlice cell = new ByteSlice();
+        int cellBegin = row.begin();
+        for (int col = 0; col < headers.length; ++col) {
+            // Never slice beyond the end of the row.
+            final int cellEnd = Math.min(cellBegin + widthsInBytes[col], row.end());
+            cell.reset(row.data(), cellBegin, cellEnd);
+            cell.trimPadding(paddingByte);
+            headers[col] = cell.toString();
+            cellBegin = cellEnd;
+        }
+        return headers;
+    }
+
+    /**
+     * Converts column widths in counting units (code points or UTF-16 units, per {@code utf32CountingMode}) into
+     * byte widths for this row, stored in {@code byteWidths}. A column ends early if the row runs out or the next
+     * character does not fit in the units that remain, so the scan never reads past the end of the row.
+     * @return The number of bytes at the end of the row not claimed by any column.
+     */
+    private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode,
+                                              int[] byteWidths) {
+        int numCols = charWidths.length;
+        if (byteWidths.length != numCols) {
+            throw new IllegalArgumentException(String.format("Expected charWidths.length (%d) == byteWidths.length (%d)",
+                    charWidths.length, byteWidths.length));
+        }
+        final MutableInt charCountResult = new MutableInt();
+        final byte[] data = row.data();
+        int current = row.begin();
+        for (int colIndex = 0; colIndex != numCols; ++colIndex) {
+            final int start = current;
+            int unitsLeft = charWidths[colIndex];
+            while (unitsLeft > 0 && current < row.end()) {
+                final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], row.end() - current,
+                        utf32CountingMode, charCountResult);
+                if (charCountResult.intValue() > unitsLeft) {
+                    // E.g. one UTF-16 unit left but the character needs two: it belongs to the next column.
+                    break;
+                }
+                unitsLeft -= charCountResult.intValue();
+                current += utf8Length;
+            }
+            byteWidths[colIndex] = current - start;
+        }
+        return row.end() - current;
+    }
+}
diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java
index 4f5e5867..7d852b1b 100644
--- a/src/test/java/io/deephaven/csv/CsvReaderTest.java
+++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java
@@ -26,6 +26,8 @@
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 
 import java.io.*;
 import java.lang.reflect.Array;
@@ -1853,12 +1855,6 @@ public void lotsOfDataDoesntChoke() throws CsvReaderException {
     public void colnumPassedThrough() throws CsvReaderException {
         final String input = "" + "Col1,Col2,Col3\n" + "1,2,3\n" + "4,5,6\n" + "7,8,9\n";
 
-        final ColumnSet expected =
-                ColumnSet.of(
-                        Column.ofValues("Col1", 1, 4, 7),
-                        Column.ofValues("Col2", 2, 5, 8),
-                        Column.ofValues("Col3", 3, 6, 9));
-
         final InputStream inputStream = toInputStream(input);
         final CsvSpecs specs = defaultCsvSpecs();
         final SinkFactory sinkFactory = makeBlackholeSinkFactory();
@@ -1873,6 +1869,255 @@ public void colnumPassedThrough() throws CsvReaderException {
         Assertions.assertThat(bh2Num).isEqualTo(2);
     }
 
+    /**
+     * Addresses <a href="https://github.com/deephaven/deephaven-csv/issues/212">issue 212</a>: a user requested
+     * that the library be able to read files like this.
+     */
+    @Test
+    public void bug212() throws CsvReaderException {
+        final String input =
+                ""
+                        + "NAME                     STATUS       AGE      LABELS\n"
+                        + "argo-events              Not Active   2y77d    app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events\n"
+                        + "argo-workflows           Active       2y77d    app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows\n"
+                        + "argocd                   Active       5y18d    kubernetes.io/metadata.name=argocd\n"
+                        + "beta                     Not Active   4y235d   kubernetes.io/metadata.name=beta\n";
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).build();
+
+        final ColumnSet expected = ColumnSet.of(
+                Column.ofRefs("NAME", "argo-events", "argo-workflows", "argocd", "beta"),
+                Column.ofRefs("STATUS", "Not Active", "Active", "Active", "Not Active"),
+                Column.ofRefs("AGE", "2y77d", "2y77d", "5y18d", "4y235d"),
+                Column.ofRefs("LABELS", "app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events",
+                        "app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows",
+                        "kubernetes.io/metadata.name=argocd",
+                        "kubernetes.io/metadata.name=beta"));
+
+        invokeTest(specs, input, expected);
+    }
+
+    @Test
+    public void simpleFixedColumnWidths() throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOG  Dividend 0.25    200\n"
+                        + "T     Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOG", "T", "Z"),
+                        Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"),
+                        Column.ofValues("Price", 0.25, 0.15, 0.18),
+                        Column.ofValues("SecurityId", 200, 300, 500));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * We allow data fields to fill the whole cell, without a padding character.
+     * @throws CsvReaderException if the read fails
+     */
+    @Test
+    public void fixedColumnWidthsFullCell() throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOGLEDividend!0.25    200\n"
+                        + "T     Dividend 0.15    300\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOGLE", "T"),
+                        Column.ofRefs("Type", "Dividend!", "Dividend"),
+                        Column.ofValues("Price", 0.25, 0.15),
+                        Column.ofValues("SecurityId", 200, 300));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * As usual, we allow rows to be short
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {false, true})
+    public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "GOOG\n"
+                        + "T     Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n"
+                        + "QQQ   Coupon\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "GOOG", "T", "Z", "QQQ"),
+                        Column.ofRefs("Type", null, "Dividend", "Dividend", "Coupon"),
+                        Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE),
+                        Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT));
+
+        final CsvSpecs specs  = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build();
+
+        if (allowMissingColumns) {
+            invokeTest(specs, input, expected);
+        } else {
+            Assertions.assertThatThrownBy(() -> invokeTest(specs, input, expected))
+                    .hasRootCauseMessage("Row 2 has too few columns (expected 4)");
+        }
+    }
+
+    /**
+     * All six Unicode characters ♡♥❥❦◑╳ are in the Basic Multilingual Plane and can all be represented
+     * with a single Java char. Therefore, they are counted the same with both counting conventions.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans =  {false, true})
+    public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type     Price   SecurityId\n"
+                        + "♡♥❥❦◑╳Dividend 0.15    300\n"
+                        + "Z     Dividend 0.18    500\n";
+
+        final ColumnSet expected =
+                ColumnSet.of(
+                        Column.ofRefs("Sym", "♡♥❥❦◑╳", "Z"),
+                        Column.ofRefs("Type", "Dividend", "Dividend"),
+                        Column.ofValues("Price", 0.15, 0.18),
+                        Column.ofValues("SecurityId", 300, 500));
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented
+     * with two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller
+     * uses UTF-32 counting convention. They will not fit in the column if the caller uses the UTF-16 counting
+     * convention (because it takes 12 Java chars to express them).
+     */
+    @ParameterizedTest
+    @ValueSource(booleans =  {false, true})
+    public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention) throws CsvReaderException {
+        final String input =
+                ""
+                        + "Sym   Type\n"
+                        + "🥰😻🧡💓💕💖Dividend\n"
+                        + "Z     Dividend\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("Sym", "🥰😻🧡💓💕💖", "Z"),
+                    Column.ofRefs("Type", "Dividend", "Dividend"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("Sym", "🥰😻🧡", "Z"),
+                    Column.ofRefs("Type", "💓💕💖Dividend", "Dividend"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * Using Unicode characters as column headers. We give one column a header with characters from the BMP
+     * and one with characters outside the BMP and show how the behavior differs depending on the
+     * useUtf32CountingConvention flag.
+     * The first header, 🥰😻🧡, consists of three characters from _outside_ the Basic Multilingual Plane, each of
+     * which is represented with two Java chars: it measures three units under the UTF-32 counting convention but
+     * six under the UTF-16 counting convention, so the inferred width of the first column differs between the
+     * two conventions. The second header, ╔═╤═╗, consists of five characters from inside the Basic Multilingual
+     * Plane, so it measures five units under either convention.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans =  {false, true})
+    public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvReaderException {
+        // In the UTF-32 counting convention, this is a column of width 4 (three Unicode characters plus the space)
+        // followed by a column of width 5. The first cell of the data would therefore be "abc", and the next cell
+        // would be "defgh" (the final column gets all the remaining characters).
+
+        // In the UTF-16 counting convention, this is a column of width 7 (six UTF-16 units plus the space)
+        // followed by a column of width 5. The first cell of the data would therefore be "abc def" and the next
+        // cell would be "gh".
+        final String input =
+                ""
+                        + "🥰😻🧡 ╔═╤═╗\n"
+                        + "abc defgh\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("🥰😻🧡", "abc"),
+                    Column.ofRefs("╔═╤═╗", "defgh"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("🥰😻🧡", "abc def"),
+                    Column.ofRefs("╔═╤═╗", "gh"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+    /**
+     * If the library is configured for the UTF-16 counting convention, and there is only one unit of space left
+     * in the field, and the next character is a character outside the Basic Multilingual Plane that requires two units,
+     * the library will include that character in the next field rather than this one.
+     */
+    @ParameterizedTest
+    @ValueSource(booleans =  {false, true})
+    public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvReaderException {
+        // This test has a column of width 3 (two characters plus the space)
+        // followed by a column of width 2.
+        //
+        // In the UTF-32 counting convention, the first column will get "🥰😻 " and the second column will
+        // get "🧡💓". We turn off ignoreSurroundingSpaces to highlight how this is counted.
+        //
+        // In the UTF-16 counting convention, the first column will get 🥰 (because 🥰😻 uses characters
+        // outside the Basic Multilingual Plane and takes four units to represent, but the first field
+        // only has space for three). The next column will get "😻 🧡💓" (the rest of the row).
+        final String input =
+                ""
+                        + "C1 C2\n"
+                        + "🥰😻 🧡💓\n";
+
+        final ColumnSet expected;
+
+        if (useUtf32CountingConvention) {
+            expected = ColumnSet.of(
+                    Column.ofRefs("C1", "🥰😻 "),
+                    Column.ofRefs("C2", "🧡💓"));
+        } else {
+            expected = ColumnSet.of(
+                    Column.ofRefs("C1", "🥰"),
+                    Column.ofRefs("C2", "😻 🧡💓"));
+        }
+
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+                .ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build();
+
+        invokeTest(specs, input, expected);
+    }
+
+
+
     private static final class RepeatingInputStream extends InputStream {
         private byte[] data;
         private final byte[] body;