blah

deephaven · Nov 5, 2024 · ed02a8e · ed02a8e
1 parent e2da685
commit ed02a8e
Showing 1 changed file with 30 additions and 7 deletions.
diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
@@ -25,8 +25,8 @@ public static String[] determineHeadersToUse(
             MutableObject<int[]> columnWidthsResult)
             throws CsvReaderException {
         String[] headersToUse;
-        // Get user-specified column widths, if any. If not, this will be an array of length 0.
-        // UNITS: UTF8 CHARACTERS
+        // Get user-specified column widths, if any. If none were specified, this will be an array of length 0.
+        // The column widths are in units of the specified convention (either UTF-16 or UTF-32 units).
         int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray();
         if (specs.hasHeaderRow()) {
             long skipCount = specs.skipHeaderRows();
@@ -46,11 +46,9 @@ public static String[] determineHeadersToUse(
             }
             final byte paddingByte = (byte) specs.delimiter();
             if (columnWidthsToUse.length == 0) {
-                // UNITS: UTF8 CHARACTERS
                 columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
             }
 
-            // DESIRED UNITS: UTF8 CHARACTERS
             headersToUse =
                     extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
         } else {
@@ -76,12 +74,18 @@ public static String[] determineHeadersToUse(
             headersToUse[entry.getKey()] = entry.getValue();
         }
 
-        // DESIRED UNITS: UTF8 CHARACTERS
         columnWidthsResult.setValue(columnWidthsToUse);
         return headersToUse;
     }
 
-    // RETURNS UNITS: UTF8 CHARACTERS
+    /**
+     * Infer the column widths by looking for the transition from delimiter char to non-delimiter char.
+     * @param row The input row
+     * @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported.
+     * @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or
+     *                                   UTF-16)
+     * @return The widths of the columns, in the specified character set convention.
+     */
     private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) {
         // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line).
         // If the start of the line is a delimiter, that is an error.
@@ -116,7 +120,14 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
         }
     }
 
-    // UNITS: UTF8 CHARACTERS
+    /**
+     * Extract the header names from 'row'.
+     * @param row The header row
+     * @param columnWidths The widths of the columns, in the UTF-32 or UTF-16 counting convention.
+     * @param paddingByte The delimiter character
+     * @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
+     * @return The array of headers
+     */
     private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
             boolean utf32CountingMode) {
         final int numCols = columnWidths.length;
@@ -142,6 +153,18 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p
         return result;
     }
 
+    /**
+     * Convert character widths to UTF-8 widths. The character widths are expressed in the specified convention
+     * (either UTF-16 or UTF-32), are fixed for the whole input, and are determined by reading the headers (or
+     * specified by the user). This method converts them into UTF-8 byte widths, which are specific to this row.
+     * For example, if a charWidth is 2 and utf32CountingMode is true, then we need to scan the row for the
+     * next two Unicode characters and count how many UTF-8 bytes they took up.
+     * @param row The row we are processing
+     * @param charWidths The column widths, in UTF-32 or UTF-16 units.
+     * @param utf32CountingMode Whether we are counting in UTF-32 or UTF-16 mode
+     * @param byteWidths The number of UTF-8 bytes corresponding to the charWidths for this row.
+     * @return The number of excess UTF-8 bytes in this row that go beyond all the charWidths.
+     */
     private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode,
             int[] byteWidths) {
         int numCols = charWidths.length;