From ed02a8e11d310bb818bccac3d078effed7904620 Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Mon, 4 Nov 2024 21:53:28 -0500 Subject: [PATCH] blah --- .../reading/headers/FixedHeaderFinder.java | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java index cfea90a..e14e1d5 100644 --- a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java +++ b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java @@ -25,8 +25,8 @@ public static String[] determineHeadersToUse( MutableObject columnWidthsResult) throws CsvReaderException { String[] headersToUse; - // Get user-specified column widths, if any. If not, this will be an array of length 0. - // UNITS: UTF8 CHARACTERS + // Get user-specified column widths, if any. If none were specified, this will be an array of length 0. + // The column widths are in units of the specified convention (either UTF-16 or UTF-32 units). 
int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray(); if (specs.hasHeaderRow()) { long skipCount = specs.skipHeaderRows(); @@ -46,11 +46,9 @@ public static String[] determineHeadersToUse( } final byte paddingByte = (byte) specs.delimiter(); if (columnWidthsToUse.length == 0) { - // UNITS: UTF8 CHARACTERS columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention()); } - // DESIRED UNITS: UTF8 CHARACTERS headersToUse = extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention()); } else { @@ -76,12 +74,18 @@ public static String[] determineHeadersToUse( headersToUse[entry.getKey()] = entry.getValue(); } - // DESIRED UNITS: UTF8 CHARACTERS columnWidthsResult.setValue(columnWidthsToUse); return headersToUse; } - // RETURNS UNITS: UTF8 CHARACTERS + /** + * Infer the column widths by looking for the transition from delimiter char to non-delimiter char. + * @param row The input row + * @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported. + * @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or + * UTF-16) + * @return The widths of the columns, in the specified character set convention. + */ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) { // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line). // If the start of the line is a delimiter, that is an error. @@ -116,7 +120,14 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool } } - // UNITS: UTF8 CHARACTERS + /** + * Extract the header names from 'row'. + * @param row The header row + * @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention. 
+ * @param paddingByte The delimiter character + * @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode + * @return The array of headers + */ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte, boolean utf32CountingMode) { final int numCols = columnWidths.length; @@ -142,6 +153,18 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p return result; } + /** + * Convert character widths to UTF-8 widths. This converts the character widths, which are in the specified + * convention (either UTF-16 or UTF-32), which are fixed for the whole input, and which are determined by + * reading the headers (or specified by the user), into UTF-8 widths, which are specific to this row. + * For example if a charWidth is 2 and the utf32CountingMode is true, then we need to scan the row for the + * next two Unicode characters and count how many UTF-8 bytes that took up. + * @param row The row we are processing + * @param charWidths The column widths, in units of UTF-32 or UTF-16 units. + * @param utf32CountingMode Whether we are counting in UTF-32 or UTF-16 mode + * @param byteWidths The number of UTF-8 bytes corresponding to the charWidths for this row. + * @return The number of excess UTF-8 bytes in this row that go beyond all the charWidths. + */ private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode, int[] byteWidths) { int numCols = charWidths.length;