Skip to content

Commit

Permalink
blah
Browse files Browse the repository at this point in the history
  • Loading branch information
kosak committed Nov 5, 2024
1 parent e2da685 commit ed02a8e
Showing 1 changed file with 30 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ public static String[] determineHeadersToUse(
MutableObject<int[]> columnWidthsResult)
throws CsvReaderException {
String[] headersToUse;
// Get user-specified column widths, if any. If not, this will be an array of length 0.
// UNITS: UTF8 CHARACTERS
// Get user-specified column widths, if any. If none were specified, this will be an array of length 0.
// The column widths are in units of the specified convention (either UTF-16 or UTF-32 units).
int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray();
if (specs.hasHeaderRow()) {
long skipCount = specs.skipHeaderRows();
Expand All @@ -46,11 +46,9 @@ public static String[] determineHeadersToUse(
}
final byte paddingByte = (byte) specs.delimiter();
if (columnWidthsToUse.length == 0) {
// UNITS: UTF8 CHARACTERS
columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
}

// DESIRED UNITS: UTF8 CHARACTERS
headersToUse =
extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
} else {
Expand All @@ -76,12 +74,18 @@ public static String[] determineHeadersToUse(
headersToUse[entry.getKey()] = entry.getValue();
}

// DESIRED UNITS: UTF8 CHARACTERS
columnWidthsResult.setValue(columnWidthsToUse);
return headersToUse;
}

// RETURNS UNITS: UTF8 CHARACTERS
/**
* Infer the column widths by looking for the transition from delimiter char to non-delimiter char.
* @param row The input row
* @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported.
* @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or
* UTF-16)
* @return The widths of the columns, in the specified character set convention.
*/
private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) {
// A column start is a non-delimiter character preceded by a delimiter (or present at the start of line).
// If the start of the line is a delimiter, that is an error.
Expand Down Expand Up @@ -116,7 +120,14 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
}
}

// UNITS: UTF8 CHARACTERS
/**
* Extract the header names from 'row'.
* @param row The header row
* @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention.
* @param paddingByte The delimiter character
* @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
* @return The array of headers
*/
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
boolean utf32CountingMode) {
final int numCols = columnWidths.length;
Expand All @@ -142,6 +153,18 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p
return result;
}

/**
* Convert character widths to UTF-8 byte widths. The character widths are expressed in the specified counting
* convention (either UTF-16 or UTF-32); they are fixed for the whole input and are either determined by reading
* the headers or specified by the user. The resulting UTF-8 byte widths, by contrast, are specific to this row.
* For example, if a charWidth is 2 and utf32CountingMode is true, then we need to scan the row for the
* next two Unicode characters and count how many UTF-8 bytes they take up.
* @param row The row we are processing
* @param charWidths The column widths, in UTF-32 or UTF-16 units.
* @param utf32CountingMode Whether we are counting in UTF-32 or UTF-16 mode
* @param byteWidths The number of UTF-8 bytes corresponding to each of the charWidths for this row.
* @return The number of excess UTF-8 bytes in this row that go beyond all the charWidths.
*/
private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode,
int[] byteWidths) {
int numCols = charWidths.length;
Expand Down

0 comments on commit ed02a8e

Please sign in to comment.