Skip to content

Commit

Permalink
spotless
Browse files Browse the repository at this point in the history
  • Loading branch information
kosak committed Nov 4, 2024
1 parent 8bd5ae1 commit 66cd8a7
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 64 deletions.
21 changes: 10 additions & 11 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,23 +126,22 @@ public interface Builder {
* When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
* row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
* If the caller wants to specify them explicitly, they can use this method.
*
* @param fixedColumnWidths The caller-specified widths of the columns.
*/
Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);

/**
* This setting controls what units fixed width columns are measured in.
* When true, fixed width columns are measured in Unicode code points.
* When false, fixed width columns are measured in UTF-16 units (aka Java chars).
* The difference arises when encountering characters outside the Unicode Basic Multilingual Plane.
* For example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes
* two Java chars to represent. Along these lines, the string 💔💔💔 would fit in a column of width 3
* when utf32CountingMode is true, but would require a column width of at least 6 when utf32CountingMode
* is false.
* This setting controls what units fixed width columns are measured in. When true, fixed width columns are
* measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java
* chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For
* example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes two Java chars to
* represent. Along these lines, the string 💔💔💔 would fit in a column of width 3 when utf32CountingMode is
* true, but would require a column width of at least 6 when utf32CountingMode is false.
*
* The default setting of true is arguably more natural for users (the number of characters they see
* matches the visual width of the column). But some programs may want the value of false because they
* are counting Java chars.
* The default setting of true is arguably more natural for users (the number of characters they see matches the
* visual width of the column). But some programs may want the value of false because they are counting Java
* chars.
*/
Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);

Expand Down
5 changes: 2 additions & 3 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,8 @@ private CsvReader() {}
*/
public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
throws CsvReaderException {
return specs.hasFixedWidthColumns() ?
fixedReadLogic(specs, stream, sinkFactory) :
delimitedReadLogic(specs, stream, sinkFactory);
return specs.hasFixedWidthColumns() ? fixedReadLogic(specs, stream, sinkFactory)
: delimitedReadLogic(specs, stream, sinkFactory);
}

private static Result delimitedReadLogic(
Expand Down
10 changes: 5 additions & 5 deletions src/main/java/io/deephaven/csv/reading/ReaderUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ public static void trimWhitespace(final ByteSlice cs) {
}

/**
* Get the expected length of a UTF-8 sequence, given its first byte, and its
* corresponding length in the specified units (UTF-16 or UTF-32).
* Get the expected length of a UTF-8 sequence, given its first byte, and its corresponding length in the specified
* units (UTF-16 or UTF-32).
*
* @param firstByte The first byte of the UTF-8 sequence.
* @param numBytes The number of remaining bytes in the input field (including firstByte). If the UTF-8
* sequence specifies a number of bytes larger than the number of remaining bytes, an
* exception is thrown.
* @param numBytes The number of remaining bytes in the input field (including firstByte). If the UTF-8 sequence
* specifies a number of bytes larger than the number of remaining bytes, an exception is thrown.
* @param useUtf32CountingConvention Whether 'charCountResult' should be in units of UTF-32 or UTF-16.
* @param charCountResult The number of UTF-32 or UTF-16 units specified by the UTF-8 character.
* @return The length of the UTF-8 sequence.
Expand Down
22 changes: 12 additions & 10 deletions src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@
import java.io.InputStream;

/**
* This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream,
* and then it breaks them into fixed-sized cells to return to the caller.
* This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream, and then it
* breaks them into fixed-sized cells to return to the caller.
*/
public class FixedCellGrabber implements CellGrabber {
/**
* Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines.
* This is a somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber
* without rewriting it.
* Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines. This is a
* somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber without rewriting
* it.
*
* @param stream The underlying stream.
* @return The "line grabber"
*/
public static CellGrabber makeLineGrabber(InputStream stream) {
final byte IllegalUtf8 = (byte)0xff;
final byte IllegalUtf8 = (byte) 0xff;
return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false);
}

Expand All @@ -37,7 +38,7 @@ public static CellGrabber makeLineGrabber(InputStream stream) {

/** Constructor. */
public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
boolean utf32CountingMode) {
boolean utf32CountingMode) {
this.lineGrabber = lineGrabber;
this.columnWidths = columnWidths;
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
Expand All @@ -50,7 +51,8 @@ public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths,
}

@Override
public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput) throws CsvReaderException {
public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput)
throws CsvReaderException {
if (needsUnderlyingRefresh) {
// Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line.
lineGrabber.grabNext(rowText, dummy1, endOfInput);
Expand All @@ -66,7 +68,7 @@ public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean en
}

// There is data to return. Count off N characters. The final column gets all remaining characters.
final boolean lastCol = colIndex == columnWidths.length - 1;
final boolean lastCol = colIndex == columnWidths.length - 1;
final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex];
takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2);
++colIndex;
Expand All @@ -80,7 +82,7 @@ public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean en
}

private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
boolean utf32CountingMode, MutableInt tempInt) {
boolean utf32CountingMode, MutableInt tempInt) {
final byte[] data = src.data();
final int cellBegin = src.begin();
int current = cellBegin;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,19 @@ public static String[] determineHeadersToUse(
}
--skipCount;
}
final byte paddingByte = (byte)specs.delimiter();
final byte paddingByte = (byte) specs.delimiter();
if (columnWidthsToUse.length == 0) {
// UNITS: UTF8 CHARACTERS
columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
}

// DESIRED UNITS: UTF8 CHARACTERS
headersToUse = extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
headersToUse =
extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
} else {
if (columnWidthsToUse.length == 0) {
throw new CsvReaderException("Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified");
throw new CsvReaderException(
"Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified");
}
headersToUse = ReaderUtil.makeSyntheticHeaders(columnWidthsToUse.length);
}
Expand Down Expand Up @@ -99,7 +101,8 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
boolean thisCharIsDelimiter = ch == delimiterAsByte;
if (currentIndex == row.begin() && thisCharIsDelimiter) {
throw new IllegalArgumentException(
String.format("Header row cannot start with the delimiter character '%c'", (char)delimiterAsByte));
String.format("Header row cannot start with the delimiter character '%c'",
(char) delimiterAsByte));
}
if (!thisCharIsDelimiter && prevCharIsDelimiter) {
columnWidths.add(numChars);
Expand All @@ -115,7 +118,7 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool

// UNITS: UTF8 CHARACTERS
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
boolean utf32CountingMode) {
boolean utf32CountingMode) {
final int numCols = columnWidths.length;
if (numCols == 0) {
return new String[0];
Expand All @@ -140,11 +143,12 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p
}

private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode,
int[] byteWidths) {
int[] byteWidths) {
int numCols = charWidths.length;
if (byteWidths.length != numCols) {
throw new IllegalArgumentException(String.format("Expected charWidths.length (%d) == byteWidths.length (%d)",
charWidths.length, byteWidths.length));
throw new IllegalArgumentException(
String.format("Expected charWidths.length (%d) == byteWidths.length (%d)",
charWidths.length, byteWidths.length));
}
final MutableInt charCountResult = new MutableInt();
final byte[] data = row.data();
Expand Down
56 changes: 29 additions & 27 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1870,8 +1870,8 @@ public void colnumPassedThrough() throws CsvReaderException {
}

/**
* Addresses <a href="https://github.com/deephaven/deephaven-csv/issues/212"> A user requested that the library
* be able to read files like this.
* Addresses <a href="https://github.com/deephaven/deephaven-csv/issues/212"> A user requested that the library be
* able to read files like this.
*/
@Test
public void bug212() throws CsvReaderException {
Expand All @@ -1890,7 +1890,8 @@ public void bug212() throws CsvReaderException {
Column.ofRefs("NAME", "argo-events", "argo-workflows", "argocd", "beta"),
Column.ofRefs("STATUS", "Not Active", "Active", "Active", "Not Active"),
Column.ofRefs("AGE", "2y77d", "2y77d", "5y18d", "4y235d"),
Column.ofRefs("LABELS", "app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events",
Column.ofRefs("LABELS",
"app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events",
"app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows",
"kubernetes.io/metadata.name=argocd",
"kubernetes.io/metadata.name=beta"));
Expand All @@ -1914,13 +1915,15 @@ public void simpleFixedColumnWidths() throws CsvReaderException {
Column.ofValues("Price", 0.25, 0.15, 0.18),
Column.ofValues("SecurityId", 200, 300, 500));

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
final CsvSpecs specs =
defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();

invokeTest(specs, input, expected);
}

/**
* We allow data fields to fill the whole cell, without a padding character
*
* @throws CsvReaderException
*/
@Test
Expand All @@ -1938,7 +1941,8 @@ public void fixedColumnWidthsFullCell() throws CsvReaderException {
Column.ofValues("Price", 0.25, 0.15),
Column.ofValues("SecurityId", 200, 300));

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
final CsvSpecs specs =
defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
invokeTest(specs, input, expected);
}

Expand All @@ -1963,7 +1967,7 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE),
Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT));

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
.ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build();

if (allowMissingColumns) {
Expand All @@ -1975,11 +1979,11 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
}

/**
* All six Unicode characters ♡♥❥❦◑╳ are in the Basic Multilingual Plane and can all be represented
* with a single Java char. Therefore, they are counted the same with both counting conventions.
* All six Unicode characters ♡♥❥❦◑╳ are in the Basic Multilingual Plane and can all be represented with a single
* Java char. Therefore, they are counted the same with both counting conventions.
*/
@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true})
public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throws CsvReaderException {
final String input =
""
Expand All @@ -2001,13 +2005,13 @@ public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throw
}

/**
* All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented
* with two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller
* uses UTF-32 counting convention. They will not fit in the column if the caller uses the UTF-16 counting
* convention (because it takes 12 Java chars to express them).
* All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented with
* two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller uses UTF-32
* counting convention. They will not fit in the column if the caller uses the UTF-16 counting convention (because
* it takes 12 Java chars to express them).
*/
@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true})
public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention) throws CsvReaderException {
final String input =
""
Expand All @@ -2034,17 +2038,15 @@ public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention
}

/**
* Using Unicode characters as column headers. We give one column a header with characters from the BMP
* and one with characters outside the BMP and show how the behavior differs depending on the
* useUtf32CountingConvention flag.
* ╔═╗
* All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented
* with two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller
* uses UTF-32 counting convention. They will not fit in the column if the caller uses the UTF-16 counting
* convention (because it takes 12 Java chars to express them).
* Using Unicode characters as column headers. We give one column a header with characters from the BMP and one with
* characters outside the BMP and show how the behavior differs depending on the useUtf32CountingConvention flag.
* ╔═╗ All six Unicode characters 🥰😻🧡💓💕💖 are _outside_ the Basic Multilingual Plane and all are represented
* with two Java chars. The Sym column has a width of six. They will fit in the "Sym" column if the caller uses
* UTF-32 counting convention. They will not fit in the column if the caller uses the UTF-16 counting convention
* (because it takes 12 Java chars to express them).
*/
@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true})
public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvReaderException {
// In the UTF-32 counting convention, this is a column of width 4 (three Unicode characters plus the space)
// followed by a column of width 5. The first cell of the data would therefore be "abc", and the next cell
Expand Down Expand Up @@ -2077,12 +2079,12 @@ public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvR
}

/**
* If the library is configured for the UTF-16 counting convention, and there is only one unit of space left
* in the field, and the next character is a character outside the Basic Multilingual Plane that requires two units,
* the library will include that character in the next field rather than this one.
* If the library is configured for the UTF-16 counting convention, and there is only one unit of space left in the
* field, and the next character is a character outside the Basic Multilingual Plane that requires two units, the
* library will include that character in the next field rather than this one.
*/
@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true})
public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvReaderException {
// This test has a column of width 3 (three characters plus the space)
// followed by a column of width 2.
Expand Down

0 comments on commit 66cd8a7

Please sign in to comment.