From 6ad6e254adcf37a16bd37d8492ff49d934ead1e7 Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Tue, 5 Dec 2023 13:31:02 -0500 Subject: [PATCH] Issue #162 - internal error when last cell is empty and last line is not terminated (#163) --- .../io/deephaven/csv/reading/CellGrabber.java | 53 +++++++++---------- .../io/deephaven/csv/reading/CsvReader.java | 10 ++-- .../csv/reading/ParseInputToDenseStorage.java | 28 +++++----- .../java/io/deephaven/csv/CsvReaderTest.java | 19 +++++++ 4 files changed, 63 insertions(+), 47 deletions(-) diff --git a/src/main/java/io/deephaven/csv/reading/CellGrabber.java b/src/main/java/io/deephaven/csv/reading/CellGrabber.java index 3c7da931..4452b719 100644 --- a/src/main/java/io/deephaven/csv/reading/CellGrabber.java +++ b/src/main/java/io/deephaven/csv/reading/CellGrabber.java @@ -75,37 +75,33 @@ public CellGrabber( * trimming. * * @param dest The result, as a {@link ByteSlice}. The ByteSlice is invalidated by the next call to grabNext. - * @param lastInRow An out parameter whose contents are only specified if this method returns true. Its contents - * will be set to true if the cell just read was the last cell in the row, otherwise they will be set to - * false. - * @return true if a cell was read; false if at end of input. + * @param lastInRow An out parameter which will be set to true if the cell just read was the last cell in the row, + * otherwise it will be set to false. + * @param endOfInput An out parameter which will be set to true if the cell just read encountered the end of the + * input, otherwise it will be set to false. */ - public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow) - throws CsvReaderException { + public void grabNext(final ByteSlice dest, final MutableBoolean lastInRow, + final MutableBoolean endOfInput) throws CsvReaderException { spillBuffer.clear(); startOffset = offset; if (ignoreSurroundingSpaces) { skipWhitespace(); } - if (!tryEnsureMore()) { - return false; - } // Is first char the quote char? - if (buffer[offset] == quoteChar) { + if (tryEnsureMore() && buffer[offset] == quoteChar) { ++offset; - processQuotedMode(dest, lastInRow); + processQuotedMode(dest, lastInRow, endOfInput); if (trim) { trimWhitespace(dest); } } else { - processUnquotedMode(dest, lastInRow); + processUnquotedMode(dest, lastInRow, endOfInput); if (ignoreSurroundingSpaces) { trimWhitespace(dest); } } - return true; } /** @@ -114,8 +110,8 @@ public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow) * @param lastInRow An out parameter. Its contents will be set to true if the cell just read was the last cell in * the row, otherwise the contents will be set to false. */ - private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow) - throws CsvReaderException { + private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow, + final MutableBoolean endOfInput) throws CsvReaderException { startOffset = offset; boolean prevCharWasCarriageReturn = false; while (true) { @@ -162,7 +158,7 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn // We got out of the quoted string. Consume any trailing matter after the quote and before the // field // delimiter. Hopefully that trailing matter is just whitespace, but we shall see. - finishField(dest, lastInRow); + finishField(dest, lastInRow, endOfInput); // From this point on, note that dest is a slice that may point to the underlying input buffer // or the spill buffer. Take care from this point on to not disturb the input (e.g. by reading @@ -183,10 +179,10 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn /** * Process characters in "unquoted mode". This is easy: eat characters until the next field or line delimiter. */ - private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow) - throws CsvReaderException { + private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow, + final MutableBoolean endOfInput) throws CsvReaderException { startOffset = offset; - finishField(dest, lastInRow); + finishField(dest, lastInRow, endOfInput); } /** Skip whitespace but do not consider the field delimiter to be whitespace. */ @@ -211,28 +207,30 @@ private void skipWhitespace() throws CsvReaderException { * @param lastInRow An out parameter. Its contents are set to true if the cell was the last one in the row. * Otherwise, its contents are set to false. */ - private void finishField(final ByteSlice dest, final MutableBoolean lastInRow) + private void finishField(final ByteSlice dest, final MutableBoolean lastInRow, + final MutableBoolean endOfInput) throws CsvReaderException { while (true) { - if (offset == size) { - if (!tryEnsureMore()) { - finish(dest); - // End of file sets last in row. - lastInRow.setValue(true); - return; - } + if (!tryEnsureMore()) { + finish(dest); + // End of input sets both flags. + lastInRow.setValue(true); + endOfInput.setValue(true); + return; } final byte ch = buffer[offset]; if (ch == fieldDelimiter) { finish(dest); ++offset; // ... and skip over the field delimiter. lastInRow.setValue(false); + endOfInput.setValue(false); return; } if (ch == '\n') { finish(dest); ++offset; lastInRow.setValue(true); + endOfInput.setValue(false); ++physicalRowNum; return; } @@ -252,6 +250,7 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow) finish(dest); dest.reset(dest.data(), dest.begin(), dest.end() - excess); lastInRow.setValue(true); + endOfInput.setValue(false); ++physicalRowNum; return; } diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java index 19d60eac..ad8036e8 100644 --- a/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java @@ -287,7 +287,7 @@ private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] heade } /** - * Try to read one row from the input. Returns false if the input ends before one row has been read. + * Try to read one row from the input. Returns null if the input is empty * * @return The first row as a byte[][] or null if the input was exhausted. */ @@ -297,14 +297,16 @@ private static byte[][] tryReadOneRow(final CellGrabber grabber) throws CsvReade // Grab the header final ByteSlice slice = new ByteSlice(); final MutableBoolean lastInRow = new MutableBoolean(); + final MutableBoolean endOfInput = new MutableBoolean(); do { - if (!grabber.grabNext(slice, lastInRow)) { - return null; - } + grabber.grabNext(slice, lastInRow, endOfInput); final byte[] item = new byte[slice.size()]; slice.copyTo(item, 0); headers.add(item); } while (!lastInRow.booleanValue()); + if (headers.size() == 1 && headers.get(0).length == 0 && endOfInput.booleanValue()) { + return null; + } return headers.toArray(new byte[0][]); } diff --git a/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java b/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java index 82e777c5..33ad20e6 100644 --- a/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java +++ b/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java @@ -90,6 +90,7 @@ private static class RowAppender { private final int numCols; private final ByteSlice byteSlice; private final MutableBoolean lastInRow; + private final MutableBoolean endOfInput; private final byte[][] nullValueLiteralsAsUtf8; public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDataRow, final CellGrabber grabber, @@ -109,6 +110,7 @@ public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDat } byteSlice = new ByteSlice(); lastInRow = new MutableBoolean(); + endOfInput = new MutableBoolean(); // Here we prepare ahead of time what we are going to do when we encounter a short row. Say the input looks // like: // 10,20,30,40,50 @@ -174,18 +176,15 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE int colNum = 0; for (colNum = 0; colNum < numCols; ++colNum) { try { - if (!grabber.grabNext(byteSlice, lastInRow)) { - if (colNum == 0) { - // Input exhausted - return RowResult.END_OF_INPUT; - } - // Can't get here. If there is any data at all in the last row, and *then* the file - // ends, grabNext() will return true, with lastInRow set. - throw new RuntimeException("Logic error: uncaught short last row"); - } + grabber.grabNext(byteSlice, lastInRow, endOfInput); if (lastInRow.booleanValue()) { - if (byteSlice.size() == 0 && colNum == 0 && specs.ignoreEmptyLines()) { - return RowResult.IGNORED_EMPTY_ROW; + if (colNum == 0 && byteSlice.size() == 0) { + if (endOfInput.booleanValue()) { + return RowResult.END_OF_INPUT; + } + if (specs.ignoreEmptyLines()) { + return RowResult.IGNORED_EMPTY_ROW; + } } appendToDenseStorageWriter(dsws[colNum], byteSlice, writeToConsumer); ++colNum; @@ -208,10 +207,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE } // Eat. while (!lastInRow.booleanValue()) { - if (!grabber.grabNext(byteSlice, lastInRow)) { - // Can't happen. Won't get end of input while finishing excess row. - throw new RuntimeException("Logic error: end of input while finishing excess row"); - } + grabber.grabNext(byteSlice, lastInRow, endOfInput); } } @@ -226,7 +222,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE throw new CsvReaderException(message); } - // Pad the row with a null vlaue literal appropriate for each column. + // Pad the row with a null value literal appropriate for each column. while (colNum < numCols) { final byte[] nvl = nullValueLiteralsAsUtf8[colNum]; if (nvl == null) { diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java index 80cbbebb..a1901648 100644 --- a/src/test/java/io/deephaven/csv/CsvReaderTest.java +++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -184,6 +184,25 @@ public void bug133() throws CsvReaderException { ignoringSpaces); } + /** + * Reported in Deephaven CSV Issue #162. The + * library was throwing an internal error when the last cell is empty and the last line is not terminated with a + * newline. + */ + @Test + public void bug162() throws CsvReaderException { + // Last cell is empty and the line is not terminated. + final String input = "A,B\n" + + "apple,banana\n" + + "cherry,"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("A", "apple", "cherry"), + Column.ofRefs("B", "banana", null)); + invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected); + } + @Test public void validates() { final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "