-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
608 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
114 changes: 114 additions & 0 deletions
114
src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package io.deephaven.csv.reading.cells; | ||
|
||
import io.deephaven.csv.containers.ByteSlice; | ||
import io.deephaven.csv.reading.ReaderUtil; | ||
import io.deephaven.csv.util.CsvReaderException; | ||
import io.deephaven.csv.util.MutableBoolean; | ||
import io.deephaven.csv.util.MutableInt; | ||
|
||
import java.io.InputStream; | ||
|
||
/** | ||
* This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream, | ||
* and then it breaks them into fixed-sized cells to return to the caller. | ||
*/ | ||
public class FixedCellGrabber implements CellGrabber { | ||
/** | ||
* Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines. | ||
* This is a somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber | ||
* without rewriting it. | ||
* @param stream The underlying stream. | ||
* @return The "line grabber" | ||
*/ | ||
public static CellGrabber makeLineGrabber(InputStream stream) { | ||
final byte IllegalUtf8 = (byte)0xff; | ||
return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false); | ||
} | ||
|
||
private final CellGrabber lineGrabber; | ||
private final int[] columnWidths; | ||
private final boolean ignoreSurroundingSpaces; | ||
private final boolean utf32CountingMode; | ||
private final ByteSlice rowText; | ||
private boolean needsUnderlyingRefresh; | ||
private int colIndex; | ||
private final MutableBoolean dummy1; | ||
private final MutableInt dummy2; | ||
|
||
/** Constructor. */ | ||
public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces, | ||
boolean utf32CountingMode) { | ||
this.lineGrabber = lineGrabber; | ||
this.columnWidths = columnWidths; | ||
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; | ||
this.utf32CountingMode = utf32CountingMode; | ||
this.rowText = new ByteSlice(); | ||
this.needsUnderlyingRefresh = true; | ||
this.colIndex = 0; | ||
this.dummy1 = new MutableBoolean(); | ||
this.dummy2 = new MutableInt(); | ||
} | ||
|
||
@Override | ||
public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput) throws CsvReaderException { | ||
if (needsUnderlyingRefresh) { | ||
// Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line. | ||
lineGrabber.grabNext(rowText, dummy1, endOfInput); | ||
|
||
if (endOfInput.booleanValue()) { | ||
// Set dest to the empty string, and leave 'endOfInput' set to true. | ||
dest.reset(rowText.data(), rowText.end(), rowText.end()); | ||
return; | ||
} | ||
|
||
needsUnderlyingRefresh = false; | ||
colIndex = 0; | ||
} | ||
|
||
// There is data to return. Count off N characters. The final column gets all remaining characters. | ||
final boolean lastCol = colIndex == columnWidths.length - 1; | ||
final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex]; | ||
takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2); | ||
++colIndex; | ||
needsUnderlyingRefresh = lastCol || dest.size() == 0; | ||
lastInRow.setValue(needsUnderlyingRefresh); | ||
endOfInput.setValue(false); | ||
|
||
if (ignoreSurroundingSpaces) { | ||
ReaderUtil.trimWhitespace(dest); | ||
} | ||
} | ||
|
||
private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake, | ||
boolean utf32CountingMode, MutableInt tempInt) { | ||
final byte[] data = src.data(); | ||
final int cellBegin = src.begin(); | ||
int current = cellBegin; | ||
while (numCharsToTake > 0) { | ||
if (current == src.end()) { | ||
break; | ||
} | ||
final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current, | ||
utf32CountingMode, tempInt); | ||
if (numCharsToTake < tempInt.intValue()) { | ||
// There is not enough space left in the field to store this character. | ||
// This can happen if CsvSpecs is set for the UTF16 counting convention, | ||
// there is one unit left in the field, and we encounter a character outside | ||
// the Basic Multilingual Plane, which would require two units. | ||
break; | ||
} | ||
numCharsToTake -= tempInt.intValue(); | ||
current += utf8Length; | ||
if (current > src.end()) { | ||
throw new RuntimeException("Data error: partial UTF-8 sequence found in input"); | ||
} | ||
} | ||
dest.reset(src.data(), cellBegin, current); | ||
src.reset(src.data(), current, src.end()); | ||
} | ||
|
||
@Override | ||
public int physicalRowNum() { | ||
return lineGrabber.physicalRowNum(); | ||
} | ||
} |
175 changes: 175 additions & 0 deletions
175
src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
package io.deephaven.csv.reading.headers; | ||
|
||
import io.deephaven.csv.CsvSpecs; | ||
import io.deephaven.csv.containers.ByteSlice; | ||
import io.deephaven.csv.reading.ReaderUtil; | ||
import io.deephaven.csv.reading.cells.CellGrabber; | ||
import io.deephaven.csv.tokenization.Tokenizer; | ||
import io.deephaven.csv.util.CsvReaderException; | ||
import io.deephaven.csv.util.MutableBoolean; | ||
import io.deephaven.csv.util.MutableInt; | ||
import io.deephaven.csv.util.MutableObject; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
public class FixedHeaderFinder { | ||
/** | ||
* Determine which headers to use. The result comes from either the first row of the file or the user-specified | ||
* overrides. | ||
*/ | ||
public static String[] determineHeadersToUse( | ||
final CsvSpecs specs, | ||
final CellGrabber lineGrabber, | ||
MutableObject<int[]> columnWidthsResult) | ||
throws CsvReaderException { | ||
String[] headersToUse; | ||
// Get user-specified column widths, if any. If not, this will be an array of length 0. | ||
// UNITS: UTF8 CHARACTERS | ||
int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray(); | ||
if (specs.hasHeaderRow()) { | ||
long skipCount = specs.skipHeaderRows(); | ||
final ByteSlice headerRow = new ByteSlice(); | ||
MutableBoolean lastInRow = new MutableBoolean(); | ||
MutableBoolean endOfInput = new MutableBoolean(); | ||
while (true) { | ||
lineGrabber.grabNext(headerRow, lastInRow, endOfInput); | ||
if (endOfInput.booleanValue()) { | ||
throw new CsvReaderException( | ||
"Can't proceed because hasHeaderRow is set but input file is empty or shorter than skipHeaderRows"); | ||
} | ||
if (skipCount == 0) { | ||
break; | ||
} | ||
--skipCount; | ||
} | ||
final byte paddingByte = (byte)specs.delimiter(); | ||
if (columnWidthsToUse.length == 0) { | ||
// UNITS: UTF8 CHARACTERS | ||
columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention()); | ||
} | ||
|
||
// DESIRED UNITS: UTF8 CHARACTERS | ||
headersToUse = extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention()); | ||
} else { | ||
if (columnWidthsToUse.length == 0) { | ||
throw new CsvReaderException("Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified"); | ||
} | ||
headersToUse = ReaderUtil.makeSyntheticHeaders(columnWidthsToUse.length); | ||
} | ||
|
||
// Whether or not the input had headers, maybe override with client-specified headers. | ||
if (specs.headers().size() != 0) { | ||
if (specs.headers().size() != headersToUse.length) { | ||
final String message = String.format("Library determined %d headers; caller overrode with %d headers", | ||
headersToUse.length, specs.headers().size()); | ||
throw new CsvReaderException(message); | ||
} | ||
headersToUse = specs.headers().toArray(new String[0]); | ||
} | ||
|
||
// Apply column specific overrides. | ||
for (Map.Entry<Integer, String> entry : specs.headerForIndex().entrySet()) { | ||
headersToUse[entry.getKey()] = entry.getValue(); | ||
} | ||
|
||
// DESIRED UNITS: UTF8 CHARACTERS | ||
columnWidthsResult.setValue(columnWidthsToUse); | ||
return headersToUse; | ||
} | ||
|
||
// RETURNS UNITS: UTF8 CHARACTERS | ||
private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) { | ||
// A column start is a non-delimiter character preceded by a delimiter (or present at the start of line). | ||
// If the start of the line is a delimiter, that is an error. | ||
final List<Integer> columnWidths = new ArrayList<>(); | ||
final MutableInt charCountResult = new MutableInt(); | ||
boolean prevCharIsDelimiter = false; | ||
final byte[] data = row.data(); | ||
int numChars = 0; | ||
int currentIndex = row.begin(); | ||
while (true) { | ||
if (currentIndex == row.end()) { | ||
columnWidths.add(numChars); | ||
return columnWidths.stream().mapToInt(Integer::intValue).toArray(); | ||
} | ||
// If this character is not a delimiter, but the previous one was, then this is the start of a new column. | ||
byte ch = data[currentIndex]; | ||
boolean thisCharIsDelimiter = ch == delimiterAsByte; | ||
if (currentIndex == row.begin() && thisCharIsDelimiter) { | ||
throw new IllegalArgumentException( | ||
String.format("Header row cannot start with the delimiter character '%c'", (char)delimiterAsByte)); | ||
} | ||
if (!thisCharIsDelimiter && prevCharIsDelimiter) { | ||
columnWidths.add(numChars); | ||
numChars = 0; | ||
} | ||
prevCharIsDelimiter = thisCharIsDelimiter; | ||
final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex, | ||
useUtf32CountingConvention, charCountResult); | ||
currentIndex += utf8Length; | ||
numChars += charCountResult.intValue(); | ||
} | ||
} | ||
|
||
// UNITS: UTF8 CHARACTERS | ||
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte, | ||
boolean utf32CountingMode) { | ||
final int numCols = columnWidths.length; | ||
if (numCols == 0) { | ||
return new String[0]; | ||
} | ||
final int[] byteWidths = new int[numCols]; | ||
final ByteSlice tempSlice = new ByteSlice(); | ||
final int excessBytes = charWidthsToByteWidths(row, columnWidths, utf32CountingMode, byteWidths); | ||
// Our policy is that the last column gets any excess bytes that are in the row. | ||
byteWidths[numCols - 1] += excessBytes; | ||
final String[] result = new String[numCols]; | ||
|
||
int beginByte = row.begin(); | ||
for (int colNum = 0; colNum != numCols; ++colNum) { | ||
final int proposedEndByte = beginByte + byteWidths[colNum]; | ||
final int actualEndByte = Math.min(proposedEndByte, row.end()); | ||
tempSlice.reset(row.data(), beginByte, actualEndByte); | ||
tempSlice.trimPadding(paddingByte); | ||
result[colNum] = tempSlice.toString(); | ||
beginByte = actualEndByte; | ||
} | ||
return result; | ||
} | ||
|
||
private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode, | ||
int[] byteWidths) { | ||
int numCols = charWidths.length; | ||
if (byteWidths.length != numCols) { | ||
throw new IllegalArgumentException(String.format("Expected charWidths.length (%d) == byteWidths.length (%d)", | ||
charWidths.length, byteWidths.length)); | ||
} | ||
final MutableInt charCountResult = new MutableInt(); | ||
final byte[] data = row.data(); | ||
int start = row.begin(); | ||
int current = start; | ||
int colIndex = 0; | ||
int charCount = 0; | ||
while (true) { | ||
if (colIndex == numCols) { | ||
// Excess bytes not claimed by any column | ||
return row.end() - current; | ||
} | ||
if (charCount == charWidths[colIndex]) { | ||
byteWidths[colIndex] = current - start; | ||
start = current; | ||
charCount = 0; | ||
++colIndex; | ||
continue; | ||
} | ||
|
||
final byte ch = data[current]; | ||
final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - current, utf32CountingMode, | ||
charCountResult); | ||
current += utf8Length; | ||
charCount += charCountResult.intValue(); | ||
} | ||
} | ||
} |
Oops, something went wrong.