Skip to content

Commit

Permalink
RD-9442 Truffle's CSV query renders \N as \\N instead of N in Scala (#102)
Browse files Browse the repository at this point in the history

- Added escape-character handling to the CSV parser; the escape character defaults to "\"
- Added tests for it
- Formatting
  • Loading branch information
alexzerntev authored Aug 23, 2023
1 parent 942b10d commit 9292167
Show file tree
Hide file tree
Showing 6 changed files with 826 additions and 728 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class TruffleCsvParseEntry extends CsvParseEntry with TruffleEntryExtension {
class CsvColumnParser(
encoding: ExpressionNode,
skip: ExpressionNode,
escape: ExpressionNode,
delimiter: ExpressionNode,
quote: ExpressionNode,
nulls: ExpressionNode,
Expand All @@ -76,6 +77,7 @@ class CsvColumnParser(
val iterableParser = new IterableParseCsvString(
str,
skip,
escape,
delimiter,
quote,
new ProgramExpressionNode(lang, frameDescriptor, recordParser),
Expand Down Expand Up @@ -103,6 +105,7 @@ class CsvColumnParser(
url,
encoding,
skip,
escape,
delimiter,
quote,
new ProgramExpressionNode(lang, frameDescriptor, recordParser),
Expand Down Expand Up @@ -168,6 +171,7 @@ object CsvColumnParser {

val encoding = arg("encoding").getOrElse(new StringNode("utf-8"))
val skip = arg("skip").getOrElse(new IntNode("0"))
val escape = arg("escape").getOrElse(OptionSomeNodeGen.create(new StringNode("\\")))
val delimiter = arg("delimiter").getOrElse(new StringNode(","))
val quote = arg("quote").getOrElse(OptionSomeNodeGen.create(new StringNode("\"")))
val nulls =
Expand All @@ -176,7 +180,7 @@ object CsvColumnParser {
val timeFormat = arg("timeFormat").getOrElse(new StringNode("HH:mm[:ss[.SSS]]"))
val dateFormat = arg("dateFormat").getOrElse(new StringNode("yyyy-M-d"))
val timestampFormat = arg("timestampFormat").getOrElse(new StringNode("yyyy-M-d['T'][ ]HH:mm[:ss[.SSS]]"))
new CsvColumnParser(encoding, skip, delimiter, quote, nulls, nans, timeFormat, dateFormat, timestampFormat)
new CsvColumnParser(encoding, skip, escape, delimiter, quote, nulls, nans, timeFormat, dateFormat, timestampFormat)
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ import raw.compiler.rql2.tests.{CompilerTestContext, FailAfterNServer}

trait CsvPackageTest extends CompilerTestContext with FailAfterNServer {

// Three consecutive double-quote characters; its call sites are outside this view.
val ttt = "\"\"\""

// Pipe-delimited fixture: header row a|b|c followed by three data rows.
// NOTE(review): tempFile is defined elsewhere; presumably it writes the text to a
// temporary file and returns something usable as a location — confirm.
private val data = tempFile("""a|b|c
|1|10|100
|2|20|200
|3|30|300""".stripMargin)

// Header plus one data row whose last field is a backslash followed by N. The literal
// is triple-quoted, so the backslash is kept exactly as written.
private val dataWithEscaped = tempFile("""a|b|c
|1|10|\N""".stripMargin)

// Header plus one data row whose last field is N wrapped in double quotes. Of the four
// quote characters after N, the first belongs to the content and the last three close
// the literal.
private val dataWithQuoted = tempFile("""a|b|c
|1|10|"N"""".stripMargin)

// The same three data rows as `data`, without the header row.
private val headerLessData = tempFile("""1|10|100
|2|20|200
|3|30|300""".stripMargin)
Expand Down Expand Up @@ -140,12 +148,6 @@ trait CsvPackageTest extends CompilerTestContext with FailAfterNServer {
|{3, 30, 300}
|]""".stripMargin))

test(rql"""Csv.InferAndRead("$data")""".stripMargin)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: 100},
|{a: 2, b: 20, c: 200},
|{a: 3, b: 30, c: 300}
|]""".stripMargin))

test(rql"""
|let data = Csv.InferAndRead("$data")
|in
Expand Down Expand Up @@ -480,7 +482,7 @@ trait CsvPackageTest extends CompilerTestContext with FailAfterNServer {
)

// Infer and Parse
val ttt = "\"\"\""

// Parsing an inline comma-separated string with no extra options is expected to yield
// a single record with fields _1, _2 and _3.
test(
s"""Csv.InferAndParse("1,2,3")"""
)(_ should evaluateTo("""[{_1: 1, _2: 2, _3: 3}]"""))
Expand Down Expand Up @@ -525,4 +527,44 @@ trait CsvPackageTest extends CompilerTestContext with FailAfterNServer {

// Passing explicit escape and quotes arguments to InferAndParse only has to run.
test(rql"""Csv.InferAndParse("1,2,3", escape="\\", quotes=["\""])""")(it => it should run)

// InferAndRead with no explicit options: the \N field of dataWithEscaped is expected as "N".
test(rql"""Csv.InferAndRead("$dataWithEscaped")""".stripMargin)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "N"}
|]""".stripMargin))

// Csv.Read with no escape argument: the expected value is again "N".
test(
rql"""Csv.Read("$dataWithEscaped", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1)""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "N"}
|]""".stripMargin))

// escape=null: the backslash is expected to survive (the expected value is written "\\N").
test(
rql"""Csv.Read("$dataWithEscaped", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1, escape=null)""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "\\N"}
|]""".stripMargin))

// An explicit escape argument gives the same expected result as omitting it.
test(
rql"""Csv.Read("$dataWithEscaped", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1, escape="\\")""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "N"}
|]""".stripMargin))

// quote=null: the double quotes around N are expected to stay in the value.
test(
rql"""Csv.Read("$dataWithQuoted", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1, quote=null)""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "\"N\""}
|]""".stripMargin))

// No quote argument: the surrounding double quotes are expected to be removed.
test(
rql"""Csv.Read("$dataWithQuoted", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1)""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "N"}
|]""".stripMargin))

// An explicit quote argument gives the same expected result as omitting it.
test(
rql"""Csv.Read("$dataWithQuoted", type collection(record(a:int,b:int,c:string)), delimiter="|", skip=1 , quote="\"")""".stripMargin
)(it => it should evaluateTo("""[
|{a: 1, b: 10, c: "N"}
|]""".stripMargin))

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,81 +29,105 @@
@NodeInfo(shortName = "IterableParseCsvFile")
public class IterableParseCsvFile extends ExpressionNode {

@Child private DirectCallNode childDirectCall;
@Child private ExpressionNode location;
@Child private ExpressionNode encodingExp;
@Child private ExpressionNode skipExp;
@Child private ExpressionNode delimiterExp;
@Child private ExpressionNode quoteExp;
@Child private ExpressionNode nullsExp;
@Child private ExpressionNode nansExp;
@Child private ExpressionNode dateFormatExp;
@Child private ExpressionNode timeFormatExp;
@Child private ExpressionNode datetimeFormatExp;
// Child nodes of this node. childDirectCall wraps the column parser's call target;
// the remaining expression nodes are each evaluated in executeGeneric to obtain one
// input of the CSV read (location, encoding, skip, delimiter, quote, escape, nulls,
// nans and the date/time/datetime formats).
@Child
private DirectCallNode childDirectCall;
@Child
private ExpressionNode location;
@Child
private ExpressionNode encodingExp;
@Child
private ExpressionNode skipExp;
@Child
private ExpressionNode delimiterExp;
@Child
private ExpressionNode quoteExp;
// Evaluated to an option value; only the first character of a defined, non-empty
// string is used as the escape character.
@Child
private ExpressionNode escapeExp;
@Child
private ExpressionNode nullsExp;
@Child
private ExpressionNode nansExp;
@Child
private ExpressionNode dateFormatExp;
@Child
private ExpressionNode timeFormatExp;
@Child
private ExpressionNode datetimeFormatExp;

private final OptionLibrary options = OptionLibrary.getFactory().getUncached();
private final OptionLibrary options = OptionLibrary.getFactory().getUncached();

public IterableParseCsvFile(
ExpressionNode location,
ExpressionNode encodingExp,
ExpressionNode skipExp,
ExpressionNode delimiterExp,
ExpressionNode quoteExp,
ProgramExpressionNode columnParser,
ExpressionNode nullsExp,
ExpressionNode nansExp,
ExpressionNode dateFormatExp,
ExpressionNode timeFormatExp,
ExpressionNode datetimeFormatExp) {
this.childDirectCall = DirectCallNode.create(columnParser.getCallTarget());
this.location = location;
this.encodingExp = encodingExp;
this.skipExp = skipExp;
this.delimiterExp = delimiterExp;
this.quoteExp = quoteExp;
this.nullsExp = nullsExp;
this.nansExp = nansExp;
this.dateFormatExp = dateFormatExp;
this.timeFormatExp = timeFormatExp;
this.datetimeFormatExp = datetimeFormatExp;
}
/**
 * Builds the node from the expressions that supply the CSV read inputs.
 *
 * <p>Each expression is stored as a child and evaluated later in
 * {@code executeGeneric}. The column parser is not stored directly; a
 * {@link DirectCallNode} is created for its call target.
 *
 * @param location expression evaluated to the location to read
 * @param encodingExp expression evaluated to the encoding string
 * @param skipExp expression evaluated to the int skip setting
 * @param escapeExp expression evaluated to the optional escape string
 * @param delimiterExp expression evaluated to the delimiter string
 * @param quoteExp expression evaluated to the optional quote string
 * @param columnParser program whose call target is invoked through a direct call node
 * @param nullsExp expression evaluated to the list of null tokens
 * @param nansExp expression evaluated to the list of NaN tokens
 * @param dateFormatExp expression evaluated to the date format string
 * @param timeFormatExp expression evaluated to the time format string
 * @param datetimeFormatExp expression evaluated to the datetime format string
 */
public IterableParseCsvFile(
    ExpressionNode location,
    ExpressionNode encodingExp,
    ExpressionNode skipExp,
    ExpressionNode escapeExp,
    ExpressionNode delimiterExp,
    ExpressionNode quoteExp,
    ProgramExpressionNode columnParser,
    ExpressionNode nullsExp,
    ExpressionNode nansExp,
    ExpressionNode dateFormatExp,
    ExpressionNode timeFormatExp,
    ExpressionNode datetimeFormatExp) {
  // Where to read from and how to decode it.
  this.location = location;
  this.encodingExp = encodingExp;

  // Structural options, kept in parameter order.
  this.skipExp = skipExp;
  this.escapeExp = escapeExp;
  this.delimiterExp = delimiterExp;
  this.quoteExp = quoteExp;

  // Per-record parsing is delegated to the column parser's call target.
  this.childDirectCall = DirectCallNode.create(columnParser.getCallTarget());

  // Token lists and temporal formats.
  this.nullsExp = nullsExp;
  this.nansExp = nansExp;
  this.dateFormatExp = dateFormatExp;
  this.timeFormatExp = timeFormatExp;
  this.datetimeFormatExp = datetimeFormatExp;
}

public Object executeGeneric(VirtualFrame frame) {
LocationObject locationValue = (LocationObject) location.executeGeneric(frame);
RuntimeContext context = RawContext.get(this).getRuntimeContext();
try {
String encodingValue = encodingExp.executeString(frame);
int skipValue = skipExp.executeInt(frame);
String delimiterValue = delimiterExp.executeString(frame);
Object quoteValue = quoteExp.executeGeneric(frame);
char quoteChar = 0;
boolean useQuote = false;
if (options.isDefined(quoteValue)) {
String quoteCharString = (String) options.get(quoteValue);
if (quoteCharString.length() > 0) {
useQuote = true;
quoteChar = quoteCharString.charAt(0);
/**
 * Evaluates the child expressions and returns a {@code CsvCollection} over the
 * resulting location, configured with the parser settings they produce.
 *
 * <p>For the delimiter, quote and escape strings only the first character is used.
 * An undefined or empty quote option disables quoting; an undefined or empty escape
 * option leaves the escape character at 0.
 *
 * @param frame the frame in which the child expressions are evaluated
 * @return a new {@code CsvCollection}
 * @throws CsvParserRawTruffleException if a child expression raises
 *     {@code UnexpectedResultException}
 */
public Object executeGeneric(VirtualFrame frame) {
  LocationObject source = (LocationObject) location.executeGeneric(frame);
  RuntimeContext runtimeContext = RawContext.get(this).getRuntimeContext();
  try {
    // Child expressions are evaluated in the same order as before.
    String charsetName = encodingExp.executeString(frame);
    int skipCount = skipExp.executeInt(frame);
    String delimiterText = delimiterExp.executeString(frame);

    // Quote: an absent option is treated like an empty string, i.e. no quoting.
    Object quoteOption = quoteExp.executeGeneric(frame);
    String quoteText = options.isDefined(quoteOption) ? (String) options.get(quoteOption) : "";
    boolean quotingEnabled = !quoteText.isEmpty();
    char quoteSymbol = quotingEnabled ? quoteText.charAt(0) : (char) 0;

    // Escape: same decoding, without a separate enabled flag.
    Object escapeOption = escapeExp.executeGeneric(frame);
    String escapeText =
        options.isDefined(escapeOption) ? (String) options.get(escapeOption) : "";
    char escapeSymbol = escapeText.isEmpty() ? (char) 0 : escapeText.charAt(0);

    String[] nullTokens = ((StringList) nullsExp.executeGeneric(frame)).getInnerList();
    String[] nanTokens = ((StringList) nansExp.executeGeneric(frame)).getInnerList();
    String datePattern = dateFormatExp.executeString(frame);
    String timePattern = timeFormatExp.executeString(frame);
    String datetimePattern = datetimeFormatExp.executeString(frame);

    // The delimiter character is taken only after every child has been evaluated,
    // matching the point at which the original read it.
    char delimiterSymbol = delimiterText.charAt(0);
    RawTruffleCsvParserSettings parserSettings =
        new RawTruffleCsvParserSettings(
            delimiterSymbol,
            quotingEnabled,
            quoteSymbol,
            escapeSymbol,
            skipCount,
            nullTokens,
            nanTokens,
            datePattern,
            timePattern,
            datetimePattern);
    return new CsvCollection(source, runtimeContext, childDirectCall, charsetName, parserSettings);
  } catch (UnexpectedResultException cause) {
    // NOTE(review): the two zero arguments are presumably a line/column position — confirm.
    throw new CsvParserRawTruffleException(cause.getMessage(), 0, 0, cause, this);
  }
}
String[] nulls = ((StringList) nullsExp.executeGeneric(frame)).getInnerList();
String[] nans = ((StringList) nansExp.executeGeneric(frame)).getInnerList();
String dateFormat = dateFormatExp.executeString(frame);
String timeFormat = timeFormatExp.executeString(frame);
String datetimeFormat = datetimeFormatExp.executeString(frame);
RawTruffleCsvParserSettings settings =
new RawTruffleCsvParserSettings(
delimiterValue.charAt(0),
useQuote,
quoteChar,
skipValue,
nulls,
nans,
dateFormat,
timeFormat,
datetimeFormat);
return new CsvCollection(locationValue, context, childDirectCall, encodingValue, settings);
} catch (UnexpectedResultException ex) {
throw new CsvParserRawTruffleException(ex.getMessage(), 0, 0, ex, this);
}
}
}
Loading

0 comments on commit 9292167

Please sign in to comment.