From b0bc08aa7755f9938d113c17be8688fe5254a860 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 Sep 2023 15:25:59 -0600 Subject: [PATCH 1/5] Add recoverWithNull to JSONOptions and pass to Table.readJSON Signed-off-by: Andy Grove --- .../main/java/ai/rapids/cudf/JSONOptions.java | 13 +++++++ java/src/main/java/ai/rapids/cudf/Table.java | 12 ++++--- java/src/main/native/src/TableJni.cpp | 16 ++++++--- .../test/java/ai/rapids/cudf/TableTest.java | 35 +++++++++++++++++++ .../resources/people_with_invalid_lines.json | 4 +++ 5 files changed, 71 insertions(+), 9 deletions(-) create mode 100644 java/src/test/resources/people_with_invalid_lines.json diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 85a9eb7beb3..f55acfa6fcc 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; + private final boolean recoverWithNull; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; + recoverWithNull = builder.recoverWithNull; } public boolean isDayFirst() { @@ -44,6 +46,10 @@ public boolean isLines() { return lines; } + public boolean isRecoverWithNull() { + return recoverWithNull; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -57,6 +63,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); } /** @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert offset >= 0 && offset < buffer.length; try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, - buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) { + buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull()))) { return gatherJSONColumns(schema, twm); } } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b05fc9b7bc4..0c029a128a9 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( - JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1344,9 +1345,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( auto source = cudf::io::source_info{reinterpret_cast(buffer), static_cast(buffer_length)}; + auto const recovery_mode = recover_with_null ? + cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1404,7 +1408,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { bool read_buffer = true; if (buffer == 0) { @@ -1448,9 +1453,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( static_cast(buffer_length)} : cudf::io::source_info{filename.get()}; + cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? + cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 3740328615a..d4582105204 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -42,6 +42,7 @@ import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.function.Executable; import java.io.*; import java.math.BigDecimal; @@ -86,6 +87,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro"); private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); + private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -326,6 +328,39 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFileWithInvalidLines() { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + + // test with recoverWithNulls=true + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", null, "Justin") + .column(null, 30, null, 19) + .build(); + Table table = Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + // test with recoverWithNulls=false + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)); + } + } + @Test void testReadJSONFileWithDifferentColumnOrder() { Schema schema = Schema.builder() diff --git a/java/src/test/resources/people_with_invalid_lines.json b/java/src/test/resources/people_with_invalid_lines.json new file mode 100644 index 00000000000..a99592e3eca --- /dev/null +++ b/java/src/test/resources/people_with_invalid_lines.json @@ -0,0 +1,4 @@ +{"name":"Michael"} +{"name":"Andy", "age":30} +this_line_is_not_valid +{"name":"Justin", "age":19} From ee7d309e03043e953635463e0462369f28b2ecce Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 Sep 2023 15:38:54 -0600 Subject: [PATCH 2/5] update copyright year --- java/src/main/java/ai/rapids/cudf/JSONOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index f55acfa6fcc..b03ef8cd4f1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 2dfa1bfb0448e45d9a56edd89cdb2d293831739f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 11 Sep 2023 16:06:44 -0600 Subject: [PATCH 3/5] Remove unused import --- java/src/test/java/ai/rapids/cudf/TableTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index d4582105204..59f0d180c6e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -42,7 +42,6 @@ import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.function.Executable; import java.io.*; import java.math.BigDecimal; From 035400d00ed4e9535479dff1ee30b71a45b8ef47 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Sep 2023 09:04:26 -0600 Subject: [PATCH 4/5] javadoc --- java/src/main/java/ai/rapids/cudf/JSONOptions.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b03ef8cd4f1..f98687df5fa 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -46,6 +46,7 @@ public boolean isLines() { return lines; } + /** Return the value of the recoverWithNull option */ public boolean isRecoverWithNull() { return recoverWithNull; } @@ -86,6 +87,15 @@ public Builder withLines(boolean perLine) { return this; } + /** + * Specify how to handle invalid lines when parsing json. Setting + * recoverWithNull to true will cause null values to be returned + * for invalid lines. Setting recoverWithNull to false will cause + * the parsing to fail with an exception. + * + * @param recoverWithNull true: return nulls, false: throw exception + * @return builder for chaining + */ public Builder withRecoverWithNull(boolean recoverWithNull) { this.recoverWithNull = recoverWithNull; return this; From 9bd830875c6ffdb4407a2ecfad4062c9c57b1bc9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Sep 2023 12:47:03 -0600 Subject: [PATCH 5/5] code formatting --- java/src/main/native/src/TableJni.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0c029a128a9..b208ef8f381 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1346,7 +1346,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( static_cast(buffer_length)}; auto const recovery_mode = recover_with_null ? - cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) @@ -1453,8 +1454,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( static_cast(buffer_length)} : cudf::io::source_info{filename.get()}; - cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? - cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines))