Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for deserializing list-encoded JSON structs [#6558] #6643

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions arrow-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder};
use half::f16;
use serde_json::{Number, Value};

/// Specifies how `StructArray`s are represented in JSON, both when reading and when writing.
///
/// A struct with fields `("a", Int32)` and `("b", Utf8)` could be represented as
/// a JSON object (`{"a": 1, "b": "c"}`) or a JSON list (`[1, "c"]`). This enum controls
/// which form the Reader will accept and which form the Writer will produce.
///
/// For objects, the order of the keys does not matter.
/// For lists, there must be exactly as many entries as struct fields, in the same order.
Comment on lines +77 to +84
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current doc here seems specific to reading/decoding; should also mention write since same enum is used as write option.

Could also put extra information such as justification for ListOnly (i.e. condensed version assuming schema is known separately, as you've explained in PR)

#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum StructMode {
    /// Structs are encoded to, and decoded from, JSON objects keyed by field
    /// name, e.g. `{"a": 1, "b": "c"}`. This is the default mode.
    #[default]
    ObjectOnly,
    /// Structs are encoded to, and decoded from, JSON lists holding one entry
    /// per field, e.g. `[1, "c"]`.
    ListOnly,
}

/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
pub trait JsonSerializable: 'static {
/// Converts self into json value if its possible
Expand Down Expand Up @@ -156,4 +173,72 @@ mod tests {
);
assert_eq!(None, f32::NAN.into_json_value());
}

#[test]
fn test_json_roundtrip_structs() {
    use crate::writer::LineDelimited;
    use arrow_schema::{DataType, Field, Fields, Schema};
    use std::sync::Arc;

    // Schema under test: a non-nullable struct column `c1` (a nullable Int32
    // `c11` plus a nested single-field struct `c12`) next to a Utf8 column `c2`.
    let c12 = Field::new(
        "c12",
        DataType::Struct(Fields::from(vec![Field::new("c121", DataType::Utf8, false)])),
        false,
    );
    let c1 = Field::new(
        "c1",
        DataType::Struct(vec![Field::new("c11", DataType::Int32, true), c12].into()),
        false,
    );
    let schema = Arc::new(Schema::new(vec![c1, Field::new("c2", DataType::Utf8, false)]));

    // One line-delimited document per struct mode. The second row has no value
    // for the nullable `c11`: the key is absent in object form, while list form
    // carries an explicit `null` in that position.
    let cases = [
        (
            StructMode::ObjectOnly,
            r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#,
        ),
        (
            StructMode::ListOnly,
            r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#,
        ),
    ];

    for (mode, text) in cases {
        let expected = text.as_bytes();
        let reader = ReaderBuilder::new(schema.clone())
            .with_struct_mode(mode)
            .build(expected)
            .unwrap();

        // Write every decoded batch back out using the same struct mode.
        let mut actual: Vec<u8> = Vec::new();
        let mut writer = WriterBuilder::new()
            .with_struct_mode(mode)
            .build::<_, LineDelimited>(&mut actual);
        for batch in reader {
            writer.write(&batch.unwrap()).unwrap();
        }

        // The round trip must reproduce the input bytes exactly.
        assert_eq!(expected, &actual);
    }
}
}
3 changes: 3 additions & 0 deletions arrow-json/src/reader/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::OffsetSizeTrait;
use arrow_buffer::buffer::NullBuffer;
Expand All @@ -37,6 +38,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let field = match &data_type {
DataType::List(f) if !O::IS_LARGE => f,
Expand All @@ -48,6 +50,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive,
strict_mode,
field.is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
4 changes: 4 additions & 0 deletions arrow-json/src/reader/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
Expand All @@ -36,6 +37,7 @@ impl MapArrayDecoder {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let fields = match &data_type {
DataType::Map(_, true) => {
Expand All @@ -59,12 +61,14 @@ impl MapArrayDecoder {
coerce_primitive,
strict_mode,
fields[0].is_nullable(),
struct_mode,
)?;
let values = make_decoder(
fields[1].data_type().clone(),
coerce_primitive,
strict_mode,
fields[1].is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
Loading
Loading