From 24daa4227d62bc8d0877cc31c08932293048e4f9 Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Wed, 9 Oct 2024 15:54:23 -0400 Subject: [PATCH] [WIP] Support encoding and decoding JSONL datasets Signed-off-by: Juan Cruz Viotti --- docs/decode.markdown | 10 ++- docs/encode.markdown | 8 ++- src/command_encode.cc | 61 +++++++++++++++---- src/main.cc | 4 +- test/CMakeLists.txt | 2 + test/encode/pass_schema_less.sh | 4 +- test/encode/pass_schema_less_jsonl.sh | 33 ++++++++++ test/encode/pass_schema_less_jsonl_verbose.sh | 39 ++++++++++++ 8 files changed, 142 insertions(+), 19 deletions(-) create mode 100755 test/encode/pass_schema_less_jsonl.sh create mode 100755 test/encode/pass_schema_less_jsonl_verbose.sh diff --git a/docs/decode.markdown b/docs/decode.markdown index a137adf8..751907c7 100644 --- a/docs/decode.markdown +++ b/docs/decode.markdown @@ -2,7 +2,7 @@ Decode ====== ```sh -jsonschema decode +jsonschema decode ``` This command decodes a JSON document using [JSON @@ -28,8 +28,14 @@ Decoding this file using JSON BinPack will result in the following document: } ``` -### Decode a binary file +### Decode a binary file into a JSON document ```sh jsonschema decode path/to/output.binpack path/to/my/output.json ``` + +### Decode a binary file into a JSONL dataset + +```sh +jsonschema decode path/to/output.binpack path/to/my/dataset.jsonl +``` diff --git a/docs/encode.markdown b/docs/encode.markdown index 90b90731..099a2b82 100644 --- a/docs/encode.markdown +++ b/docs/encode.markdown @@ -2,7 +2,7 @@ Encode ====== ```sh -jsonschema encode +jsonschema encode ``` This command encodes a JSON document using [JSON @@ -33,3 +33,9 @@ $ xxd output.binpack ```sh jsonschema encode path/to/my/document.json path/to/output.binpack ``` + +### Encode a JSONL dataset + +```sh +jsonschema encode path/to/my/dataset.jsonl path/to/output.binpack +``` diff --git a/src/command_encode.cc b/src/command_encode.cc index e0554fc7..cadfa259 100644 --- a/src/command_encode.cc +++ b/src/command_encode.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include #include // EXIT_SUCCESS @@ -33,18 +34,52 @@ auto sourcemeta::jsonschema::cli::encode( resolver(options, options.contains("h") || options.contains("http"))); const auto encoding{sourcemeta::jsonbinpack::load(schema)}; - const auto document{ - sourcemeta::jsontoolkit::from_file(options.at("").front())}; - - std::ofstream output_stream( - std::filesystem::weakly_canonical(options.at("").at(1)), - std::ios::binary); - output_stream.exceptions(std::ios_base::badbit); - sourcemeta::jsonbinpack::Encoder encoder{output_stream}; - encoder.write(document, encoding); - output_stream.flush(); - const auto size{output_stream.tellp()}; - output_stream.close(); - std::cerr << "size: " << size << " bytes\n"; + const std::filesystem::path document{options.at("").front()}; + const auto original_size{std::filesystem::file_size(document)}; + std::cerr << "original file size: " << original_size << " bytes\n"; + + if (document.extension() == ".jsonl") { + log_verbose(options) << "Interpreting input as JSONL: " + << std::filesystem::weakly_canonical(document).string() + << "\n"; + + auto stream{sourcemeta::jsontoolkit::read_file(document)}; + std::ofstream output_stream( + std::filesystem::weakly_canonical(options.at("").at(1)), + std::ios::binary); + output_stream.exceptions(std::ios_base::badbit); + sourcemeta::jsonbinpack::Encoder encoder{output_stream}; + std::size_t count{0}; + for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) { + log_verbose(options) << "Encoding entry #" << count << "\n"; + encoder.write(entry, encoding); + count += 1; + } + + output_stream.flush(); + const auto total_size{output_stream.tellp()}; + output_stream.close(); + std::cerr << "encoded file size: " << total_size << " bytes\n"; + std::cerr << "compression ratio: " + << (static_cast(total_size) * 100 / original_size) + << "%\n"; + } else { + const auto entry{ + sourcemeta::jsontoolkit::from_file(options.at("").front())}; + std::ofstream output_stream( + std::filesystem::weakly_canonical(options.at("").at(1)), + std::ios::binary); + output_stream.exceptions(std::ios_base::badbit); + sourcemeta::jsonbinpack::Encoder encoder{output_stream}; + encoder.write(entry, encoding); + output_stream.flush(); + const auto total_size{output_stream.tellp()}; + output_stream.close(); + std::cerr << "encoded file size: " << total_size << " bytes\n"; + std::cerr << "compression ratio: " + << (static_cast(total_size) * 100 / original_size) + << "%\n"; + } + return EXIT_SUCCESS; } diff --git a/src/main.cc b/src/main.cc index c562dced..a719d77d 100644 --- a/src/main.cc +++ b/src/main.cc @@ -74,11 +74,11 @@ Global Options: Pre-process a JSON Schema into JSON BinPack's canonical form for static analysis. - encode + encode Encode a JSON document or JSONL dataset using JSON BinPack. - decode + decode Decode a JSON document or JSONL dataset using JSON BinPack. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 416a02da..bd3c1e1a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -174,6 +174,8 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema) # Encode add_jsonschema_test_unix(encode/pass_schema_less) +add_jsonschema_test_unix(encode/pass_schema_less_jsonl) +add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose) add_jsonschema_test_unix(encode/fail_no_document) add_jsonschema_test_unix(encode/fail_no_output) diff --git a/test/encode/pass_schema_less.sh b/test/encode/pass_schema_less.sh index 61116e84..8ac25215 100755 --- a/test/encode/pass_schema_less.sh +++ b/test/encode/pass_schema_less.sh @@ -19,7 +19,9 @@ cat << 'EOF' > "$TMP/expected.txt" EOF cat << 'EOF' > "$TMP/expected-output.txt" -size: 11 bytes +original file size: 19 bytes +encoded file size: 11 bytes +compression ratio: 57% EOF diff "$TMP/expected.txt" "$TMP/output.hex" diff --git a/test/encode/pass_schema_less_jsonl.sh b/test/encode/pass_schema_less_jsonl.sh new file mode 100755 index 00000000..46fb7672 --- /dev/null +++ b/test/encode/pass_schema_less_jsonl.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/document.jsonl" +{ "count": 1 } +{ "count": 2 } +{ "count": 3 } +{ "count": 4 } +{ "count": 5 } +EOF + +"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1 +xxd "$TMP/output.binpack" > "$TMP/output.hex" + +cat << 'EOF' > "$TMP/expected.txt" +00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........% +00000010: 1300 052d 1300 0535 ...-...5 +EOF + +cat << 'EOF' > "$TMP/expected-output.txt" +original file size: 75 bytes +encoded file size: 24 bytes +compression ratio: 32% +EOF + +diff "$TMP/expected.txt" "$TMP/output.hex" +diff "$TMP/output.txt" "$TMP/expected-output.txt" diff --git a/test/encode/pass_schema_less_jsonl_verbose.sh b/test/encode/pass_schema_less_jsonl_verbose.sh new file mode 100755 index 00000000..b8a63036 --- /dev/null +++ b/test/encode/pass_schema_less_jsonl_verbose.sh @@ -0,0 +1,39 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/document.jsonl" +{ "count": 1 } +{ "count": 2 } +{ "count": 3 } +{ "count": 4 } +{ "count": 5 } +EOF + +"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" --verbose > "$TMP/output.txt" 2>&1 +xxd "$TMP/output.binpack" > "$TMP/output.hex" + +cat << 'EOF' > "$TMP/expected.txt" +00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........% +00000010: 1300 052d 1300 0535 ...-...5 +EOF + +cat << EOF > "$TMP/expected-output.txt" +original file size: 75 bytes +Interpreting input as JSONL: $(realpath "$TMP")/document.jsonl +Encoding entry #0 +Encoding entry #1 +Encoding entry #2 +Encoding entry #3 +Encoding entry #4 +encoded file size: 24 bytes +compression ratio: 32% +EOF + +diff "$TMP/expected.txt" "$TMP/output.hex" +diff "$TMP/output.txt" "$TMP/expected-output.txt"