From 57559ab6eca2b5d91621658084e3c8b3fbb6d281 Mon Sep 17 00:00:00 2001
From: Juan Cruz Viotti
Date: Wed, 9 Oct 2024 16:50:01 -0400
Subject: [PATCH] Support encoding and decoding JSONL datasets (#176)

Signed-off-by: Juan Cruz Viotti
---
 docs/decode.markdown                          | 10 ++-
 docs/encode.markdown                          |  8 ++-
 src/command_decode.cc                         | 53 +++++++++++++---
 src/command_encode.cc                         | 61 +++++++++++++++----
 src/main.cc                                   |  4 +-
 test/CMakeLists.txt                           |  4 ++
 test/decode/pass_schema_less_jsonl.sh         | 45 ++++++++++++++
 test/decode/pass_schema_less_jsonl_verbose.sh | 51 ++++++++++++++++
 test/encode/pass_schema_less.sh               |  4 +-
 test/encode/pass_schema_less_jsonl.sh         | 33 ++++++++++
 test/encode/pass_schema_less_jsonl_verbose.sh | 39 ++++++++++++
 11 files changed, 285 insertions(+), 27 deletions(-)
 create mode 100755 test/decode/pass_schema_less_jsonl.sh
 create mode 100755 test/decode/pass_schema_less_jsonl_verbose.sh
 create mode 100755 test/encode/pass_schema_less_jsonl.sh
 create mode 100755 test/encode/pass_schema_less_jsonl_verbose.sh

diff --git a/docs/decode.markdown b/docs/decode.markdown
index a137adf8..751907c7 100644
--- a/docs/decode.markdown
+++ b/docs/decode.markdown
@@ -2,7 +2,7 @@ Decode
 ======
 
 ```sh
-jsonschema decode
+jsonschema decode
 ```
 
 This command decodes a JSON document using [JSON
@@ -28,8 +28,14 @@ Decoding this file using JSON BinPack will result in the following document:
 }
 ```
 
-### Decode a binary file
+### Decode a binary file into a JSON document
 
 ```sh
 jsonschema decode path/to/output.binpack path/to/my/output.json
 ```
+
+### Decode a binary file into a JSONL dataset
+
+```sh
+jsonschema decode path/to/output.binpack path/to/my/dataset.jsonl
+```
diff --git a/docs/encode.markdown b/docs/encode.markdown
index 90b90731..099a2b82 100644
--- a/docs/encode.markdown
+++ b/docs/encode.markdown
@@ -2,7 +2,7 @@ Encode
 ======
 
 ```sh
-jsonschema encode
+jsonschema encode
 ```
 
 This command encodes a JSON document using [JSON
@@ -33,3 +33,9 @@ $ xxd output.binpack
 ```sh
 jsonschema encode
path/to/my/document.json path/to/output.binpack
 ```
+
+### Encode a JSONL dataset
+
+```sh
+jsonschema encode path/to/my/dataset.jsonl path/to/output.binpack
+```
diff --git a/src/command_decode.cc b/src/command_decode.cc
index f627daa2..5da2786b 100644
--- a/src/command_decode.cc
+++ b/src/command_decode.cc
@@ -12,6 +12,23 @@
 #include "command.h"
 #include "utils.h"
 
+// Determine whether the given input stream has unread bytes left, by
+// comparing the current read position against the position of the end of
+// the stream. The read position is restored before returning. A stream
+// that is not open or not in a good state is reported as having no data.
+static auto has_data(std::ifstream &stream) -> bool {
+  if (!stream.is_open()) {
+    return false;
+  }
+
+  std::streampos current_pos = stream.tellg();
+  stream.seekg(0, std::ios::end);
+  std::streampos end_pos = stream.tellg();
+  stream.seekg(current_pos);
+
+  return (current_pos < end_pos) && stream.good();
+}
+
 auto sourcemeta::jsonschema::cli::decode(
     const std::span &arguments) -> int {
   const auto options{parse_options(arguments, {})};
@@ -36,20 +53,50 @@ auto sourcemeta::jsonschema::cli::decode(
 
   std::ifstream input_stream{std::filesystem::canonical(options.at("").front()),
                              std::ios::binary};
-  input_stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
   assert(!input_stream.fail());
   assert(input_stream.is_open());
-  sourcemeta::jsonbinpack::Decoder decoder{input_stream};
-  const auto document{decoder.read(encoding)};
 
-  std::ofstream output_stream(
-      std::filesystem::weakly_canonical(options.at("").at(1)),
-      std::ios::binary);
+  const std::filesystem::path output{options.at("").at(1)};
+  std::ofstream output_stream(std::filesystem::weakly_canonical(output),
+                              std::ios::binary);
   output_stream.exceptions(std::ios_base::badbit);
-  sourcemeta::jsontoolkit::prettify(
-      document, output_stream, sourcemeta::jsontoolkit::schema_format_compare);
+  sourcemeta::jsonbinpack::Decoder decoder{input_stream};
+
+  if (output.extension() == ".jsonl") {
+    log_verbose(options)
+        << "Interpreting input as JSONL: "
+        << std::filesystem::weakly_canonical(options.at("").front()).string()
+        << "\n";
+
+    std::size_t count{0};
+    // The input may hold several encoded entries one after the other, so
+    // keep decoding until the input stream is exhausted
+    while (has_data(input_stream)) {
+      log_verbose(options) << "Decoding entry #" << count << "\n";
+      const auto document{decoder.read(encoding)};
+      // Separate consecutive entries with a new line. The new line that
+      // terminates the last entry is written after the loop
+      if (count > 0) {
+        output_stream << "\n";
+      }
+
+      // NOTE(review): every entry is prettified over multiple lines, while
+      // JSON Lines expects one value per line. Confirm this is intended
+      sourcemeta::jsontoolkit::prettify(
+          document, output_stream,
+          sourcemeta::jsontoolkit::schema_format_compare);
+      count += 1;
+    }
+  } else {
+    const auto document{decoder.read(encoding)};
+    sourcemeta::jsontoolkit::prettify(
+        document, output_stream,
+        sourcemeta::jsontoolkit::schema_format_compare);
+  }
+
   output_stream << "\n";
   output_stream.flush();
   output_stream.close();
+
   return EXIT_SUCCESS;
 }
diff --git a/src/command_encode.cc b/src/command_encode.cc
index e0554fc7..cadfa259 100644
--- a/src/command_encode.cc
+++ b/src/command_encode.cc
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include // EXIT_SUCCESS
@@ -33,18 +34,65 @@ auto sourcemeta::jsonschema::cli::encode(
       resolver(options, options.contains("h") || options.contains("http")));
 
   const auto encoding{sourcemeta::jsonbinpack::load(schema)};
-  const auto document{
-      sourcemeta::jsontoolkit::from_file(options.at("").front())};
-
-  std::ofstream output_stream(
-      std::filesystem::weakly_canonical(options.at("").at(1)),
-      std::ios::binary);
-  output_stream.exceptions(std::ios_base::badbit);
-  sourcemeta::jsonbinpack::Encoder encoder{output_stream};
-  encoder.write(document, encoding);
-  output_stream.flush();
-  const auto size{output_stream.tellp()};
-  output_stream.close();
-  std::cerr << "size: " << size << " bytes\n";
+  const std::filesystem::path document{options.at("").front()};
+  const auto original_size{std::filesystem::file_size(document)};
+  std::cerr << "original file size: " << original_size << " bytes\n";
+
+  // Report the size of the encoded output and how it compares to the size
+  // of the original input, as a whole percentage
+  const auto report_sizes{
+      [&original_size](const std::ofstream::pos_type total_size) -> void {
+        std::cerr << "encoded file size: " << total_size << " bytes\n";
+        // An empty input (such as a JSONL dataset with no entries) would
+        // otherwise result in an integer division by zero
+        if (original_size == 0) {
+          std::cerr << "compression ratio: n/a\n";
+          return;
+        }
+
+        std::cerr << "compression ratio: "
+                  << (static_cast<decltype(original_size)>(total_size) * 100 /
+                      original_size)
+                  << "%\n";
+      }};
+
+  if (document.extension() == ".jsonl") {
+    log_verbose(options) << "Interpreting input as JSONL: "
+                         << std::filesystem::weakly_canonical(document).string()
+                         << "\n";
+
+    auto stream{sourcemeta::jsontoolkit::read_file(document)};
+    std::ofstream output_stream(
+        std::filesystem::weakly_canonical(options.at("").at(1)),
+        std::ios::binary);
+    output_stream.exceptions(std::ios_base::badbit);
+    sourcemeta::jsonbinpack::Encoder encoder{output_stream};
+    std::size_t count{0};
+    // Every entry is written through the same encoder and output stream
+    for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) {
+      log_verbose(options) << "Encoding entry #" << count << "\n";
+      encoder.write(entry, encoding);
+      count += 1;
+    }
+
+    output_stream.flush();
+    const auto total_size{output_stream.tellp()};
+    output_stream.close();
+    report_sizes(total_size);
+  } else {
+    const auto entry{
+        sourcemeta::jsontoolkit::from_file(options.at("").front())};
+    std::ofstream output_stream(
+        std::filesystem::weakly_canonical(options.at("").at(1)),
+        std::ios::binary);
+    output_stream.exceptions(std::ios_base::badbit);
+    sourcemeta::jsonbinpack::Encoder encoder{output_stream};
+    encoder.write(entry, encoding);
+    output_stream.flush();
+    const auto total_size{output_stream.tellp()};
+    output_stream.close();
+    report_sizes(total_size);
+  }
+
   return EXIT_SUCCESS;
 }
diff --git a/src/main.cc b/src/main.cc
index c562dced..a719d77d 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -74,11 +74,11 @@ Global Options:
        Pre-process a JSON Schema into JSON BinPack's canonical form
        for static analysis.
 
-   encode
+   encode
 
        Encode a JSON document or JSONL dataset using JSON BinPack.
 
-   decode
+   decode
 
        Decode a JSON document or JSONL dataset using JSON BinPack.
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 416a02da..9da80194 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -174,11 +174,15 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema)
 
 # Encode
 add_jsonschema_test_unix(encode/pass_schema_less)
+add_jsonschema_test_unix(encode/pass_schema_less_jsonl)
+add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose)
 add_jsonschema_test_unix(encode/fail_no_document)
 add_jsonschema_test_unix(encode/fail_no_output)
 
 # Decode
 add_jsonschema_test_unix(decode/pass_schema_less)
+add_jsonschema_test_unix(decode/pass_schema_less_jsonl)
+add_jsonschema_test_unix(decode/pass_schema_less_jsonl_verbose)
 add_jsonschema_test_unix(decode/fail_no_document)
 add_jsonschema_test_unix(decode/fail_no_output)
 
diff --git a/test/decode/pass_schema_less_jsonl.sh b/test/decode/pass_schema_less_jsonl.sh
new file mode 100755
index 00000000..850bcb65
--- /dev/null
+++ b/test/decode/pass_schema_less_jsonl.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+
+# Encode a JSONL dataset without a schema and decode it back into a JSONL
+# file, expecting every entry back (prettified) and no output from `decode`
+
+set -o errexit
+set -o nounset
+
+TMP="$(mktemp -d)"
+clean() { rm -rf "$TMP"; }
+trap clean EXIT
+
+cat << 'EOF' > "$TMP/document.jsonl"
+{ "count": 1 }
+{ "count": 2 }
+{ "count": 3 }
+{ "count": 4 }
+{ "count": 5 }
+EOF
+
+"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
+"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" > "$TMP/output.txt" 2>&1
+
+cat "$TMP/result.jsonl"
+
+cat << EOF > "$TMP/expected.jsonl"
+{
+  "count": 1
+}
+{
+  "count": 2
+}
+{
+  "count": 3
+}
+{
+  "count": 4
+}
+{
+  "count": 5
+}
+EOF
+
+cat << EOF > "$TMP/expected-output.txt"
+EOF
+
+diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
+diff "$TMP/output.txt" "$TMP/expected-output.txt"
diff --git a/test/decode/pass_schema_less_jsonl_verbose.sh b/test/decode/pass_schema_less_jsonl_verbose.sh
new file mode 100755
index 00000000..1bffc9cd
--- /dev/null
+++ b/test/decode/pass_schema_less_jsonl_verbose.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+
+# Same round-trip as the non-verbose JSONL decode test, but passing
+# `--verbose` and expecting one log line per decoded entry
+
+set -o errexit
+set -o nounset
+
+TMP="$(mktemp -d)"
+clean() { rm -rf "$TMP"; }
+trap clean EXIT
+
+cat << 'EOF' > "$TMP/document.jsonl"
+{ "count": 1 }
+{ "count": 2 }
+{ "count": 3 }
+{ "count": 4 }
+{ "count": 5 }
+EOF
+
+"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
+"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" --verbose > "$TMP/output.txt" 2>&1
+
+cat "$TMP/result.jsonl"
+
+cat << EOF > "$TMP/expected.jsonl"
+{
+  "count": 1
+}
+{
+  "count": 2
+}
+{
+  "count": 3
+}
+{
+  "count": 4
+}
+{
+  "count": 5
+}
+EOF
+
+cat << EOF > "$TMP/expected-output.txt"
+Interpreting input as JSONL: $(realpath "$TMP")/output.binpack
+Decoding entry #0
+Decoding entry #1
+Decoding entry #2
+Decoding entry #3
+Decoding entry #4
+EOF
+
+diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
+diff "$TMP/output.txt" "$TMP/expected-output.txt"
diff --git a/test/encode/pass_schema_less.sh b/test/encode/pass_schema_less.sh
index 61116e84..8ac25215 100755
--- a/test/encode/pass_schema_less.sh
+++ b/test/encode/pass_schema_less.sh
@@ -19,7 +19,9 @@ cat << 'EOF' > "$TMP/expected.txt"
 EOF
 
 cat << 'EOF' > "$TMP/expected-output.txt"
-size: 11 bytes
+original file size: 19 bytes
+encoded file size: 11 bytes
+compression ratio: 57%
 EOF
 
 diff "$TMP/expected.txt" "$TMP/output.hex"
diff --git a/test/encode/pass_schema_less_jsonl.sh b/test/encode/pass_schema_less_jsonl.sh
new file mode 100755
index 00000000..46fb7672
--- /dev/null
+++ b/test/encode/pass_schema_less_jsonl.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+# Encode a JSONL dataset without a schema, expecting the exact bytes of the
+# output file and the size statistics printed by `encode`
+
+set -o errexit
+set -o nounset
+
+TMP="$(mktemp -d)"
+clean() { rm -rf "$TMP"; }
+trap clean EXIT
+
+cat << 'EOF' > "$TMP/document.jsonl"
+{ "count": 1 }
+{ "count": 2 }
+{ "count": 3 }
+{ "count": 4 }
+{ "count": 5 }
+EOF
+
+"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1
+xxd "$TMP/output.binpack" > "$TMP/output.hex"
+
+cat << 'EOF' > "$TMP/expected.txt"
+00000000: 1306 636f 756e 7415 1300 091d 1300 0525  ..count........%
+00000010: 1300 052d 1300 0535                      ...-...5
+EOF
+
+cat << 'EOF' > "$TMP/expected-output.txt"
+original file size: 75 bytes
+encoded file size: 24 bytes
+compression ratio: 32%
+EOF
+
+diff "$TMP/expected.txt" "$TMP/output.hex"
+diff "$TMP/output.txt" "$TMP/expected-output.txt"
diff --git a/test/encode/pass_schema_less_jsonl_verbose.sh b/test/encode/pass_schema_less_jsonl_verbose.sh
new file mode 100755
index 00000000..b8a63036
--- /dev/null
+++ b/test/encode/pass_schema_less_jsonl_verbose.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Same as the non-verbose JSONL encode test, but passing `--verbose` and
+# expecting one log line per encoded entry between the size statistics
+
+set -o errexit
+set -o nounset
+
+TMP="$(mktemp -d)"
+clean() { rm -rf "$TMP"; }
+trap clean EXIT
+
+cat << 'EOF' > "$TMP/document.jsonl"
+{ "count": 1 }
+{ "count": 2 }
+{ "count": 3 }
+{ "count": 4 }
+{ "count": 5 }
+EOF
+
+"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" --verbose > "$TMP/output.txt" 2>&1
+xxd "$TMP/output.binpack" > "$TMP/output.hex"
+
+cat << 'EOF' > "$TMP/expected.txt"
+00000000: 1306 636f 756e 7415 1300 091d 1300 0525  ..count........%
+00000010: 1300 052d 1300 0535                      ...-...5
+EOF
+
+cat << EOF > "$TMP/expected-output.txt"
+original file size: 75 bytes
+Interpreting input as JSONL: $(realpath "$TMP")/document.jsonl
+Encoding entry #0
+Encoding entry #1
+Encoding entry #2
+Encoding entry #3
+Encoding entry #4
+encoded file size: 24 bytes
+compression ratio: 32%
+EOF
+
+diff "$TMP/expected.txt" "$TMP/output.hex"
+diff "$TMP/output.txt" "$TMP/expected-output.txt"