Skip to content

Commit

Permalink
[WIP] Support encoding and decoding JSONL datasets
Browse files Browse the repository at this point in the history
Signed-off-by: Juan Cruz Viotti <[email protected]>
  • Loading branch information
jviotti committed Oct 9, 2024
1 parent 13b7fd0 commit cc73e11
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 14 deletions.
61 changes: 48 additions & 13 deletions src/command_encode.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <sourcemeta/jsonbinpack/compiler.h>
#include <sourcemeta/jsonbinpack/runtime.h>
#include <sourcemeta/jsontoolkit/json.h>
#include <sourcemeta/jsontoolkit/jsonl.h>
#include <sourcemeta/jsontoolkit/jsonschema.h>

#include <cstdlib> // EXIT_SUCCESS
Expand Down Expand Up @@ -33,18 +34,52 @@ auto sourcemeta::jsonschema::cli::encode(
resolver(options, options.contains("h") || options.contains("http")));
const auto encoding{sourcemeta::jsonbinpack::load(schema)};

const auto document{
sourcemeta::jsontoolkit::from_file(options.at("").front())};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(document, encoding);
output_stream.flush();
const auto size{output_stream.tellp()};
output_stream.close();
std::cerr << "size: " << size << " bytes\n";
const std::filesystem::path document{options.at("").front()};
const auto original_size{std::filesystem::file_size(document)};
std::cerr << "original file size: " << original_size << " bytes\n";

if (document.extension() == ".jsonl") {
log_verbose(options) << "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(document).string()
<< "\n";

auto stream{sourcemeta::jsontoolkit::read_file(document)};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
std::size_t count{0};
for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) {
log_verbose(options) << "Encoding entry #" << count << "\n";
encoder.write(entry, encoding);
count += 1;
}

output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
} else {
const auto entry{
sourcemeta::jsontoolkit::from_file(options.at("").front())};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(entry, encoding);
output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
}

return EXIT_SUCCESS;
}
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema)

# Encode
add_jsonschema_test_unix(encode/pass_schema_less)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(encode/fail_no_document)
add_jsonschema_test_unix(encode/fail_no_output)

Expand Down
4 changes: 3 additions & 1 deletion test/encode/pass_schema_less.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ cat << 'EOF' > "$TMP/expected.txt"
EOF

cat << 'EOF' > "$TMP/expected-output.txt"
size: 11 bytes
original file size: 19 bytes
encoded file size: 11 bytes
compression ratio: 57%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
Expand Down
33 changes: 33 additions & 0 deletions test/encode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/sh

# Encodes a five-entry JSONL document without passing a schema, then checks
# both the hex dump of the binary output and the text the CLI printed.
#
# Usage: pass_schema_less_jsonl.sh <path-to-cli-binary>

set -o errexit
set -o nounset

# Scratch directory for all fixtures; removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input fixture: five lines of 15 bytes each (newline included), 75 bytes total.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# stdout and stderr are captured together into output.txt.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# The expected dump must match xxd's layout byte-for-byte: two spaces separate
# the hex column from the ASCII column, and a short final row is padded with
# spaces so its ASCII column lines up with the full rows above it.
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525  ..count........%
00000010: 1300 052d 1300 0535                      ...-...5
EOF

# 24 encoded bytes out of 75 original bytes is 32% with integer division.
cat << 'EOF' > "$TMP/expected-output.txt"
original file size: 75 bytes
encoded file size: 24 bytes
compression ratio: 32%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
39 changes: 39 additions & 0 deletions test/encode/pass_schema_less_jsonl_verbose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/sh

# Encodes a five-entry JSONL document without passing a schema, using
# --verbose, then checks both the hex dump of the binary output and the text
# the CLI printed (including the per-entry verbose lines).
#
# Usage: pass_schema_less_jsonl_verbose.sh <path-to-cli-binary>

set -o errexit
set -o nounset

# Scratch directory for all fixtures; removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input fixture: five lines of 15 bytes each (newline included), 75 bytes total.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# stdout and stderr are captured together into output.txt.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" --verbose > "$TMP/output.txt" 2>&1
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# The expected dump must match xxd's layout byte-for-byte: two spaces separate
# the hex column from the ASCII column, and a short final row is padded with
# spaces so its ASCII column lines up with the full rows above it.
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525  ..count........%
00000010: 1300 052d 1300 0535                      ...-...5
EOF

# The heredoc delimiter is deliberately unquoted here so that
# $(realpath "$TMP") expands to the resolved scratch directory path, which is
# what the verbose "Interpreting input" line is expected to contain.
cat << EOF > "$TMP/expected-output.txt"
original file size: 75 bytes
Interpreting input as JSONL: $(realpath "$TMP")/document.jsonl
Encoding entry #0
Encoding entry #1
Encoding entry #2
Encoding entry #3
Encoding entry #4
encoded file size: 24 bytes
compression ratio: 32%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"

0 comments on commit cc73e11

Please sign in to comment.