Skip to content

Commit

Permalink
[WIP] Support encoding and decoding JSONL datasets
Browse files Browse the repository at this point in the history
Signed-off-by: Juan Cruz Viotti <[email protected]>
  • Loading branch information
jviotti committed Oct 9, 2024
1 parent 48f3f42 commit e1dcc60
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 18 deletions.
2 changes: 1 addition & 1 deletion DEPENDENCIES
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ noa https://github.com/sourcemeta/noa 517e88aef5981b88ac6bb8caff15d17dffcb4320
jsontoolkit https://github.com/sourcemeta/jsontoolkit 9abbaee71e9e00e95632858d29c7ebe5c2a723b0
hydra https://github.com/sourcemeta/hydra 3c53d3fdef79e9ba603d48470a508cc45472a0dc
alterschema https://github.com/sourcemeta/alterschema 358df64771979da64e043a416cf340d83a5382ca
jsonbinpack https://github.com/sourcemeta/jsonbinpack 3046d0b9820b7da7f10645ca5fae6b0ffb749422
jsonbinpack https://github.com/sourcemeta/jsonbinpack b25d54363f5a88cd23c1af41e4c6025b9c94d0d6
52 changes: 39 additions & 13 deletions src/command_encode.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <sourcemeta/jsonbinpack/compiler.h>
#include <sourcemeta/jsonbinpack/runtime.h>
#include <sourcemeta/jsontoolkit/json.h>
#include <sourcemeta/jsontoolkit/jsonl.h>
#include <sourcemeta/jsontoolkit/jsonschema.h>

#include <cstdlib> // EXIT_SUCCESS
Expand Down Expand Up @@ -33,18 +34,43 @@ auto sourcemeta::jsonschema::cli::encode(
resolver(options, options.contains("h") || options.contains("http")));
const auto encoding{sourcemeta::jsonbinpack::load(schema)};

const auto document{
sourcemeta::jsontoolkit::from_file(options.at("").front())};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(document, encoding);
output_stream.flush();
const auto size{output_stream.tellp()};
output_stream.close();
std::cerr << "size: " << size << " bytes\n";
const std::filesystem::path document{options.at("").front()};
if (document.extension() == ".jsonl") {
log_verbose(options) << "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(document).string()
<< "\n";

auto stream{sourcemeta::jsontoolkit::read_file(document)};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
std::size_t count{0};
for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) {
log_verbose(options) << "Encoding entry #" << count << "\n";
encoder.write(entry, encoding);
count += 1;
}

output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "total size: " << total_size << " bytes\n";
} else {
const auto entry{
sourcemeta::jsontoolkit::from_file(options.at("").front())};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(entry, encoding);
output_stream.flush();
const auto size{output_stream.tellp()};
output_stream.close();
std::cerr << "size: " << size << " bytes\n";
}

return EXIT_SUCCESS;
}
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema)

# Encode
add_jsonschema_test_unix(encode/pass_schema_less)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(encode/fail_no_document)
add_jsonschema_test_unix(encode/fail_no_output)

Expand Down
31 changes: 31 additions & 0 deletions test/encode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/sh

# Test: run the `encode` command of the executable given as "$1" on a
# five-line JSONL document without passing any schema, then check both the
# bytes written to the output file and the text printed by the command.

# Abort on the first failing command (including the final `diff` checks)
# and treat references to unset variables as errors.
set -o errexit
set -o nounset

# Scratch directory for all fixtures; removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input fixture: one JSON object per line. The `.jsonl` extension is part of
# the fixture name on purpose.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode the document. Both stdout and stderr are captured in output.txt so
# the size report can be compared below regardless of which stream it uses.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1
# Turn the binary result into a textual hex dump that `diff` can compare.
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# Expected hex dump of the encoded file (24 bytes in total).
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........%
00000010: 1300 052d 1300 0535 ...-...5
EOF

# Expected console output: only the total size line.
cat << 'EOF' > "$TMP/expected-output.txt"
total size: 24 bytes
EOF

# Any difference makes `diff` exit non-zero, which fails the test via errexit.
diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
37 changes: 37 additions & 0 deletions test/encode/pass_schema_less_jsonl_verbose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/sh

# Test: same scenario as the schema-less JSONL encode test, but with
# `--verbose`, so the expected console output additionally contains the
# "Interpreting input as JSONL" line and one "Encoding entry #N" line per
# input entry.

# Abort on the first failing command (including the final `diff` checks)
# and treat references to unset variables as errors.
set -o errexit
set -o nounset

# Scratch directory for all fixtures; removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input fixture: five JSON objects, one per line.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode with verbose logging. stdout and stderr are merged into output.txt
# so the interleaved log lines and the size report are compared together.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" --verbose > "$TMP/output.txt" 2>&1
# Turn the binary result into a textual hex dump that `diff` can compare.
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# Expected hex dump of the encoded file (24 bytes in total).
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........%
00000010: 1300 052d 1300 0535 ...-...5
EOF

# NOTE: the delimiter below is deliberately unquoted so that
# $(realpath "$TMP") is expanded by the shell when writing the expectation.
# Presumably the command prints a canonicalized path, which can differ from
# the raw mktemp result when the temp directory sits behind a symlink.
# Entries are numbered from 0.
cat << EOF > "$TMP/expected-output.txt"
Interpreting input as JSONL: $(realpath "$TMP")/document.jsonl
Encoding entry #0
Encoding entry #1
Encoding entry #2
Encoding entry #3
Encoding entry #4
total size: 24 bytes
EOF

# Any difference makes `diff` exit non-zero, which fails the test via errexit.
diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
8 changes: 4 additions & 4 deletions vendor/jsonbinpack/src/runtime/encoder_string.cc

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e1dcc60

Please sign in to comment.