Skip to content

Commit

Permalink
Support encoding and decoding JSONL datasets
Browse files Browse the repository at this point in the history
Signed-off-by: Juan Cruz Viotti <[email protected]>
  • Loading branch information
jviotti committed Oct 9, 2024
1 parent 13b7fd0 commit dc8f992
Show file tree
Hide file tree
Showing 11 changed files with 285 additions and 27 deletions.
10 changes: 8 additions & 2 deletions docs/decode.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Decode
======

```sh
jsonschema decode <output.binpack> <output.json>
jsonschema decode <output.binpack> <output.json|.jsonl>
```

This command decodes a JSON document using [JSON
Expand All @@ -28,8 +28,14 @@ Decoding this file using JSON BinPack will result in the following document:
}
```

### Decode a binary file
### Decode a binary file into a JSON document

```sh
jsonschema decode path/to/output.binpack path/to/my/output.json
```

### Decode a binary file into a JSONL dataset

```sh
jsonschema decode path/to/output.binpack path/to/my/dataset.jsonl
```
8 changes: 7 additions & 1 deletion docs/encode.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Encode
======

```sh
jsonschema encode <document.json> <output.binpack>
jsonschema encode <document.json|.jsonl> <output.binpack>
```

This command encodes a JSON document using [JSON
Expand Down Expand Up @@ -33,3 +33,9 @@ $ xxd output.binpack
```sh
jsonschema encode path/to/my/document.json path/to/output.binpack
```

### Encode a JSONL dataset

```sh
jsonschema encode path/to/my/dataset.jsonl path/to/output.binpack
```
53 changes: 45 additions & 8 deletions src/command_decode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@
#include "command.h"
#include "utils.h"

// Reports whether `stream` still has unread bytes between its current read
// position and the end of the file.
//
// Returns false right away when the stream is not open. Otherwise the
// function seeks to the end to learn the end offset and then seeks back, so
// the caller's read position is the one it had before the call. The result
// is true only when the current position lies strictly before the end and
// the stream is still in a good state after both seeks.
static auto has_data(std::ifstream &stream) -> bool {
  if (stream.is_open()) {
    // Remember where the caller was so the probe can be undone below.
    const auto resume_at{stream.tellg()};
    stream.seekg(0, std::ios::end);
    const auto finish{stream.tellg()};
    stream.seekg(resume_at);

    return stream.good() && (resume_at < finish);
  }

  return false;
}

auto sourcemeta::jsonschema::cli::decode(
const std::span<const std::string> &arguments) -> int {
const auto options{parse_options(arguments, {})};
Expand All @@ -36,20 +49,44 @@ auto sourcemeta::jsonschema::cli::decode(

std::ifstream input_stream{std::filesystem::canonical(options.at("").front()),
std::ios::binary};
input_stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
assert(!input_stream.fail());
assert(input_stream.is_open());
sourcemeta::jsonbinpack::Decoder decoder{input_stream};
const auto document{decoder.read(encoding)};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
const std::filesystem::path output{options.at("").at(1)};
std::ofstream output_stream(std::filesystem::weakly_canonical(output),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsontoolkit::prettify(
document, output_stream, sourcemeta::jsontoolkit::schema_format_compare);
sourcemeta::jsonbinpack::Decoder decoder{input_stream};

if (output.extension() == ".jsonl") {
log_verbose(options)
<< "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(options.at("").front()).string()
<< "\n";

std::size_t count{0};
while (has_data(input_stream)) {
log_verbose(options) << "Decoding entry #" << count << "\n";
const auto document{decoder.read(encoding)};
if (count > 0) {
output_stream << "\n";
}

sourcemeta::jsontoolkit::prettify(
document, output_stream,
sourcemeta::jsontoolkit::schema_format_compare);
count += 1;
}
} else {
const auto document{decoder.read(encoding)};
sourcemeta::jsontoolkit::prettify(
document, output_stream,
sourcemeta::jsontoolkit::schema_format_compare);
}

output_stream << "\n";
output_stream.flush();
output_stream.close();

return EXIT_SUCCESS;
}
61 changes: 48 additions & 13 deletions src/command_encode.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <sourcemeta/jsonbinpack/compiler.h>
#include <sourcemeta/jsonbinpack/runtime.h>
#include <sourcemeta/jsontoolkit/json.h>
#include <sourcemeta/jsontoolkit/jsonl.h>
#include <sourcemeta/jsontoolkit/jsonschema.h>

#include <cstdlib> // EXIT_SUCCESS
Expand Down Expand Up @@ -33,18 +34,52 @@ auto sourcemeta::jsonschema::cli::encode(
resolver(options, options.contains("h") || options.contains("http")));
const auto encoding{sourcemeta::jsonbinpack::load(schema)};

const auto document{
sourcemeta::jsontoolkit::from_file(options.at("").front())};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(document, encoding);
output_stream.flush();
const auto size{output_stream.tellp()};
output_stream.close();
std::cerr << "size: " << size << " bytes\n";
const std::filesystem::path document{options.at("").front()};
const auto original_size{std::filesystem::file_size(document)};
std::cerr << "original file size: " << original_size << " bytes\n";

if (document.extension() == ".jsonl") {
log_verbose(options) << "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(document).string()
<< "\n";

auto stream{sourcemeta::jsontoolkit::read_file(document)};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
std::size_t count{0};
for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) {
log_verbose(options) << "Encoding entry #" << count << "\n";
encoder.write(entry, encoding);
count += 1;
}

output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
} else {
const auto entry{
sourcemeta::jsontoolkit::from_file(options.at("").front())};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(entry, encoding);
output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
}

return EXIT_SUCCESS;
}
4 changes: 2 additions & 2 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,11 @@ Global Options:
Pre-process a JSON Schema into JSON BinPack's canonical form
for static analysis.
encode <document.json> <output.binpack>
encode <document.json|.jsonl> <output.binpack>
Encode a JSON document or JSONL dataset using JSON BinPack.
decode <output.binpack> <output.json>
decode <output.binpack> <output.json|.jsonl>
Decode a JSON document or JSONL dataset using JSON BinPack.
Expand Down
4 changes: 4 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,15 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema)

# Encode
add_jsonschema_test_unix(encode/pass_schema_less)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(encode/fail_no_document)
add_jsonschema_test_unix(encode/fail_no_output)

# Decode
add_jsonschema_test_unix(decode/pass_schema_less)
add_jsonschema_test_unix(decode/pass_schema_less_jsonl)
add_jsonschema_test_unix(decode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(decode/fail_no_document)
add_jsonschema_test_unix(decode/fail_no_output)

Expand Down
45 changes: 45 additions & 0 deletions test/decode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/sh

# Round-trip test: encode a five-entry JSONL dataset, decode it back into a
# .jsonl file, and check both the decoded entries and the decoder's output.
# The command under test is given as the first argument ("$1").

# Abort on the first failing command and on any use of an unset variable.
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode first so the decoder has a binary file to read; only the decode
# step's stdout and stderr are captured for comparison.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" > "$TMP/output.txt" 2>&1

# Print the decoded result so it shows up in the test log.
cat "$TMP/result.jsonl"

# Expected decoded dataset.
# NOTE(review): this is compared byte-for-byte with the decoder's prettified
# output, so any indentation inside the entries must match exactly - confirm.
cat << EOF > "$TMP/expected.jsonl"
{
"count": 1
}
{
"count": 2
}
{
"count": 3
}
{
"count": 4
}
{
"count": 5
}
EOF

# Without --verbose the decode command is expected to print nothing.
cat << EOF > "$TMP/expected-output.txt"
EOF

diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
51 changes: 51 additions & 0 deletions test/decode/pass_schema_less_jsonl_verbose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/sh

# Round-trip test with --verbose: encode a five-entry JSONL dataset, decode it
# back into a .jsonl file, and check both the decoded entries and the
# per-entry log lines printed by the decoder.
# The command under test is given as the first argument ("$1").

# Abort on the first failing command and on any use of an unset variable.
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line.
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode first so the decoder has a binary file to read; only the decode
# step's stdout and stderr are captured for comparison.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" --verbose > "$TMP/output.txt" 2>&1

# Print the decoded result so it shows up in the test log.
cat "$TMP/result.jsonl"

# Expected decoded dataset.
# NOTE(review): this is compared byte-for-byte with the decoder's prettified
# output, so any indentation inside the entries must match exactly - confirm.
cat << EOF > "$TMP/expected.jsonl"
{
"count": 1
}
{
"count": 2
}
{
"count": 3
}
{
"count": 4
}
{
"count": 5
}
EOF

# Expected verbose log. The heredoc delimiter is unquoted on purpose so that
# $(realpath "$TMP") is expanded to the resolved scratch directory path.
cat << EOF > "$TMP/expected-output.txt"
Interpreting input as JSONL: $(realpath "$TMP")/output.binpack
Decoding entry #0
Decoding entry #1
Decoding entry #2
Decoding entry #3
Decoding entry #4
EOF

diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
4 changes: 3 additions & 1 deletion test/encode/pass_schema_less.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ cat << 'EOF' > "$TMP/expected.txt"
EOF

cat << 'EOF' > "$TMP/expected-output.txt"
size: 11 bytes
original file size: 19 bytes
encoded file size: 11 bytes
compression ratio: 57%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
Expand Down
33 changes: 33 additions & 0 deletions test/encode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/sh

# Encode test: encode a five-entry JSONL dataset and check both the exact
# bytes written to the output file and the size report printed by the command.
# The command under test is given as the first argument ("$1").

# Abort on the first failing command and on any use of an unset variable.
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits.
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line (five 15-byte lines, 75 bytes).
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Capture stdout and stderr of the encode step, then hex-dump the binary so
# it can be compared as text.
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# Expected xxd dump of the 24 encoded bytes.
# NOTE(review): xxd column spacing is significant for the diff below - confirm
# it matches real xxd output.
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........%
00000010: 1300 052d 1300 0535 ...-...5
EOF

# Expected size report; 24 * 100 / 75 is 32 with integer division.
cat << 'EOF' > "$TMP/expected-output.txt"
original file size: 75 bytes
encoded file size: 24 bytes
compression ratio: 32%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
Loading

0 comments on commit dc8f992

Please sign in to comment.