Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support encoding and decoding JSONL datasets #176

Merged
merged 1 commit into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions docs/decode.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Decode
======

```sh
jsonschema decode <output.binpack> <output.json>
jsonschema decode <output.binpack> <output.json|.jsonl>
```

This command decodes a JSON document using [JSON
Expand All @@ -28,8 +28,14 @@ Decoding this file using JSON BinPack will result in the following document:
}
```

### Decode a binary file
### Decode a binary file into a JSON document

```sh
jsonschema decode path/to/output.binpack path/to/my/output.json
```

### Decode a binary file into a JSONL dataset

```sh
jsonschema decode path/to/output.binpack path/to/my/dataset.jsonl
```
8 changes: 7 additions & 1 deletion docs/encode.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Encode
======

```sh
jsonschema encode <document.json> <output.binpack>
jsonschema encode <document.json|.jsonl> <output.binpack>
```

This command encodes a JSON document using [JSON
Expand Down Expand Up @@ -33,3 +33,9 @@ $ xxd output.binpack
```sh
jsonschema encode path/to/my/document.json path/to/output.binpack
```

### Encode a JSONL dataset

```sh
jsonschema encode path/to/my/dataset.jsonl path/to/output.binpack
```
53 changes: 45 additions & 8 deletions src/command_decode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@
#include "command.h"
#include "utils.h"

// Tells whether an open input file stream still has bytes left between its
// current read position and the end of the file. The read position is put
// back where it was before returning. A stream that is not open, or that is
// not in a good state after probing, is reported as having no data.
static auto has_data(std::ifstream &stream) -> bool {
  if (stream.is_open()) {
    // Remember where we are, jump to the end to learn the total length,
    // then return to the remembered position
    const std::streampos here = stream.tellg();
    stream.seekg(0, std::ios::end);
    const std::streampos finish = stream.tellg();
    stream.seekg(here);

    const bool bytes_remaining = here < finish;
    return bytes_remaining && stream.good();
  }

  return false;
}

auto sourcemeta::jsonschema::cli::decode(
const std::span<const std::string> &arguments) -> int {
const auto options{parse_options(arguments, {})};
Expand All @@ -36,20 +49,44 @@ auto sourcemeta::jsonschema::cli::decode(

std::ifstream input_stream{std::filesystem::canonical(options.at("").front()),
std::ios::binary};
input_stream.exceptions(std::ifstream::failbit | std::ifstream::badbit);
assert(!input_stream.fail());
assert(input_stream.is_open());
sourcemeta::jsonbinpack::Decoder decoder{input_stream};
const auto document{decoder.read(encoding)};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
const std::filesystem::path output{options.at("").at(1)};
std::ofstream output_stream(std::filesystem::weakly_canonical(output),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsontoolkit::prettify(
document, output_stream, sourcemeta::jsontoolkit::schema_format_compare);
sourcemeta::jsonbinpack::Decoder decoder{input_stream};

if (output.extension() == ".jsonl") {
log_verbose(options)
<< "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(options.at("").front()).string()
<< "\n";

std::size_t count{0};
while (has_data(input_stream)) {
log_verbose(options) << "Decoding entry #" << count << "\n";
const auto document{decoder.read(encoding)};
if (count > 0) {
output_stream << "\n";
}

sourcemeta::jsontoolkit::prettify(
document, output_stream,
sourcemeta::jsontoolkit::schema_format_compare);
count += 1;
}
} else {
const auto document{decoder.read(encoding)};
sourcemeta::jsontoolkit::prettify(
document, output_stream,
sourcemeta::jsontoolkit::schema_format_compare);
}

output_stream << "\n";
output_stream.flush();
output_stream.close();

return EXIT_SUCCESS;
}
61 changes: 48 additions & 13 deletions src/command_encode.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <sourcemeta/jsonbinpack/compiler.h>
#include <sourcemeta/jsonbinpack/runtime.h>
#include <sourcemeta/jsontoolkit/json.h>
#include <sourcemeta/jsontoolkit/jsonl.h>
#include <sourcemeta/jsontoolkit/jsonschema.h>

#include <cstdlib> // EXIT_SUCCESS
Expand Down Expand Up @@ -33,18 +34,52 @@ auto sourcemeta::jsonschema::cli::encode(
resolver(options, options.contains("h") || options.contains("http")));
const auto encoding{sourcemeta::jsonbinpack::load(schema)};

const auto document{
sourcemeta::jsontoolkit::from_file(options.at("").front())};

std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(document, encoding);
output_stream.flush();
const auto size{output_stream.tellp()};
output_stream.close();
std::cerr << "size: " << size << " bytes\n";
const std::filesystem::path document{options.at("").front()};
const auto original_size{std::filesystem::file_size(document)};
std::cerr << "original file size: " << original_size << " bytes\n";

if (document.extension() == ".jsonl") {
log_verbose(options) << "Interpreting input as JSONL: "
<< std::filesystem::weakly_canonical(document).string()
<< "\n";

auto stream{sourcemeta::jsontoolkit::read_file(document)};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
std::size_t count{0};
for (const auto &entry : sourcemeta::jsontoolkit::JSONL{stream}) {
log_verbose(options) << "Encoding entry #" << count << "\n";
encoder.write(entry, encoding);
count += 1;
}

output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
} else {
const auto entry{
sourcemeta::jsontoolkit::from_file(options.at("").front())};
std::ofstream output_stream(
std::filesystem::weakly_canonical(options.at("").at(1)),
std::ios::binary);
output_stream.exceptions(std::ios_base::badbit);
sourcemeta::jsonbinpack::Encoder encoder{output_stream};
encoder.write(entry, encoding);
output_stream.flush();
const auto total_size{output_stream.tellp()};
output_stream.close();
std::cerr << "encoded file size: " << total_size << " bytes\n";
std::cerr << "compression ratio: "
<< (static_cast<std::uint64_t>(total_size) * 100 / original_size)
<< "%\n";
}

return EXIT_SUCCESS;
}
4 changes: 2 additions & 2 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,11 @@ Global Options:
Pre-process a JSON Schema into JSON BinPack's canonical form
for static analysis.

encode <document.json> <output.binpack>
encode <document.json|.jsonl> <output.binpack>

Encode a JSON document or JSONL dataset using JSON BinPack.

decode <output.binpack> <output.json>
decode <output.binpack> <output.json|.jsonl>

Decode a JSON document or JSONL dataset using JSON BinPack.

Expand Down
4 changes: 4 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,15 @@ add_jsonschema_test_unix(canonicalize/fail_unknown_metaschema)

# Encode
add_jsonschema_test_unix(encode/pass_schema_less)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl)
add_jsonschema_test_unix(encode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(encode/fail_no_document)
add_jsonschema_test_unix(encode/fail_no_output)

# Decode
add_jsonschema_test_unix(decode/pass_schema_less)
add_jsonschema_test_unix(decode/pass_schema_less_jsonl)
add_jsonschema_test_unix(decode/pass_schema_less_jsonl_verbose)
add_jsonschema_test_unix(decode/fail_no_document)
add_jsonschema_test_unix(decode/fail_no_output)

Expand Down
45 changes: 45 additions & 0 deletions test/decode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/sh

# Round-trip test: encode a five-entry JSONL dataset with the command given
# as "$1", decode the result back into a .jsonl file without --verbose, and
# check both the decoded file and the decode command's captured output.

# Abort on the first failing command and on any unset variable
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode, then decode. Only the decode step has stdout and stderr captured
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" > "$TMP/output.txt" 2>&1

# Print the decoded dataset so it shows up in the test log
cat "$TMP/result.jsonl"

# Decoded file the test expects
cat << EOF > "$TMP/expected.jsonl"
{
"count": 1
}
{
"count": 2
}
{
"count": 3
}
{
"count": 4
}
{
"count": 5
}
EOF

# The decode command is expected to print nothing
cat << EOF > "$TMP/expected-output.txt"
EOF

# diff exits non-zero on any mismatch, which fails the test under errexit
diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
51 changes: 51 additions & 0 deletions test/decode/pass_schema_less_jsonl_verbose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/sh

# Round-trip test: encode a five-entry JSONL dataset with the command given
# as "$1", decode the result back into a .jsonl file with --verbose, and
# check both the decoded file and the verbose log lines that decode prints.

# Abort on the first failing command and on any unset variable
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Encode, then decode verbosely. Only the decode step has stdout and stderr
# captured
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack"
"$1" decode "$TMP/output.binpack" "$TMP/result.jsonl" --verbose > "$TMP/output.txt" 2>&1

# Print the decoded dataset so it shows up in the test log
cat "$TMP/result.jsonl"

# Decoded file the test expects
cat << EOF > "$TMP/expected.jsonl"
{
"count": 1
}
{
"count": 2
}
{
"count": 3
}
{
"count": 4
}
{
"count": 5
}
EOF

# Expected verbose output. The heredoc delimiter is left unquoted on purpose
# so that $(realpath "$TMP") is expanded to the resolved scratch directory
cat << EOF > "$TMP/expected-output.txt"
Interpreting input as JSONL: $(realpath "$TMP")/output.binpack
Decoding entry #0
Decoding entry #1
Decoding entry #2
Decoding entry #3
Decoding entry #4
EOF

# diff exits non-zero on any mismatch, which fails the test under errexit
diff "$TMP/expected.jsonl" "$TMP/result.jsonl"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
4 changes: 3 additions & 1 deletion test/encode/pass_schema_less.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ cat << 'EOF' > "$TMP/expected.txt"
EOF

cat << 'EOF' > "$TMP/expected-output.txt"
size: 11 bytes
original file size: 19 bytes
encoded file size: 11 bytes
compression ratio: 57%
EOF

diff "$TMP/expected.txt" "$TMP/output.hex"
Expand Down
33 changes: 33 additions & 0 deletions test/encode/pass_schema_less_jsonl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/sh

# Encode test: encode a five-entry JSONL dataset with the command given as
# "$1", then check the hex dump of the produced binary and the size report
# that the encode command prints.

# Abort on the first failing command and on any unset variable
set -o errexit
set -o nounset

# Work inside a scratch directory that is removed when the script exits
TMP="$(mktemp -d)"
clean() { rm -rf "$TMP"; }
trap clean EXIT

# Input dataset: one JSON object per line
cat << 'EOF' > "$TMP/document.jsonl"
{ "count": 1 }
{ "count": 2 }
{ "count": 3 }
{ "count": 4 }
{ "count": 5 }
EOF

# Capture stdout and stderr of the encode step, then hex-dump the binary so
# it can be compared as text
"$1" encode "$TMP/document.jsonl" "$TMP/output.binpack" > "$TMP/output.txt" 2>&1
xxd "$TMP/output.binpack" > "$TMP/output.hex"

# Expected xxd dump of the encoded dataset
cat << 'EOF' > "$TMP/expected.txt"
00000000: 1306 636f 756e 7415 1300 091d 1300 0525 ..count........%
00000010: 1300 052d 1300 0535 ...-...5
EOF

# Expected size report printed by the encode command
cat << 'EOF' > "$TMP/expected-output.txt"
original file size: 75 bytes
encoded file size: 24 bytes
compression ratio: 32%
EOF

# diff exits non-zero on any mismatch, which fails the test under errexit
diff "$TMP/expected.txt" "$TMP/output.hex"
diff "$TMP/output.txt" "$TMP/expected-output.txt"
Loading