From dd831634a45fd71f98f094a00279aa42b9122f74 Mon Sep 17 00:00:00 2001 From: Sahin Yort Date: Fri, 6 Dec 2024 22:36:55 -0800 Subject: [PATCH] feat: support deduplication in flatten (#119) * feat: support deduplication in flatten * fix root duplicates * remove gnutar dep --- distroless/private/BUILD.bazel | 1 + distroless/private/flatten.bzl | 15 ++++++++-- distroless/private/flatten.sh | 52 ++++++++++++++++++++++++++++++++++ docs/rules.md | 3 +- examples/flatten/BUILD.bazel | 43 ++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 3 deletions(-) create mode 100755 distroless/private/flatten.sh diff --git a/distroless/private/BUILD.bazel b/distroless/private/BUILD.bazel index 7838bba..6f6eb4b 100644 --- a/distroless/private/BUILD.bazel +++ b/distroless/private/BUILD.bazel @@ -4,6 +4,7 @@ load("@rules_java//java:defs.bzl", "java_binary") exports_files([ "cacerts.sh", "locale.sh", + "flatten.sh", ]) java_binary( diff --git a/distroless/private/flatten.bzl b/distroless/private/flatten.bzl index 7d3afca..fdc3ebc 100644 --- a/distroless/private/flatten.bzl +++ b/distroless/private/flatten.bzl @@ -11,15 +11,17 @@ def _flatten_impl(ctx): output = ctx.actions.declare_file(ctx.attr.name + ext) args = ctx.actions.args() + args.add(bsdtar.tarinfo.binary) + args.add(output if ctx.attr.deduplicate else "-") args.add_all(tar_lib.DEFAULT_ARGS) args.add("--create") tar_lib.common.add_compression_args(ctx.attr.compress, args) tar_lib.add_default_compression_args(ctx.attr.compress, args) - args.add("--file", output) + args.add("--file", "-" if ctx.attr.deduplicate else output) args.add_all(ctx.files.tars, format_each = "@%s") ctx.actions.run( - executable = bsdtar.tarinfo.binary, + executable = ctx.executable._flatten_sh, inputs = ctx.files.tars, outputs = [output], tools = bsdtar.default.files, @@ -39,10 +41,19 @@ flatten = rule( allow_empty = False, doc = "List of tars to flatten", ), + "deduplicate": attr.bool(doc = """\ +EXPERIMENTAL: We may change or remove it 
without a notice. + +Remove duplicate entries from the archives after flattening. +Deduplication is performed only for directories. + +This requires `awk` to be available in the PATH. + """, default = False), "compress": attr.string( doc = "Compress the archive file with a supported algorithm.", values = tar_lib.common.accepted_compression_types, ), + "_flatten_sh": attr.label(default = "//distroless/private:flatten.sh", executable = True, cfg = "exec", allow_single_file = True), }, implementation = _flatten_impl, toolchains = [tar_lib.TOOLCHAIN_TYPE], diff --git a/distroless/private/flatten.sh b/distroless/private/flatten.sh new file mode 100755 index 0000000..49ba33c --- /dev/null +++ b/distroless/private/flatten.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -o pipefail -o errexit + +bsdtar="$1"; +output="$2"; +shift 2; + +# Deduplication requested, use this complex pipeline to deduplicate. +if [[ "$output" != "-" ]]; then + + mtree=$(mktemp) + + # List the files in all archives and append them to a single-column mtree. + for arg in "$@"; do + if [[ "$arg" == "@"* ]]; then + "$bsdtar" -tf "${arg:1}" >> "$mtree" + fi + done + + + # There is not a lot happening here, but there is still a lot of implicit knowledge. + # + # When we run bsdtar, we ask it to prompt for every entry, in the same order as the mtree we created above. + # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/write.c#L683 + # + # For every prompt, and therefore every entry, we have to write 31 bytes of data, one of which has to be either 'Y' or 'N'. + # The reason is that since we are not a TTY but are pretending to be one, we can't interleave write calls, + # so we have to interleave it by filling up the buffer with 31 bytes of 'Y' or 'N'. 
+ # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L240 + # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L216 + # + # Finally, we iterate over all the entries, generating 31 bytes of interleaved 'Y' or 'N' data depending on whether + # we have come across the entry before: for directories only the first occurrence is kept, and for files all copies are + # preserved. + $bsdtar --confirmation "$@" > $output 2< <(awk '{ + if (substr($0,0,1) == "#") { + next; + } + count[$1]++; + ORS="" + keep="n" + if (count[$1] == 1 || $1 !~ "/$") { + keep="y" + } + for (i=0;i<31;i++) print keep + fflush() + }' "$mtree") + rm "$mtree" +else + # No deduplication, business as usual + $bsdtar $@ +fi \ No newline at end of file diff --git a/docs/rules.md b/docs/rules.md index 745df6f..4aedfbe 100644 --- a/docs/rules.md +++ b/docs/rules.md @@ -70,7 +70,7 @@ oci_image(
 load("@rules_distroless//distroless:defs.bzl", "flatten")
 
-flatten(name, compress, tars)
+flatten(name, compress, deduplicate, tars)
 
Flatten multiple archives into single archive. @@ -82,6 +82,7 @@ Flatten multiple archives into single archive. | :------------- | :------------- | :------------- | :------------- | :------------- | | name | A unique name for this target. | Name | required | | | compress | Compress the archive file with a supported algorithm. | String | optional | `""` | +| deduplicate | EXPERIMENTAL: We may change or remove it without a notice.

Remove duplicate entries from the archives after flattening. Deduplication is performed only for directories.

This requires `awk` to be available in the PATH. | Boolean | optional | `False` | | tars | List of tars to flatten | List of labels | required | | diff --git a/examples/flatten/BUILD.bazel b/examples/flatten/BUILD.bazel index 8cce007..476f992 100644 --- a/examples/flatten/BUILD.bazel +++ b/examples/flatten/BUILD.bazel @@ -64,3 +64,46 @@ assert_tar_listing( ./root time=0.0 mode=700 gid=0 uid=0 type=dir """, ) + +# Flatten with deduplication +tar( + name = "source1", + srcs = glob(["dir/*"]), + compress = "xz", +) + +tar( + name = "source2", + srcs = glob(["dir/**/*"]), + compress = "xz", +) + +tar( + name = "source3", + srcs = glob(["dir/**/*"]), + compress = "xz", +) + +flatten( + name = "flatten_dedup", + deduplicate = True, + tars = [ + ":source2", + ":source1", + ":source3", + ], +) + +assert_tar_listing( + name = "test_flatten_dedup", + actual = "flatten_dedup", + expected = """\ +#mtree +./examples time=1672560000.0 mode=755 gid=0 uid=0 type=dir +./examples/flatten time=1672560000.0 mode=755 gid=0 uid=0 type=dir +./examples/flatten/dir time=1672560000.0 mode=755 gid=0 uid=0 type=dir +./examples/flatten/dir/changelog time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0 +./examples/flatten/dir/sub time=1672560000.0 mode=755 gid=0 uid=0 type=dir +./examples/flatten/dir/sub/content.txt time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0 +""", +)