Skip to content

Commit

Permalink
feat: support deduplication in flatten (#119)
Browse files Browse the repository at this point in the history
* feat: support deduplication in flatten

* fix root duplicates

* remove gnutar dep
  • Loading branch information
thesayyn authored Dec 7, 2024
1 parent 7fa53f5 commit dd83163
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 3 deletions.
1 change: 1 addition & 0 deletions distroless/private/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ load("@rules_java//java:defs.bzl", "java_binary")
# Make the helper scripts of this package addressable by label from other
# packages and rules. flatten.sh is the default value of the `_flatten_sh`
# attribute of the `flatten` rule (//distroless/private:flatten.sh).
exports_files([
"cacerts.sh",
"locale.sh",
"flatten.sh",
])

java_binary(
Expand Down
15 changes: 13 additions & 2 deletions distroless/private/flatten.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ def _flatten_impl(ctx):
output = ctx.actions.declare_file(ctx.attr.name + ext)

args = ctx.actions.args()
args.add(bsdtar.tarinfo.binary)
args.add(output if ctx.attr.deduplicate else "-")
args.add_all(tar_lib.DEFAULT_ARGS)
args.add("--create")
tar_lib.common.add_compression_args(ctx.attr.compress, args)
tar_lib.add_default_compression_args(ctx.attr.compress, args)
args.add("--file", output)
args.add("--file", "-" if ctx.attr.deduplicate else output)
args.add_all(ctx.files.tars, format_each = "@%s")

ctx.actions.run(
executable = bsdtar.tarinfo.binary,
executable = ctx.executable._flatten_sh,
inputs = ctx.files.tars,
outputs = [output],
tools = bsdtar.default.files,
Expand All @@ -39,10 +41,19 @@ flatten = rule(
allow_empty = False,
doc = "List of tars to flatten",
),
"deduplicate": attr.bool(doc = """\
EXPERIMENTAL: We may change or remove it without a notice.
Remove duplicate entries from the archives after flattening.
Deduplication is performed only for directories.
This requires `awk` to be available in the PATH.
""", default = False),
"compress": attr.string(
doc = "Compress the archive file with a supported algorithm.",
values = tar_lib.common.accepted_compression_types,
),
"_flatten_sh": attr.label(default = "//distroless/private:flatten.sh", executable = True, cfg = "exec", allow_single_file = True),
},
implementation = _flatten_impl,
toolchains = [tar_lib.TOOLCHAIN_TYPE],
Expand Down
52 changes: 52 additions & 0 deletions distroless/private/flatten.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Wrapper around bsdtar used by the `flatten` rule.
#
# Usage: flatten.sh <bsdtar> <output|-> [bsdtar arguments...]
#
#   <bsdtar>  Path of the bsdtar binary to run.
#   <output>  "-" means no deduplication: bsdtar is run with the remaining
#             arguments unchanged. Any other value enables deduplication:
#             bsdtar's stdout is redirected to this path and repeated
#             directory entries are dropped from the resulting archive.
#
# Archives to flatten are passed among the bsdtar arguments as `@<path>`.
set -o pipefail -o errexit

bsdtar="$1"
output="$2"
shift 2

# No deduplication, business as usual. Everything is quoted so that paths
# containing whitespace or glob characters reach bsdtar intact.
if [[ "$output" == "-" ]]; then
    exec "$bsdtar" "$@"
fi

# Deduplication requested, use the pipeline below to deduplicate.

listing=$(mktemp)
# Remove the temporary listing on every exit path, including an early abort
# triggered by errexit when one of the bsdtar invocations fails.
trap 'rm -f "$listing"' EXIT

# List the entries of every input archive, in argument order, one name per
# line, appending them all to a single listing file.
for arg in "$@"; do
    if [[ "$arg" == "@"* ]]; then
        "$bsdtar" -tf "${arg:1}" >> "$listing"
    fi
done

# There is not a lot happening here, but there is a lot of implicit knowledge.
#
# bsdtar is run with --confirmation, so it asks for a confirmation for every
# entry, in the same order as the listing created above.
# See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/write.c#L683
#
# For every prompt, and therefore for every entry, we have to write 31 bytes
# of data, each of them being either 'y' or 'n'. We are not a TTY, only
# pretending to be one, so we cannot interleave our writes with bsdtar's
# reads; instead each answer fills a whole 31 byte read buffer.
# See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L240
# See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L216
#
# The answers are supplied on file descriptor 2 (the `2<` redirection below).
# NOTE(review): since fd 2 is opened for reading here, bsdtar diagnostics are
# presumably not visible in this mode - confirm if error output is needed.
#
# The awk program walks the listing and emits one 31 byte answer per line:
# for directories (names ending in "/") only the first occurrence is kept,
# while every copy of a file is preserved.
#
# Details that matter for keeping answers and prompts in sync:
#   * The whole line ($0) is the key, not the first whitespace-separated
#     field, so names containing spaces are counted and classified correctly.
#   * Every line produces an answer; `bsdtar -tf` output has no comment
#     lines, so nothing is skipped (a name starting with "#" is a real entry).
#   * ORS is emptied once so the answers form one unbroken byte stream.
"$bsdtar" --confirmation "$@" > "$output" 2< <(awk '
BEGIN { ORS = "" }
{
    seen[$0]++
    answer = "n"
    if (seen[$0] == 1 || $0 !~ "/$") {
        answer = "y"
    }
    for (i = 0; i < 31; i++) print answer
    fflush()
}' "$listing")
3 changes: 2 additions & 1 deletion docs/rules.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 43 additions & 0 deletions examples/flatten/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,46 @@ assert_tar_listing(
./root time=0.0 mode=700 gid=0 uid=0 type=dir
""",
)

# Flatten with deduplication
#
# Three xz-compressed archives are built from the same `dir` tree and then
# flattened into one archive with `deduplicate = True`.

# Built from the non-recursive pattern `dir/*`.
tar(
name = "source1",
srcs = glob(["dir/*"]),
compress = "xz",
)

# Built from the recursive pattern `dir/**/*`.
tar(
name = "source2",
srcs = glob(["dir/**/*"]),
compress = "xz",
)

# Same inputs as source2, so flattening both yields repeated entries.
tar(
name = "source3",
srcs = glob(["dir/**/*"]),
compress = "xz",
)

# Note the order: source2 is listed before source1.
flatten(
name = "flatten_dedup",
deduplicate = True,
tars = [
":source2",
":source1",
":source3",
],
)

# The expected listing contains every path exactly once.
assert_tar_listing(
name = "test_flatten_dedup",
actual = "flatten_dedup",
expected = """\
#mtree
./examples time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir/changelog time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0
./examples/flatten/dir/sub time=1672560000.0 mode=755 gid=0 uid=0 type=dir
./examples/flatten/dir/sub/content.txt time=1672560000.0 mode=755 gid=0 uid=0 type=file size=0
""",
)

0 comments on commit dd83163

Please sign in to comment.