diff --git a/distroless/private/flatten.bzl b/distroless/private/flatten.bzl index 80096b5..fdc3ebc 100644 --- a/distroless/private/flatten.bzl +++ b/distroless/private/flatten.bzl @@ -42,17 +42,12 @@ flatten = rule( doc = "List of tars to flatten", ), "deduplicate": attr.bool(doc = """\ -EXPERIMENTAL: Remove duplicate entries from the archives after flattening. - -This requires `awk`, `sort` and `tar` to be available in the PATH. +EXPERIMENTAL: We may change or remove it without a notice. -To support macOS, presence of `gtar` is checked, and `tar` if it does not exist, -and ensured if supports the `--delete` mode. +Remove duplicate entries from the archives after flattening. +Deduplication is performed only for directories. -On macOS: `brew install gnu-tar` can be run to install gnutar. -See: https://formulae.brew.sh/formula/gnu-tar - -NOTE: You may also need to run `sudo ln -s /opt/homebrew/bin/gtar /usr/local/bin/gtar` to make it available to Bazel. +This requires `awk` to be available in the PATH. """, default = False), "compress": attr.string( doc = "Compress the archive file with a supported algorithm.", diff --git a/distroless/private/flatten.sh b/distroless/private/flatten.sh index 71f0fa6..49ba33c 100755 --- a/distroless/private/flatten.sh +++ b/distroless/private/flatten.sh @@ -5,46 +5,46 @@ bsdtar="$1"; output="$2"; shift 2; -function run_gtar() { - local TAR= - if [[ "$(command -v gtar)" ]]; then - TAR="gtar"; - elif [[ "$(command -v tar)" ]]; then - TAR="tar"; - else - echo "Neither 'tar' nor 'gtar' command is available."; - exit 1; - fi - "$TAR" "$@"; -} - - # Deduplication requested, use this complex pipeline to deduplicate. if [[ "$output" != "-" ]]; then mtree=$(mktemp) - duplicates=$(mktemp) + # List files in all archives and append to single column mtree. 
for arg in "$@"; do if [[ "$arg" == "@"* ]]; then - "$bsdtar" -cf - --format=mtree --options "mtree:!all,mtree:type" "$arg" >> "$mtree" + "$bsdtar" -tf "${arg:1}" >> "$mtree" fi done - awk '{ + + # There is not a lot happening here, but there is still a lot of implicit knowledge. + # + # When we run bsdtar, we ask it to prompt for every entry, in the same order as the mtree listing we created above. + # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/write.c#L683 + # + # For every prompt, and therefore every entry, we have to write 31 bytes of data, one of which has to be either 'Y' or 'N'. + # The reason is that, since we are not a TTY but are pretending to be one, we can't interleave write calls, + # so we have to interleave it by filling up the buffer with 31 bytes of 'Y' or 'N'. + # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L240 + # See: https://github.com/libarchive/libarchive/blob/f745a848d7a81758cd9fcd49d7fd45caeebe1c3d/tar/util.c#L216 + # + # And finally we iterate over all the entries, generating 31 bytes of interleaved 'Y' or 'N' data based on whether + # we have come across the entry before: for directories only the first occurrence is kept, and for files all copies are + # preserved. + $bsdtar --confirmation "$@" > $output 2< <(awk '{ if (substr($0,0,1) == "#") { next; } - line_count[$1]++; - if (line_count[$1] > 1) { - if ($1 == "/.") { - next - } - print $1 + count[$1]++; + ORS="" + keep="n" + if (count[$1] == 1 || $1 !~ "/$") { + keep="y" } - }' "$mtree" | sort | uniq | sort -r > "$duplicates" - - $bsdtar --exclude "^./$" $@ | run_gtar --delete --file - --occurrence=1 --files-from="$duplicates" > "$output" + for (i=0;i<31;i++) print keep + fflush() + }' "$mtree") rm "$mtree" else # No deduplication, business as usual