Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add copy_symlinks keyword to Tar.tree_hash #167

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ ArgTools = "1.1"
julia = "1.3"

[extras]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SimpleBufferStream = "777ac1f9-54b0-4bf8-805c-2214025038e7"
Tar_jll = "9b64493d-8859-5bf3-93d7-7c32dd38186f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Random", "SimpleBufferStream", "Tar_jll", "Test"]
test = ["Random", "SimpleBufferStream", "Tar_jll", "Test", "CodecZlib", "Downloads"]
12 changes: 10 additions & 2 deletions src/Tar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ end
tarball :: Union{AbstractString, AbstractCmd, IO}
algorithm :: AbstractString
skip_empty :: Bool
copy_symlinks :: Bool

Compute a tree hash value for the file tree that the tarball contains. By
default, this uses git's tree hashing algorithm with the SHA1 secure hash
Expand Down Expand Up @@ -389,21 +390,26 @@ hash, the hash value that you get will match the hash value computed by
are hashing trees that may contain empty directories (i.e. do not come from a
git repo), however, it is recommended that you hash them using a tool (such as
this one) that does not ignore empty directories.

If `copy_symlinks` is true, symlinks in the tarball are followed and each
symlink is hashed as if it were a copy of its target. This is useful for
predicting the tree hash of the directory that `Tar.extract` produces when
called with `copy_symlinks = true`.
"""
function tree_hash(
predicate::Function,
tarball::ArgRead;
algorithm::AbstractString = "git-sha1",
skip_empty::Bool = false,
copy_symlinks::Bool = false,
)
check_tree_hash_tarball(tarball)
if algorithm == "git-sha1"
return arg_read(tarball) do tar
git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty)
git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty, copy_symlinks)
end
elseif algorithm == "git-sha256"
return arg_read(tarball) do tar
git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty)
git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty, copy_symlinks)
end
else
error("invalid tree hashing algorithm: $algorithm")
Expand All @@ -414,12 +420,14 @@ function tree_hash(
tarball::ArgRead;
algorithm::AbstractString = "git-sha1",
skip_empty::Bool = false,
copy_symlinks::Bool = false,
)
tree_hash(
true_predicate,
tarball,
algorithm = algorithm,
skip_empty = skip_empty,
copy_symlinks = copy_symlinks
)
end

Expand Down
67 changes: 62 additions & 5 deletions src/extract.jl
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,13 @@ function git_tree_hash(
predicate::Function,
tar::IO,
::Type{HashType},
skip_empty::Bool;
skip_empty::Bool,
copy_symlinks::Bool = false;
buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
) where HashType <: SHA.SHA_CTX
# build tree with leaves for files and symlinks
tree = Dict{String,Any}()
read_tarball(predicate, tar; buf=buf) do hdr, parts
paths = read_tarball(predicate, tar; buf=buf) do hdr, parts
isempty(parts) && return
name = pop!(parts)
node = tree
Expand All @@ -229,9 +230,14 @@ function git_tree_hash(
end
return
elseif hdr.type == :symlink
mode = "120000"
hash = git_object_hash("blob", HashType) do io
write(io, hdr.link)
if copy_symlinks
mode = "120000"
hash = hdr.link
else
mode = "120000"
hash = git_object_hash("blob", HashType) do io
write(io, hdr.link)
end
end
elseif hdr.type == :hardlink
mode = iszero(hdr.mode & 0o100) ? "100644" : "100755"
Expand All @@ -249,6 +255,57 @@ function git_tree_hash(
node[name] = (mode, hash)
end

if copy_symlinks
# resolve the internal targets of symlinks
#
# First pass: for every entry of `paths` whose value is a `String`, ask
# `link_target` for the target and store the answer back under the same key.
# `something(target, :symlink)` stores `:symlink` when `link_target` returned
# `nothing`.
# NOTE(review): presumably a `String` value in `paths` is the raw link text of
# a symlink and `nothing` means the target is not inside the tarball — confirm
# against `read_tarball` and `link_target`.
for (path, what) in paths
what isa String || continue
target = link_target(paths, path, what)
paths[path] = something(target, :symlink)
end

# Second pass: every entry that is still a string is replaced by the result of
# `follow_symlink_chain`, called with `[path]` as its first argument.
# NOTE(review): presumably this resolves links that point at other links down
# to a final target — confirm against `follow_symlink_chain`.
# NOTE(review): this pass tests `AbstractString` while the first pass tests
# `String`; confirm whether the difference is intentional.
for (path, what) in paths
what isa AbstractString || continue
paths[path] = follow_symlink_chain([path], what, paths)
end

# use paths to index into the tree
# Look up the node stored at `path` inside the nested dictionary `tree`.
#
# `path` is split into its components with `splitpath`, and the components are
# used one after another as keys, descending one level per component. The node
# reached after the last component is returned. A missing key raises `KeyError`.
function get_tree_index(tree::Dict, path::AbstractString)
    return foldl(getindex, splitpath(path); init=tree)
end
# Store `value` at `path` inside the nested dictionary `tree`.
#
# All components of `path` except the last are used as keys to descend to the
# parent node, which must already exist (a missing key raises `KeyError`). The
# last component is then assigned `value`, replacing any existing entry.
# Returns `value`.
function set_tree_index!(tree::Dict, value, path::AbstractString)
    components = splitpath(path)
    # `splitpath` always yields at least one component, so `pop!` is safe
    leaf = pop!(components)
    parent = foldl(getindex, components; init=tree)
    parent[leaf] = value
    return value
end
# Remove the entry at `path` from the nested dictionary `tree`.
#
# All components of `path` except the last are used as keys to descend to the
# parent node (a missing key raises `KeyError`); the last component is then
# deleted from that parent. Deleting a key that is absent is a no-op.
# Returns the parent node, i.e. the result of `delete!`.
function prune_tree_index!(tree::Dict, path::AbstractString)
    components = splitpath(path)
    # `splitpath` always yields at least one component, so `pop!` is safe
    leaf = pop!(components)
    parent = foldl(getindex, components; init=tree)
    return delete!(parent, leaf)
end

# copy hashes
#
# Apply the resolved `paths` entries to `tree`:
#   * a string value is used as a path into `tree`; the node found there is
#     stored again under `path`, replacing whatever `path` held before;
#   * the marker `:symlink` causes the entry at `path` to be deleted.
# Entries with any other value are left untouched.
# NOTE(review): the node returned by `get_tree_index` is stored as-is, not
# copied, so both paths will refer to the same object — confirm this sharing is
# acceptable for the later pruning/hashing steps.
for (path, what) in paths
if what isa AbstractString
what_hash = get_tree_index(tree, what)
set_tree_index!(tree, what_hash, path)
elseif what == :symlink
# external symlink
prune_tree_index!(tree, path)
end
end
end

# prune directories that don't contain any files
if skip_empty
prune_empty!(node::Tuple) = true
Expand Down
16 changes: 16 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,26 @@ end
@testset "Tar.tree_hash" begin
arg_readers(tarball) do tar
@arg_test tar @test Tar.tree_hash(tar) == hash
@arg_test tar @test Tar.tree_hash(tar; copy_symlinks=true) != hash
@arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar)
@arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar; copy_symlinks=true)
@arg_test tar @test empty_tree_sha1 ==
Tar.tree_hash(hdr->false, tar, algorithm="git-sha1")
@arg_test tar @test empty_tree_sha1 ==
Tar.tree_hash(hdr->false, tar, algorithm="git-sha1", copy_symlinks=true)
@arg_test tar @test empty_tree_sha256 ==
Tar.tree_hash(hdr->false, tar, algorithm="git-sha256")
@arg_test tar @test empty_tree_sha256 ==
Tar.tree_hash(hdr->false, tar, algorithm="git-sha256", copy_symlinks=true)
end
NON_STDLIB_TESTS && begin
iso_codes_tarball = Downloads.download("https://github.com/JuliaBinaryWrappers/iso_codes_jll.jl/releases/download/iso_codes-v4.11.0+0/iso_codes.v4.11.0.any.tar.gz")
Copy link
Member

@nhz2 nhz2 Jan 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test should use a generated test tar file, like in the "copy symlinks" testset below. The tests will be more reliable if they don't require internet access.

open(GzipDecompressorStream, iso_codes_tarball) do io
@test Tar.tree_hash(io) == "71f68a3d55d73f2e15a3969c241fae2349b1feb5"
end
open(GzipDecompressorStream, iso_codes_tarball) do io
@test Tar.tree_hash(io; copy_symlinks=true) == "409d6ac4c02dae43ff4fe576b5c5820d0386fb3f"
end
end
end
@testset "Tar.list & check properties" begin
Expand Down Expand Up @@ -484,6 +499,7 @@ end
@test read(path, String) == data
end
dir = Tar.extract(tarball, copy_symlinks=true)
@test tree_hash(dir) == Tar.tree_hash(tarball; copy_symlinks=true)
test_file("file", data₁)
test_file("link-file", data₁)
test_none("link-file-slash")
Expand Down
2 changes: 2 additions & 0 deletions test/setup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ const NON_STDLIB_TESTS = Main == @__MODULE__

if NON_STDLIB_TESTS
using SimpleBufferStream
using CodecZlib
using Downloads

using Tar_jll
if isdefined(Tar_jll, :tar)
Expand Down
Loading