diff --git a/Project.toml b/Project.toml index b5b6e40..09d56c5 100644 --- a/Project.toml +++ b/Project.toml @@ -12,10 +12,12 @@ ArgTools = "1.1" julia = "1.3" [extras] +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleBufferStream = "777ac1f9-54b0-4bf8-805c-2214025038e7" Tar_jll = "9b64493d-8859-5bf3-93d7-7c32dd38186f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Random", "SimpleBufferStream", "Tar_jll", "Test"] +test = ["Random", "SimpleBufferStream", "Tar_jll", "Test", "CodecZlib", "Downloads"] diff --git a/src/Tar.jl b/src/Tar.jl index f0d821f..be205a7 100644 --- a/src/Tar.jl +++ b/src/Tar.jl @@ -342,6 +342,7 @@ end tarball :: Union{AbstractString, AbstractCmd, IO} algorithm :: AbstractString skip_empty :: Bool + copy_symlinks :: Bool Compute a tree hash value for the file tree that the tarball contains. By default, this uses git's tree hashing algorithm with the SHA1 secure hash @@ -389,21 +390,26 @@ hash, the hash value that you get will match the hash value computed by are hashing trees that may contain empty directories (i.e. do not come from a git repo), however, it is recommended that you hash them using a tool (such as this one) that does not ignore empty directories. + +If `copy_symlinks` is true, symlinks in the tarball are followed and replaced by +the hashes of their targets. This is useful for checking what the hash would +be when using `Tar.extract` with `copy_symlinks = true`. 
""" function tree_hash( predicate::Function, tarball::ArgRead; algorithm::AbstractString = "git-sha1", skip_empty::Bool = false, + copy_symlinks::Bool = false, ) check_tree_hash_tarball(tarball) if algorithm == "git-sha1" return arg_read(tarball) do tar - git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty) + git_tree_hash(predicate, tar, SHA.SHA1_CTX, skip_empty, copy_symlinks) end elseif algorithm == "git-sha256" return arg_read(tarball) do tar - git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty) + git_tree_hash(predicate, tar, SHA.SHA256_CTX, skip_empty, copy_symlinks) end else error("invalid tree hashing algorithm: $algorithm") @@ -414,12 +420,14 @@ function tree_hash( tarball::ArgRead; algorithm::AbstractString = "git-sha1", skip_empty::Bool = false, + copy_symlinks::Bool = false, ) tree_hash( true_predicate, tarball, algorithm = algorithm, skip_empty = skip_empty, + copy_symlinks = copy_symlinks ) end diff --git a/src/extract.jl b/src/extract.jl index b771315..6143b1d 100644 --- a/src/extract.jl +++ b/src/extract.jl @@ -207,12 +207,13 @@ function git_tree_hash( predicate::Function, tar::IO, ::Type{HashType}, - skip_empty::Bool; + skip_empty::Bool, + copy_symlinks::Bool = false; buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE), ) where HashType <: SHA.SHA_CTX # build tree with leaves for files and symlinks tree = Dict{String,Any}() - read_tarball(predicate, tar; buf=buf) do hdr, parts + paths = read_tarball(predicate, tar; buf=buf) do hdr, parts isempty(parts) && return name = pop!(parts) node = tree @@ -229,9 +230,14 @@ function git_tree_hash( end return elseif hdr.type == :symlink - mode = "120000" - hash = git_object_hash("blob", HashType) do io - write(io, hdr.link) + if copy_symlinks + mode = "120000" + hash = hdr.link + else + mode = "120000" + hash = git_object_hash("blob", HashType) do io + write(io, hdr.link) + end end elseif hdr.type == :hardlink mode = iszero(hdr.mode & 0o100) ? 
"100644" : "100755" @@ -249,6 +255,57 @@ function git_tree_hash( node[name] = (mode, hash) end + if copy_symlinks + # resolve the internal targets of symlinks + for (path, what) in paths + what isa String || continue + target = link_target(paths, path, what) + paths[path] = something(target, :symlink) + end + + for (path, what) in paths + what isa AbstractString || continue + paths[path] = follow_symlink_chain([path], what, paths) + end + + # use paths to index into the tree + function get_tree_index(tree::Dict, path::AbstractString) + node = tree + parts = splitpath(path) + for part in parts + node = node[part] + end + return node + end + function set_tree_index!(tree::Dict, value, path::AbstractString) + node = tree + parts = splitpath(path) + for part in parts[1:end-1] + node = node[part] + end + node[parts[end]] = value + end + function prune_tree_index!(tree::Dict, path::AbstractString) + node = tree + parts = splitpath(path) + for part in parts[1:end-1] + node = node[part] + end + delete!(node, parts[end]) + end + + # copy hashes + for (path, what) in paths + if what isa AbstractString + what_hash = get_tree_index(tree, what) + set_tree_index!(tree, what_hash, path) + elseif what == :symlink + # external symlink + prune_tree_index!(tree, path) + end + end + end + # prune directories that don't contain any files if skip_empty prune_empty!(node::Tuple) = true diff --git a/test/runtests.jl b/test/runtests.jl index 2212b0e..0325a4c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -83,11 +83,26 @@ end @testset "Tar.tree_hash" begin arg_readers(tarball) do tar @arg_test tar @test Tar.tree_hash(tar) == hash + @arg_test tar @test Tar.tree_hash(tar; copy_symlinks=true) != hash @arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar) + @arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar; copy_symlinks=true) @arg_test tar @test empty_tree_sha1 == Tar.tree_hash(hdr->false, tar, algorithm="git-sha1") + @arg_test tar @test 
empty_tree_sha1 == + Tar.tree_hash(hdr->false, tar, algorithm="git-sha1", copy_symlinks=true) @arg_test tar @test empty_tree_sha256 == Tar.tree_hash(hdr->false, tar, algorithm="git-sha256") + @arg_test tar @test empty_tree_sha256 == + Tar.tree_hash(hdr->false, tar, algorithm="git-sha256", copy_symlinks=true) + end + NON_STDLIB_TESTS && begin + iso_codes_tarball = Downloads.download("https://github.com/JuliaBinaryWrappers/iso_codes_jll.jl/releases/download/iso_codes-v4.11.0+0/iso_codes.v4.11.0.any.tar.gz") + open(GzipDecompressorStream, iso_codes_tarball) do io + @test Tar.tree_hash(io) == "71f68a3d55d73f2e15a3969c241fae2349b1feb5" + end + open(GzipDecompressorStream, iso_codes_tarball) do io + @test Tar.tree_hash(io; copy_symlinks=true) == "409d6ac4c02dae43ff4fe576b5c5820d0386fb3f" + end end end @testset "Tar.list & check properties" begin @@ -484,6 +499,7 @@ end @test read(path, String) == data end dir = Tar.extract(tarball, copy_symlinks=true) + @test tree_hash(dir) == Tar.tree_hash(tarball; copy_symlinks=true) test_file("file", data₁) test_file("link-file", data₁) test_none("link-file-slash") diff --git a/test/setup.jl b/test/setup.jl index 18d994b..3f2fc0a 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -10,6 +10,8 @@ const NON_STDLIB_TESTS = Main == @__MODULE__ if NON_STDLIB_TESTS using SimpleBufferStream + using CodecZlib + using Downloads using Tar_jll if isdefined(Tar_jll, :tar)