Skip to content

Commit

Permalink
Big refactor of newfile() / newdir() + BlobTree API improvements
Browse files Browse the repository at this point in the history
Refactor newfile() / newdir() into primarily in-place APIs. A
temporary directory can still be created with newdir().
Before this refactor the API looked slightly nicer, but we would never
be able to reach the efficiency of the native file APIs. In
particular, we might end up moving data across devices as a last step
in constructing a directory tree — this seemed bad!

Also:

* Further document the BlobTree API
* Allow path strings as keys in more places in BlobTree
  • Loading branch information
c42f committed Apr 26, 2022
1 parent 5bb953a commit 4872395
Show file tree
Hide file tree
Showing 2 changed files with 245 additions and 107 deletions.
267 changes: 193 additions & 74 deletions src/BlobTree.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,6 @@ function Base.iterate(tree::AbstractBlobTree, state=nothing)
end
end

"""
children(tree::AbstractBlobTree)
Return an array of the children of `tree`. A child `x` may abstractly either be
another tree (`children(x)` returns a collection) or a file, where `children(x)`
returns `()`.
Note that this is subtly different from `readdir(path)` which returns relative
paths, or `readdir(path, join=true)` which returns absolute paths.
"""
function children(tree::AbstractBlobTree)
# TODO: Is dispatch to the root a correct default?
children(tree.root, tree.path)
end


"""
showtree([io,], tree)
Expand Down Expand Up @@ -100,13 +84,12 @@ end

function Base.copy!(dst::AbstractBlobTree, src::AbstractBlobTree)
for x in src
newpath = joinpath(dst, basename(x))
xname = basename(x)
if isdir(x)
newdir = mkdir(newpath)
copy!(newdir, x)
copy!(newdir(dst, xname), x)
else
open(x) do io_src
open(newpath, write=true) do io_dst
newfile(dst, xname, write=true) do io_dst
write(io_dst, io_src)
end
end
Expand Down Expand Up @@ -255,34 +238,51 @@ BlobTree has a largely dictionary-like interface:
* List keys and values: `pairs(tree)`
* Query keys: `haskey(tree, "path")`
* Traverse the tree: `tree["path"]`
* Add new content: `tree["path"] = file`
* Add new content: `newdir(tree, "path")`, `newfile(tree, "path")`
* Delete content: `delete!(tree, "path")`
Unlike Dict, iteration of BlobTree iterates values (not key value pairs). This
has some benefits - for example, broadcasting processing across files in a
directory works.
Querying
directory.
* Property access
- `isdir()`, `isfile()` - determine whether a child of tree is a directory or file.
# Example
Normally you'd construct these via the [`dataset`](@ref) function which takes
care of constructing the correct `root` object. However, here's a direct
demonstration:
You can create a new temporary BlobTree via the `newdir()` function:
```
julia> dir = newdir()
for i = 1:3
newfile(dir, "\$i/a.txt") do io
println(io, "Content of a")
end
newfile(dir, "b-\$i.txt") do io
println(io, "Content of b")
end
end
dir
📂 Tree @ /tmp/jl_Sp6wMF
📁 1
📁 2
📁 3
📄 b-1.txt
📄 b-2.txt
📄 b-3.txt
```
You can also get access to a `BlobTree` by using `DataSets.from_path()` with a
local directory name. For example:
```
julia> tree = BlobTree(DataSets.FileSystemRoot(dirname(pathof(DataSets))), path"../test/data")
📂 Tree ../test/data @ /home/chris/.julia/dev/DataSets/src
📁 csvset
📄 file.txt
📄 foo.txt
📄 people.csv.gz
julia> tree["csvset"]
📂 Tree ../test/data/csvset @ /home/chris/.julia/dev/DataSets/src
📄 1.csv
📄 2.csv
julia> using Pkg
open(DataSets.from_path(joinpath(Pkg.dir("DataSets"), "src")))
📂 Tree @ ~/.julia/dev/DataSets/src
📄 DataSet.jl
📄 DataSets.jl
📄 DataTomlStorage.jl
...
```
"""
mutable struct BlobTree{Root} <: AbstractBlobTree
Expand All @@ -292,28 +292,27 @@ end

BlobTree(root) = BlobTree(root, RelPath())

function AbstractTrees.printnode(io::IO, tree::BlobTree)
print(io, "📂 ", basename(tree))
end

function Base.show(io::IO, ::MIME"text/plain", tree::AbstractBlobTree)
function Base.show(io::IO, ::MIME"text/plain", tree::BlobTree)
# TODO: Ideally we'd use
# AbstractTrees.print_tree(io, tree, 1)
# However, this is hard to use efficiently; we'd need to implement a lazy
# `children()` for all our trees. It'd be much easier if
# `AbstractTrees.has_children()` was used consistently upstream.
cs = children(tree)
println(io, "📂 Tree ", tree.path, " @ ", summary(tree.root))
for (i, c) in enumerate(cs)
print(io, " ", isdir(c) ? '📁' : '📄', " ", basename(c))
if i != length(cs)
first = true
for (name,x) in pairs(tree)
if first
first = false
else
print(io, '\n')
end
print(io, " ", isdir(x) ? '📁' : '📄', " ", name)
end
end

Base.basename(tree::BlobTree) = basename(tree.path)
Base.abspath(tree::BlobTree) = AbsPath(tree.root, tree.path)
# Print the single-line node label for `tree`: a folder icon followed by the
# tree's basename.
AbstractTrees.printnode(io::IO, tree::BlobTree) = print(io, "📂 ", basename(tree))

# getindex vs joinpath:
# - getindex is about indexing the datastructure; therefore it looks in the
Expand All @@ -339,49 +338,134 @@ function Base.getindex(tree::BlobTree, name::AbstractString)
getindex(tree, RelPath(name))
end

function Base.delete!(tree::BlobTree, path::RelPath)
relpath = joinpath(tree.path, path)
root = tree.root
delete!(root, relpath)

# Keys, values and iteration

"""
    children(tree::BlobTree)

Return an array holding each child of `tree`, obtained by indexing `tree` with
every name in `keys(tree)`. A child `x` may abstractly be either another tree
(`children(x)` returns a collection) or a file, where `children(x)` returns
`()`.
"""
function children(tree::BlobTree)
    names = keys(tree)
    # Wrap each name in a single-component RelPath so that it is looked up as
    # one path component.
    return [tree[RelPath([name])] for name in names]
end

function Base.delete!(tree::BlobTree, name::AbstractString)
delete!(tree, RelPath(name))
# String convenience method: convert `path` to a `RelPath` and defer to the
# `RelPath` method of `haskey`.
Base.haskey(tree::BlobTree, path::AbstractString) = haskey(tree, RelPath(path))

# We've got a weird mishmash of path vs tree handling here.
# TODO: Can we refactor this to cleanly separate the filesystem-like commands
# (which take abstract paths?) from BlobTree and Blob which act as an
# abstraction over the filesystem or other storage mechanisms?
function Base.joinpath(tree::BlobTree, r::RelPath)
AbsPath(tree.root, joinpath(tree.path, r))
# Return whether `path`, taken relative to this tree's own path, exists within
# the tree's root (as reported by `ispath` on the root).
function Base.haskey(tree::BlobTree, path::RelPath)
    fullpath = joinpath(tree.path, path)
    return ispath(tree.root, fullpath)
end

function Base.joinpath(tree::BlobTree, s::AbstractString)
AbsPath(tree.root, joinpath(tree.path, s))
# The keys of a tree are whatever `readdir` on the root returns for the
# tree's path.
Base.keys(tree::BlobTree) = readdir(tree.root, tree.path)

function Base.haskey(tree::BlobTree, name::AbstractString)
ispath(tree.root, joinpath(tree.path, name))
# Return an iterator of `(name, child)` pairs for the entries of `tree`.
#
# The names are listed exactly once and the children are looked up from that
# same listing. Calling `keys(tree)` and `children(tree)` separately would list
# the directory twice, which costs an extra backend call and could pair names
# with the wrong children if the tree changed between the two listings.
function Base.pairs(tree::BlobTree)
    names = keys(tree)
    # Same lookup as `children(tree)`: each name is a single path component.
    return zip(names, [tree[RelPath([n])] for n in names])
end

function Base.readdir(tree::BlobTree)
readdir(tree.root, tree.path)
# The values of a tree are its children.
Base.values(tree::BlobTree) = children(tree)

function Base.keys(tree::BlobTree)
readdir(tree.root, tree.path)

# Mutation

# String convenience method: convert `path` to a `RelPath` and forward all
# keyword arguments to the `RelPath` method.
function newdir(tree::BlobTree, path::AbstractString; kws...)
    return newdir(tree, RelPath(path); kws...)
end
# String convenience method: convert `path` to a `RelPath` and forward all
# keyword arguments to the `RelPath` method.
function newfile(tree::BlobTree, path::AbstractString; kws...)
    return newfile(tree, RelPath(path); kws...)
end
# String convenience method for the `do`-block form: convert `path` to a
# `RelPath` and forward `func` and all keyword arguments.
function newfile(func::Function, tree::BlobTree, path::AbstractString; kws...)
    return newfile(func, tree, RelPath(path); kws...)
end
# String convenience method: convert `path` to a `RelPath` and defer to the
# `RelPath` method of `delete!`.
function Base.delete!(tree::BlobTree, path::AbstractString)
    return delete!(tree, RelPath(path))
end

# Throw an error unless the root of `tree` reports itself as writeable.
# Returns `nothing` when the check passes.
function _check_writeable(tree)
    iswriteable(tree.root) && return nothing
    error("Attempt to write into a read-only tree with root $(tree.root)")
end

function Base.rm(tree::BlobTree; kws...)
rm(tree.root, tree.path; kws...)
# Prepare `tree` for creating a new item at `path`.
#
# Errors if the tree is not writeable. If something already exists at `path`,
# it is deleted when `overwrite` is true; otherwise an error is thrown.
function _check_new_item(tree, path, overwrite)
    _check_writeable(tree)
    # Nothing in the way: there is nothing to clear.
    haskey(tree, path) || return nothing
    overwrite || error("Overwriting a path $path which already exists requires the keyword `overwrite=true`")
    return delete!(tree, path)
end

function children(tree::BlobTree)
child_names = readdir(tree)
[tree[c] for c in child_names]
"""
    newdir(tree, path; overwrite=false)

Create a new directory at `tree[path]` and return it as a `BlobTree`. An error
is thrown if the tree's root is not writeable, or if something already exists
at `path` and `overwrite` is false. If `overwrite=true`, whatever already
exists at `path` is deleted before the new directory is created.

    newdir()

Create a new temporary `BlobTree` which can have files assigned into it and may
be assigned to a permanent location in a persistent `BlobTree`. If not assigned
to a permanent location, the temporary tree is cleaned up during garbage
collection.
"""
function newdir(tree::BlobTree, path::RelPath; overwrite=false)
    _check_new_item(tree, path, overwrite)
    # `path` is already a RelPath, so it is joined directly, as in the
    # sibling `newfile` and `delete!` methods.
    p = joinpath(tree.path, path)
    newdir(tree.root, p)
    return BlobTree(tree.root, p)
end

"""
    newfile(tree, path; overwrite=false)
    newfile(tree, path; overwrite=false) do io ...

Create a new file object in the `tree` at the given `path`. In the second form,
the open file `io` will be passed to the do block.

    newfile()

Create a new file which may be later assigned to a permanent location in a
tree. If not assigned to a permanent location, the temporary file is cleaned up
during garbage collection.

# Example

```
newfile(tree, "some/demo/path.txt") do io
    println(io, "Hi there!")
end
```
"""
function newfile(tree::BlobTree, path::RelPath; overwrite=false)
    _check_new_item(tree, path, overwrite)
    root = tree.root
    fullpath = joinpath(tree.path, path)
    # Create the file in the root, then hand back a Blob referring to it.
    newfile(root, fullpath)
    return Blob(root, fullpath)
end

# `do`-block form of `newfile`: after the same writeability/overwrite checks,
# `func` is passed on to the root's `newfile` for the joined path, and a `Blob`
# referring to the new file is returned.
function newfile(func::Function, tree::BlobTree, path::RelPath; overwrite=false)
    _check_new_item(tree, path, overwrite)
    root = tree.root
    fullpath = joinpath(tree.path, path)
    newfile(func, root, fullpath)
    return Blob(root, fullpath)
end


# Delete the item at `path` (relative to this tree) from the tree's root.
# Errors if the root is not writeable.
function Base.delete!(tree::BlobTree, path::RelPath)
    _check_writeable(tree)
    return delete!(tree.root, joinpath(tree.path, path))
end



# Opening a BlobTree "as a BlobTree" simply calls `f` on the tree itself; any
# keyword arguments are accepted and ignored.
Base.open(f::Function, ::Type{BlobTree}, tree::BlobTree; kws...) = f(tree)
Expand All @@ -391,3 +475,38 @@ end
end

# Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...)


#-------------------------------------------------------------------------------
# Path manipulation

# TODO: Maybe deprecate these? Under the "datastructure-like" model, it seems wrong
# for a blob to know its name in the parent data structure.
# The basename of a tree is the basename of its path.
function Base.basename(tree::BlobTree)
    return basename(tree.path)
end
# Combine the tree's root and path into an `AbsPath`.
function Base.abspath(tree::BlobTree)
    return AbsPath(tree.root, tree.path)
end

# Join `r` onto the tree's path and return the result as an `AbsPath` on the
# tree's root.
Base.joinpath(tree::BlobTree, r::RelPath) = AbsPath(tree.root, joinpath(tree.path, r))

# Join the string `s` onto the tree's path and return the result as an
# `AbsPath` on the tree's root.
Base.joinpath(tree::BlobTree, s::AbstractString) = AbsPath(tree.root, joinpath(tree.path, s))


#-------------------------------------------------------------------------------
# Deprecated
# Remove this tree's path from its root, forwarding all keyword arguments to
# the root's `rm`. Errors if the root is not writeable.
function Base.rm(tree::BlobTree; kws...)
    _check_writeable(tree)
    root, path = tree.root, tree.path
    return rm(root, path; kws...)
end

# List the tree's directory by calling `readdir` on the root with the tree's
# path.
Base.readdir(tree::BlobTree) = readdir(tree.root, tree.path)

# Create files within a temporary directory.
# TODO: Deprecate in order to encourage the in-place version.
# Single-argument form: defer to `newdir` on the tree's root.
function newdir(tree::BlobTree)
    return newdir(tree.root)
end
# Single-argument form: defer to `newfile` on the tree's root.
function newfile(tree::BlobTree)
    return newfile(tree.root)
end

Loading

0 comments on commit 4872395

Please sign in to comment.