From 6bf32a2f034eba922d2b867c2660f344ef10e041 Mon Sep 17 00:00:00 2001 From: Chris Foster Date: Thu, 12 May 2022 16:16:54 +1000 Subject: [PATCH] WIP: fixes for open(dataset, write=true) and TOML storage backend --- src/BlobTree.jl | 3 ++ src/DataSet.jl | 27 ++++++++--- src/DataSets.jl | 9 ++-- src/TomlDataStorage.jl | 97 ++++++++++++++++++++++++++------------- src/data_project.jl | 29 +++++++++--- src/file_data_projects.jl | 8 ++-- src/filesystem.jl | 18 +++----- 7 files changed, 129 insertions(+), 62 deletions(-) diff --git a/src/BlobTree.jl b/src/BlobTree.jl index e754113..4615633 100644 --- a/src/BlobTree.jl +++ b/src/BlobTree.jl @@ -470,6 +470,9 @@ end # Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...) +function close_dataset(storage::Union{Blob,BlobTree}, exc=nothing) + close_dataset(storage.root) +end #------------------------------------------------------------------------------- # Path manipulation diff --git a/src/DataSet.jl b/src/DataSet.jl index 0ed33c0..eafc997 100644 --- a/src/DataSet.jl +++ b/src/DataSet.jl @@ -161,13 +161,31 @@ end #------------------------------------------------------------------------------- # Functions for opening datasets +# +# In principle we've got the following six variants: +# +# Scoped forms: +# +# open(dataset; kws...) do ... MISSING!! +# open(T, dataset; kws...) do ... +# +# Context manager: +# +# x = open(ctx, dataset; kws...) +# x = open(ctx, T, dataset; kws...) +# +# Finalizer-based: +# x = open(dataset; kws...) +# x = open(T, dataset; kws...) + # do-block form of open() function Base.open(f::Function, as_type, dataset::DataSet; write=false) + driver = _find_driver(dataset) if driver isa AbstractDataDriver storage = open_dataset(driver, dataset, write) try - open(f, as_type, storage) + open(f, as_type, storage, write=write) close_dataset(storage) catch exc close_dataset(storage, exc) @@ -193,7 +211,7 @@ end # Old deprecated API # Use `enter_do` because drivers are just functions if write - error("Cannot use `write=true` with the new API.") + error("Cannot use `write=true` with the old storage API.") end storage_config = dataset.storage (storage,) = @! enter_do(driver, storage_config, dataset) @@ -203,12 +221,9 @@ end @! function Base.open(as_type, dataset::DataSet; write=false) storage = @! open(dataset; write=write) - @! open(as_type, storage) + @! open(as_type, storage; write=write) end -# TODO: -# Consider making a distinction between open() and load(). - # Finalizer-based version of open() function Base.open(dataset::DataSet; write=false) @context begin diff --git a/src/DataSets.jl b/src/DataSets.jl index c6737c3..5ddd202 100644 --- a/src/DataSets.jl +++ b/src/DataSets.jl @@ -58,6 +58,8 @@ name="" """ const CURRENT_DATA_CONFIG_VERSION = 1 +const ConfigDict = Dict{String,Any} + include("paths.jl") include("DataSet.jl") include("data_project.jl") @@ -221,14 +223,15 @@ include("entrypoint.jl") # Builtin Data models include("BlobTree.jl") -# Builtin backends +# Builtin data drivers include("filesystem.jl") include("TomlDataStorage.jl") - -# Backends # include("ZipTree.jl") # include("GitTree.jl") +add_storage_driver("FileSystem"=>FileSystemDriver()) +add_storage_driver("TomlDataStorage"=>TomlDataDriver()) + # Application-level stuff include("repl.jl") diff --git a/src/TomlDataStorage.jl b/src/TomlDataStorage.jl index 31d1707..b055fe3 100644 --- a/src/TomlDataStorage.jl +++ b/src/TomlDataStorage.jl @@ -28,17 +28,34 @@ For BlobTree: ... ``` """ -struct TomlDataStorage +struct TomlDataRoot dataset::DataSet - data::Union{String,Dict{String,Any}} + data::Union{Vector{UInt8},ConfigDict} + write::Bool +end + +_data_strings_to_buffers(data::String) = base64decode(data) +function _data_strings_to_buffers(data::Dict) + ConfigDict(k=>_data_strings_to_buffers(v) for (k,v) in pairs(data)) +end +_data_strings_to_buffers(data) = error("Unexpected embedded data: expected a string or dictionary") + +_data_buffers_to_strings(data::Vector{UInt8}) = base64encode(data) +function _data_buffers_to_strings(data::Dict) + ConfigDict(k=>_data_buffers_to_strings(v) for (k,v) in pairs(data)) end # Get TOML data at `path`, returning nothing if not present -function _getpath(storage::TomlDataStorage, path::RelPath) +function _getpath(storage::TomlDataRoot, path::RelPath) x = storage.data for c in path.components + if !(x isa AbstractDict) + return nothing + end x = get(x, c, nothing) - !isnothing(x) || return nothing + if isnothing(x) + return nothing + end end x end @@ -46,13 +63,13 @@ end #-------------------------------------------------- # Storage data interface for trees -Base.isdir(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa Dict -Base.isfile(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa String -Base.ispath(storage::TomlDataStorage, path::RelPath) = !isnothing(_getpath(storage, path)) +Base.isdir(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa Dict +Base.isfile(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa String +Base.ispath(storage::TomlDataRoot, path::RelPath) = !isnothing(_getpath(storage, path)) -Base.summary(io::IO, storage::TomlDataStorage) = print(io, "Data.toml") +Base.summary(io::IO, storage::TomlDataRoot) = print(io, "Data.toml") -function Base.readdir(storage::TomlDataStorage, path::RelPath) +function Base.readdir(storage::TomlDataRoot, path::RelPath) try tree = _getpath(storage, path) !isnothing(tree) || KeyError(path) @@ -66,62 +83,78 @@ end # Storage data interface for Blob function Base.open(func::Function, as_type::Type{IO}, - storage::TomlDataStorage, path; kws...) - @context func(@! open(as_type, storage, path; kws...)) + storage::TomlDataRoot, path; write=false, kws...) + @context func(@! open(as_type, storage, path; write=false, kws...)) end -@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataStorage, path; - write=false, read=true, kws...) - if write - error("Embedded data is read-only from within the DataSets interface") - end +@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataRoot, path; + write=false) try - str = _getpath(storage, path) - !isnothing(str) || KeyError(path) - base64decode(str::AbstractString) + buf = _getpath(storage, path) + !isnothing(buf) || KeyError(path) + return buf catch error("TOML storage requires data to be as base64 encoded strings") end end -@! function Base.open(::Type{IO}, storage::TomlDataStorage, path; kws...) - buf = @! open(Vector{UInt8}, storage, path; kws...) - IOBuffer(buf) +@! function Base.open(::Type{IO}, storage::TomlDataRoot, path; write=false) + buf = @! open(Vector{UInt8}, storage, path; write=write) + if write + # For consistency with filesystem version of open() + resize!(buf,0) + end + return IOBuffer(buf, write=write) +end + +@! function Base.open(::Type{String}, storage::TomlDataRoot, path; write=false) + buf = @! open(Vector{UInt8}, storage, path; write=write) + return String(copy(buf)) end +function close_dataset(storage::TomlDataRoot, exc=nothing) + if storage.write + encoded_data = _data_buffers_to_strings(storage.data) + # Force writing of dataset to project + conf = copy(storage.dataset.storage) + conf["data"] = encoded_data + config(storage.dataset; storage=conf) + end +end # TODO: The following should be factored out and implemented generically -function Base.read(storage::TomlDataStorage, path::RelPath, ::Type{T}) where {T} +function Base.read(storage::TomlDataRoot, path::RelPath, ::Type{T}) where {T} @context begin io = @! open(IO, storage, path) read(io, T) end end -function Base.read(storage::TomlDataStorage, path::RelPath) +function Base.read(storage::TomlDataRoot, path::RelPath) @context @! open(Vector{UInt8}, storage, path) end #------------------------------------------------------------------------------- -# Connect storage backend -function connect_toml_data_storage(f, config, dataset) - type = config["type"] - data = get(config, "data", nothing) + +struct TomlDataDriver <: AbstractDataDriver +end + +function open_dataset(driver::TomlDataDriver, dataset, write) + type = dataset.storage["type"] + data = get(dataset.storage, "data", nothing) if type == "Blob" if !(data isa AbstractString) error("TOML data storage requires string data in the \"storage.data\" key") end - f(Blob(TomlDataStorage(dataset, data))) + return Blob(TomlDataRoot(dataset, _data_strings_to_buffers(data), write)) elseif type == "BlobTree" if !(data isa AbstractDict) error("TOML data storage requires a dictionary in the \"storage.data\" key") end - f(BlobTree(TomlDataStorage(dataset, data))) + return BlobTree(TomlDataRoot(dataset, _data_strings_to_buffers(data), write)) else throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml")) end end -add_storage_driver("TomlDataStorage"=>connect_toml_data_storage) - diff --git a/src/data_project.jl b/src/data_project.jl index 0b7f8e7..1c04d74 100644 --- a/src/data_project.jl +++ b/src/data_project.jl @@ -41,7 +41,7 @@ function dataset(proj::AbstractDataProject, spec::AbstractString) # Enhance dataset with "dataspec" holding URL-like fragment & query dataspec = Dict() if !isnothing(query) - dataspec["query"] = Dict{String,Any}(query) + dataspec["query"] = ConfigDict(query) end if !isnothing(fragmentstr) dataspec["fragment"] = fragmentstr @@ -222,13 +222,13 @@ A in-memory collection of DataSets. """ struct DataProject <: AbstractDataProject datasets::Dict{String,DataSet} - drivers::Vector{Dict{String,Any}} + drivers::Vector{ConfigDict} end -DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}()) +DataProject() = DataProject(Dict{String,DataSet}(), Vector{ConfigDict}()) DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)), - Vector{Dict{String,Any}}()) + Vector{ConfigDict}()) data_drivers(project::DataProject) = project.drivers @@ -247,9 +247,21 @@ function Base.iterate(proj::DataProject, state=nothing) end function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString) + if haskey(proj, name) && proj[name] !== data + throw(ArgumentError("Cannot replace existing dataset with name \"$name\". Try DataSets.delete() first.")) + end + if isnothing(data.project) + data.project = proj + elseif data.project !== proj + throw(ArgumentError("DataSet is already owned by a different project")) + end proj.datasets[name] = data end +function delete(proj::DataProject, name::AbstractString) + delete!(proj.datasets, name) +end + #------------------------------------------------------------------------------- """ StackedDataProject() @@ -385,9 +397,14 @@ function save_project(path::AbstractString, proj::DataProject) close(tmpio) mv(tmppath, path, force=true) end + return nothing end -function create(name; kws...) +#------------------------------------------------------------------------------- +# Global versions of the dataset metadata manipulation functions which act on +# the global dataset PROJECT object. + +function create(name::AbstractString; kws...) ds = create(PROJECT, name; kws...) if isnothing(ds) error("Could not create dataset in any available data project") @@ -395,7 +412,7 @@ function create(name; kws...) return ds end -function delete(name) +function delete(name::AbstractString) delete(PROJECT, name) end diff --git a/src/file_data_projects.jl b/src/file_data_projects.jl index 874008e..dd34c97 100644 --- a/src/file_data_projects.jl +++ b/src/file_data_projects.jl @@ -203,13 +203,15 @@ end project_name(proj::TomlFileDataProject) = proj.path function Base.setindex!(proj::TomlFileDataProject, data::DataSet, name::AbstractString) - p = _get_cached(proj) + p = get_cache(proj) p[name] = data save_project(proj.path, p) end function delete(proj::TomlFileDataProject, name::AbstractString) - p = _get_cached(proj) + # FIXME: Make this safe for concurrent use in-process + # (or better, between processes?) + p = get_cache(proj) ds = dataset(p, name) # Assume all datasets which don't have the "linked" property are linked. @@ -221,7 +223,7 @@ function delete(proj::TomlFileDataProject, name::AbstractString) delete_storage(proj, driver, ds) end - delete!(p.datasets, name) + delete(p, name) save_project(proj.path, p) end diff --git a/src/filesystem.jl b/src/filesystem.jl index e62c1fc..c3d54e9 100644 --- a/src/filesystem.jl +++ b/src/filesystem.jl @@ -75,6 +75,11 @@ end @! open(sys_abspath(root, path); read=read, write=write, kws...) end +# FIXME: close() vs close_dataset() ? +function close_dataset(storage::AbstractFileSystemRoot, exc=nothing) + # Nothing to do here — data is already on the filesystem. +end + Base.read(root::AbstractFileSystemRoot, path::RelPath, ::Type{T}) where {T} = read(sys_abspath(root, path), T) Base.read(root::AbstractFileSystemRoot, path::RelPath) = @@ -270,6 +275,7 @@ function mv_force_with_dest_rollback(srcpath, destpath, tempdir_parent) end end +# Consider deprecating or generalizing this? function Base.setindex!(tree::BlobTree{<:AbstractFileSystemRoot}, tmpdata::Union{Blob{TempFilesystemRoot},BlobTree{TempFilesystemRoot}}, name::AbstractString) @@ -301,13 +307,6 @@ end struct FileSystemDriver <: AbstractDataDriver end -driver_name(::FileSystemDriver) = "FileSystemDriver" - -function (driver::FileSystemDriver)(user_func, config, dataset) - storage = open_dataset(driver, dataset, false) - user_func(storage) -end - function open_dataset(driver::FileSystemDriver, dataset, write) config = dataset.storage path = config["path"] @@ -328,9 +327,6 @@ function open_dataset(driver::FileSystemDriver, dataset, write) return storage end -function close_dataset(storage::Union{Blob,BlobTree}, exc=nothing) -end - function create_storage(proj, driver::FileSystemDriver, name::AbstractString; source::Union{Nothing,DataSet}=nothing, @@ -395,5 +391,3 @@ function delete_storage(proj, driver::FileSystemDriver, ds::DataSet) end -add_storage_driver("FileSystem"=>FileSystemDriver()) -