WIP: fixes for open(dataset, write=true) and TOML storage backend
c42f committed May 12, 2022
1 parent 28f9a72 commit 6bf32a2
Showing 7 changed files with 129 additions and 62 deletions.
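For context, the user-facing behaviour this commit works towards looks roughly as follows. This is an illustrative sketch only, not part of the diff: the dataset name is hypothetical, and whether the write path round-trips correctly is exactly what these WIP fixes address.

```julia
using DataSets

ds = dataset("some.blob")        # hypothetical dataset in the active project

# Scoped (do-block) form with the new `write=true` keyword: the driver's
# storage is opened via open_dataset(driver, dataset, true) and flushed by
# close_dataset(storage) when the block exits.
open(IO, ds; write=true) do io
    write(io, "new content")
end

# Reading back uses the same entry points with the default write=false.
content = open(String, ds)
```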
3 changes: 3 additions & 0 deletions src/BlobTree.jl
@@ -470,6 +470,9 @@ end

# Base.open(::Type{T}, file::Blob; kws...) where {T} = open(identity, T, file.root, file.path; kws...)

function close_dataset(storage::Union{Blob,BlobTree}, exc=nothing)
    close_dataset(storage.root)
end

#-------------------------------------------------------------------------------
# Path manipulation
27 changes: 21 additions & 6 deletions src/DataSet.jl
@@ -161,13 +161,31 @@ end

#-------------------------------------------------------------------------------
# Functions for opening datasets
#
# In principle we've got the following six variants:
#
# Scoped forms:
#
#   open(dataset; kws...) do ... MISSING!!
#   open(T, dataset; kws...) do ...
#
# Context manager:
#
#   x = open(ctx, dataset; kws...)
#   x = open(ctx, T, dataset; kws...)
#
# Finalizer-based:
#   x = open(dataset; kws...)
#   x = open(T, dataset; kws...)


# do-block form of open()
function Base.open(f::Function, as_type, dataset::DataSet; write=false)
    driver = _find_driver(dataset)
    if driver isa AbstractDataDriver
        storage = open_dataset(driver, dataset, write)
        try
            open(f, as_type, storage)
            open(f, as_type, storage, write=write)
            close_dataset(storage)
        catch exc
            close_dataset(storage, exc)
@@ -193,7 +211,7 @@ end
        # Old deprecated API
        # Use `enter_do` because drivers are just functions
        if write
            error("Cannot use `write=true` with the new API.")
            error("Cannot use `write=true` with the old storage API.")
        end
        storage_config = dataset.storage
        (storage,) = @! enter_do(driver, storage_config, dataset)
@@ -203,12 +221,9 @@ end

@! function Base.open(as_type, dataset::DataSet; write=false)
    storage = @! open(dataset; write=write)
    @! open(as_type, storage)
    @! open(as_type, storage; write=write)
end

# TODO:
# Consider making a distinction between open() and load().

# Finalizer-based version of open()
function Base.open(dataset::DataSet; write=false)
    @context begin
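The three families listed in the comment at the top of this file's diff look roughly like this in use. This is a sketch rather than DataSets' documented API surface: `ds` and the dataset name are hypothetical, and `@context`/`@!` come from ResourceContexts, as used throughout this diff.

```julia
using DataSets, ResourceContexts

ds = dataset("some.data")     # hypothetical name in the active project

# Scoped (do-block) form: the storage is closed when the block returns.
open(String, ds) do s
    println(length(s))
end

# Context-manager form: the resource lives until the enclosing @context exits.
@context begin
    io = @! open(IO, ds)
    println(read(io, String))
end

# Finalizer-based form: cleanup is left to a finalizer and the GC.
s = open(String, ds)
```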
9 changes: 6 additions & 3 deletions src/DataSets.jl
@@ -58,6 +58,8 @@ name="<driver name>"
"""
const CURRENT_DATA_CONFIG_VERSION = 1

const ConfigDict = Dict{String,Any}

include("paths.jl")
include("DataSet.jl")
include("data_project.jl")
@@ -221,14 +223,15 @@ include("entrypoint.jl")
# Builtin Data models
include("BlobTree.jl")

# Builtin backends
# Builtin data drivers
include("filesystem.jl")
include("TomlDataStorage.jl")

# Backends
# include("ZipTree.jl")
# include("GitTree.jl")

add_storage_driver("FileSystem"=>FileSystemDriver())
add_storage_driver("TomlDataStorage"=>TomlDataDriver())

# Application-level stuff
include("repl.jl")

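The registration above replaces the old function-valued driver (`connect_toml_data_storage`, removed at the end of TomlDataStorage.jl below) with a driver type. Here is a self-contained sketch of the shape of that protocol: it mirrors `AbstractDataDriver`, `open_dataset` and `close_dataset` as used in this commit, but does not plug into DataSets itself, and every `Memory*` name is made up for illustration (in DataSets the second argument to `open_dataset` is the `DataSet` itself, not a key string).

```julia
# Stand-alone mirror of the driver protocol: a driver type is registered by
# name, open_dataset(driver, dataset, write) returns a storage root, and
# close_dataset(root, exc) flushes and releases it.
abstract type AbstractDataDriver end

struct MemoryDriver <: AbstractDataDriver
    store::Dict{String,Vector{UInt8}}
end

struct MemoryRoot
    store::Dict{String,Vector{UInt8}}
    key::String
    write::Bool
end

open_dataset(d::MemoryDriver, key::AbstractString, write::Bool) =
    MemoryRoot(d.store, key, write)

function close_dataset(root::MemoryRoot, exc=nothing)
    # A real driver persists buffered writes here, as TomlDataRoot's
    # close_dataset below does by re-encoding its buffers to base64.
    return nothing
end

# Registration is name => driver instance, matching
#   add_storage_driver("TomlDataStorage" => TomlDataDriver())
const STORAGE_DRIVERS = Dict{String,AbstractDataDriver}()
STORAGE_DRIVERS["MemoryStorage"] = MemoryDriver(Dict{String,Vector{UInt8}}())
```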
97 changes: 65 additions & 32 deletions src/TomlDataStorage.jl
@@ -28,31 +28,48 @@ For BlobTree:
...
```
"""
struct TomlDataStorage
struct TomlDataRoot
    dataset::DataSet
    data::Union{String,Dict{String,Any}}
    data::Union{Vector{UInt8},ConfigDict}
    write::Bool
end

_data_strings_to_buffers(data::String) = base64decode(data)
function _data_strings_to_buffers(data::Dict)
    ConfigDict(k=>_data_strings_to_buffers(v) for (k,v) in pairs(data))
end
_data_strings_to_buffers(data) = error("Unexpected embedded data: expected a string or dictionary")

_data_buffers_to_strings(data::Vector{UInt8}) = base64encode(data)
function _data_buffers_to_strings(data::Dict)
    ConfigDict(k=>_data_buffers_to_strings(v) for (k,v) in pairs(data))
end

# Get TOML data at `path`, returning nothing if not present
function _getpath(storage::TomlDataStorage, path::RelPath)
function _getpath(storage::TomlDataRoot, path::RelPath)
    x = storage.data
    for c in path.components
        if !(x isa AbstractDict)
            return nothing
        end
        x = get(x, c, nothing)
        !isnothing(x) || return nothing
        if isnothing(x)
            return nothing
        end
    end
    x
end

#--------------------------------------------------
# Storage data interface for trees

Base.isdir(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa Dict
Base.isfile(storage::TomlDataStorage, path::RelPath) = _getpath(storage, path) isa String
Base.ispath(storage::TomlDataStorage, path::RelPath) = !isnothing(_getpath(storage, path))
Base.isdir(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa Dict
Base.isfile(storage::TomlDataRoot, path::RelPath) = _getpath(storage, path) isa Vector{UInt8}
Base.ispath(storage::TomlDataRoot, path::RelPath) = !isnothing(_getpath(storage, path))

Base.summary(io::IO, storage::TomlDataStorage) = print(io, "Data.toml")
Base.summary(io::IO, storage::TomlDataRoot) = print(io, "Data.toml")

function Base.readdir(storage::TomlDataStorage, path::RelPath)
function Base.readdir(storage::TomlDataRoot, path::RelPath)
    try
        tree = _getpath(storage, path)
        !isnothing(tree) || throw(KeyError(path))
@@ -66,62 +83,78 @@ end
# Storage data interface for Blob

function Base.open(func::Function, as_type::Type{IO},
                   storage::TomlDataStorage, path; kws...)
    @context func(@! open(as_type, storage, path; kws...))
                   storage::TomlDataRoot, path; write=false, kws...)
    @context func(@! open(as_type, storage, path; write=write, kws...))
end

@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataStorage, path;
                      write=false, read=true, kws...)
    if write
        error("Embedded data is read-only from within the DataSets interface")
    end
@! function Base.open(::Type{Vector{UInt8}}, storage::TomlDataRoot, path;
                      write=false)
    try
        str = _getpath(storage, path)
        !isnothing(str) || KeyError(path)
        base64decode(str::AbstractString)
        buf = _getpath(storage, path)
        !isnothing(buf) || throw(KeyError(path))
        return buf
    catch
        error("TOML storage requires data to be stored as base64-encoded strings")
    end
end

@! function Base.open(::Type{IO}, storage::TomlDataStorage, path; kws...)
    buf = @! open(Vector{UInt8}, storage, path; kws...)
    IOBuffer(buf)
@! function Base.open(::Type{IO}, storage::TomlDataRoot, path; write=false)
    buf = @! open(Vector{UInt8}, storage, path; write=write)
    if write
        # For consistency with filesystem version of open()
        resize!(buf, 0)
    end
    return IOBuffer(buf, write=write)
end

@! function Base.open(::Type{String}, storage::TomlDataRoot, path; write=false)
    buf = @! open(Vector{UInt8}, storage, path; write=write)
    return String(copy(buf))
end

function close_dataset(storage::TomlDataRoot, exc=nothing)
    if storage.write
        encoded_data = _data_buffers_to_strings(storage.data)
        # Force writing of dataset to project
        conf = copy(storage.dataset.storage)
        conf["data"] = encoded_data
        config(storage.dataset; storage=conf)
    end
end

# TODO: The following should be factored out and implemented generically
function Base.read(storage::TomlDataStorage, path::RelPath, ::Type{T}) where {T}
function Base.read(storage::TomlDataRoot, path::RelPath, ::Type{T}) where {T}
    @context begin
        io = @! open(IO, storage, path)
        read(io, T)
    end
end

function Base.read(storage::TomlDataStorage, path::RelPath)
function Base.read(storage::TomlDataRoot, path::RelPath)
    @context @! open(Vector{UInt8}, storage, path)
end


#-------------------------------------------------------------------------------
# Connect storage backend
function connect_toml_data_storage(f, config, dataset)
    type = config["type"]
    data = get(config, "data", nothing)

struct TomlDataDriver <: AbstractDataDriver
end

function open_dataset(driver::TomlDataDriver, dataset, write)
    type = dataset.storage["type"]
    data = get(dataset.storage, "data", nothing)
    if type == "Blob"
        if !(data isa AbstractString)
            error("TOML data storage requires string data in the \"storage.data\" key")
        end
        f(Blob(TomlDataStorage(dataset, data)))
        return Blob(TomlDataRoot(dataset, _data_strings_to_buffers(data), write))
    elseif type == "BlobTree"
        if !(data isa AbstractDict)
            error("TOML data storage requires a dictionary in the \"storage.data\" key")
        end
        f(BlobTree(TomlDataStorage(dataset, data)))
        return BlobTree(TomlDataRoot(dataset, _data_strings_to_buffers(data), write))
    else
        throw(ArgumentError("DataSet type $type not supported for data embedded in Data.toml"))
    end
end

add_storage_driver("TomlDataStorage"=>connect_toml_data_storage)
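For a concrete sense of the encoding this backend performs, here is a stand-alone round trip using only the Base64 standard library. It reproduces what `_data_strings_to_buffers` / `_data_buffers_to_strings` do for a single blob and, with a local `decode` helper written only for this sketch, for a nested tree.

```julia
using Base64

# A blob embedded in Data.toml sits as a base64 string under storage.data;
# opening decodes it to a byte buffer, closing with write=true re-encodes it.
embedded = base64encode("Hello, world!")
buf = base64decode(embedded)
@assert String(copy(buf)) == "Hello, world!"

# A BlobTree is a nested dictionary of such strings; the conversion is applied
# recursively, as in _data_strings_to_buffers above.
tree = Dict("a.txt" => base64encode("A"),
            "dir"   => Dict("b.txt" => base64encode("B")))
decode(x::AbstractString) = base64decode(x)
decode(x::AbstractDict)   = Dict(k => decode(v) for (k, v) in pairs(x))
@assert decode(tree)["dir"]["b.txt"] == UInt8['B']
```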

29 changes: 23 additions & 6 deletions src/data_project.jl
@@ -41,7 +41,7 @@ function dataset(proj::AbstractDataProject, spec::AbstractString)
    # Enhance dataset with "dataspec" holding URL-like fragment & query
    dataspec = Dict()
    if !isnothing(query)
        dataspec["query"] = Dict{String,Any}(query)
        dataspec["query"] = ConfigDict(query)
    end
    if !isnothing(fragmentstr)
        dataspec["fragment"] = fragmentstr
@@ -222,13 +223,13 @@ An in-memory collection of DataSets.
"""
struct DataProject <: AbstractDataProject
    datasets::Dict{String,DataSet}
    drivers::Vector{Dict{String,Any}}
    drivers::Vector{ConfigDict}
end

DataProject() = DataProject(Dict{String,DataSet}(), Vector{Dict{String,Any}}())
DataProject() = DataProject(Dict{String,DataSet}(), Vector{ConfigDict}())

DataProject(project::AbstractDataProject) = DataProject(Dict(pairs(project)),
                                                         Vector{Dict{String,Any}}())
                                                         Vector{ConfigDict}())

data_drivers(project::DataProject) = project.drivers

@@ -247,9 +247,21 @@ function Base.iterate(proj::DataProject, state=nothing)
end

function Base.setindex!(proj::DataProject, data::DataSet, name::AbstractString)
    if haskey(proj, name) && proj[name] !== data
        throw(ArgumentError("Cannot replace existing dataset with name \"$name\". Try DataSets.delete() first."))
    end
    if isnothing(data.project)
        data.project = proj
    elseif data.project !== proj
        throw(ArgumentError("DataSet is already owned by a different project"))
    end
    proj.datasets[name] = data
end

function delete(proj::DataProject, name::AbstractString)
    delete!(proj.datasets, name)
end

#-------------------------------------------------------------------------------
"""
StackedDataProject()
@@ -385,17 +397,22 @@ function save_project(path::AbstractString, proj::DataProject)
        close(tmpio)
        mv(tmppath, path, force=true)
    end
    return nothing
end

function create(name; kws...)
#-------------------------------------------------------------------------------
# Global versions of the dataset metadata manipulation functions which act on
# the global dataset PROJECT object.

function create(name::AbstractString; kws...)
    ds = create(PROJECT, name; kws...)
    if isnothing(ds)
        error("Could not create dataset in any available data project")
    end
    return ds
end

function delete(name)
function delete(name::AbstractString)
    delete(PROJECT, name)
end

8 changes: 5 additions & 3 deletions src/file_data_projects.jl
@@ -203,13 +203,15 @@ end
project_name(proj::TomlFileDataProject) = proj.path

function Base.setindex!(proj::TomlFileDataProject, data::DataSet, name::AbstractString)
    p = _get_cached(proj)
    p = get_cache(proj)
    p[name] = data
    save_project(proj.path, p)
end

function delete(proj::TomlFileDataProject, name::AbstractString)
    p = _get_cached(proj)
    # FIXME: Make this safe for concurrent use in-process
    # (or better, between processes?)
    p = get_cache(proj)

    ds = dataset(p, name)
    # Assume all datasets which don't have the "linked" property are linked.
@@ -221,7 +223,7 @@ function delete(proj::TomlFileDataProject, name::AbstractString)
        delete_storage(proj, driver, ds)
    end

    delete!(p.datasets, name)
    delete(p, name)
    save_project(proj.path, p)
end

(Diff for the seventh changed file not rendered here.)
