diff --git a/Project.toml b/Project.toml index f858150..7bbe09c 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.1.0" CommonDataModel = "1fbeeb36-5f17-413c-809b-666fb144f157" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DiskArrays = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Zarr = "0a941bbe-ad1d-11e8-39d9-ab76183a1d99" [compat] diff --git a/docs/src/index.md b/docs/src/index.md index 369b619..cf0cd3f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -8,6 +8,7 @@ Modules = [ZarrDatasets] ### Differences between Zarr and NetCDF files -* All metadata is stored in JSON files for Zarr with the following implications: +* All metadata (in particular attributes) is stored in JSON files for the Zarr format with the following implications: * JSON does not distinguish between integers and real numbers. They are all considered as generic numbers. Whole numbers are loaded as `Int64` and decimal numbers `Float64`. It is not possible to store the number `1.0` as a real number. * The order of keys in a JSON document is undefined. It is therefore not possible to have a consistent ordering of the attributes or variables. + * The JSON standard does not allow the special floating-point values `NaN`, `+Inf` and `-Inf` (see https://github.com/capnproto/capnproto/issues/261). 
diff --git a/src/ZarrDatasets.jl b/src/ZarrDatasets.jl
index 56a4c3d..c7a3221 100644
--- a/src/ZarrDatasets.jl
+++ b/src/ZarrDatasets.jl
@@ -13,6 +13,9 @@ import CommonDataModel:
     attrib,
     attribnames,
     dataset,
+    defAttrib,
+    defVar,
+    defDim,
     dim,
     dimnames,
     iswritable,
@@ -29,6 +32,7 @@ import DiskArrays:
 import CommonDataModel as CDM
 using DataStructures
 using Zarr
+import JSON
 
 include("types.jl")
 include("dataset.jl")
diff --git a/src/dataset.jl b/src/dataset.jl
index d692400..c2f5ef6 100644
--- a/src/dataset.jl
+++ b/src/dataset.jl
@@ -8,13 +8,7 @@ function CDM.variable(ds::ZarrDataset,varname::SymbolOrString)
     ZarrVariable{eltype(zarray),ndims(zarray),typeof(zarray),typeof(ds)}(zarray,ds)
 end
 
-CDM.dimnames(ds::ZarrDataset) = Tuple(
-    sort(
-        unique(
-            reduce(vcat,
-                   (collect(dimnames(variable(ds,vn))) for vn in keys(ds)),
-                   init = String[]
-                   ))))
+CDM.dimnames(ds::ZarrDataset) = Tuple(String.(keys(ds.dimensions)))
 
 # function CDM.unlimited(ds::ZarrDataset)
 #     ul = ds.unlimited
@@ -33,17 +27,17 @@ CDM.dimnames(ds::ZarrDataset) = Tuple(
 #     return nothing
 # end
 
-function CDM.dim(ds::ZarrDataset,dimname::SymbolOrString)
+CDM.dim(ds::ZarrDataset,dimname::SymbolOrString) = ds.dimensions[Symbol(dimname)]
 
-    for vn in keys(ds)
-        v = variable(ds,vn)
-        dn = dimnames(v)
-        i = findfirst(==(dimname),dn)
-        if !isnothing(i)
-            return size(v,i)
-        end
-    end
-    error("dimension $dimname not found")
+# Declare the dimension `dimname` with length `dimlen`. The dimension is only
+# recorded in `ds.dimensions`; this function writes nothing to the store.
+function CDM.defDim(ds::ZarrDataset,dimname::SymbolOrString,dimlen)
+    dn = Symbol(dimname)
+    # explicit check instead of `@assert`, which is not meant for input validation
+    if haskey(ds.dimensions,dn)
+        throw(ArgumentError("dimension $dimname is already defined"))
+    end
+    ds.dimensions[dn] = dimlen
 end
 
 CDM.varnames(ds::ZarrDataset) = keys(ds.zgroup.arrays)
@@ -51,13 +45,27 @@ CDM.varnames(ds::ZarrDataset) = keys(ds.zgroup.arrays)
 
 CDM.attribnames(ds::ZarrDataset) = keys(ds.zgroup.attrs)
 CDM.attrib(ds::ZarrDataset,name::SymbolOrString) = ds.zgroup.attrs[String(name)]
+# Set the global attribute `name` to `value` and persist all group attributes
+# by rewriting the `.zattrs` entry of the group as JSON.
+function CDM.defAttrib(ds::ZarrDataset,name::SymbolOrString,value)
+    # explicit check instead of `@assert`, which is not meant for input validation
+    if !iswritable(ds)
+        throw(ArgumentError("dataset is read-only; cannot define attribute $name"))
+    end
+    ds.zgroup.attrs[String(name)] = value
+
+    storage = ds.zgroup.storage
+    io = IOBuffer()
+    JSON.print(io, ds.zgroup.attrs)
+    storage[ds.zgroup.path,".zattrs"] = take!(io)
+end
 
 CDM.groupnames(ds::ZarrDataset) = keys(ds.zgroup.groups)
 CDM.group(ds::ZarrDataset,name::SymbolOrString) = ZarrDataset(ds.zgroup.groups,String(name),ds)
 
 CDM.parentdataset(ds::ZarrDataset) = ds.parentdataset
 
-CDM.iswritable(ds::ZarrDataset) = false
+CDM.iswritable(ds::ZarrDataset) = ds.iswritable
 
 CDM.maskingvalue(ds::ZarrDataset) = ds.maskingvalue
 
@@ -108,12 +116,49 @@ end # implicit call to close(ds)
 function ZarrDataset(url::AbstractString,mode = "r";
                      parentdataset = nothing,
                      _omitcode = 404,
-                     maskingvalue = missing)
-    ds = Zarr.zopen(url,mode)
-    if ds.storage isa Zarr.HTTPStore
-        Zarr.missing_chunk_return_code!(ds.storage,_omitcode)
+                     maskingvalue = missing,
+                     attrib = Dict(),
+                     )
+
+    dimensions = OrderedDict{Symbol,Int}()
+    iswritable = false
+
+    if mode == "r"
+        zg = Zarr.zopen(url,mode)
+        if (zg.storage isa Zarr.HTTPStore) ||
+            (zg.storage isa Zarr.ConsolidatedStore{Zarr.HTTPStore})
+            @debug "omit chunks on HTTP error" _omitcode
+            Zarr.missing_chunk_return_code!(zg.storage,_omitcode)
+        end
+
+        # derive the dimensions from the `_ARRAY_DIMENSIONS` attribute of every array
+        for (varname,zarray) in zg.arrays
+            for (dimname,dimlen) in zip(reverse(zarray.attrs["_ARRAY_DIMENSIONS"]),size(zarray))
+
+                dn = Symbol(dimname)
+                if haskey(dimensions,dn)
+                    if dimensions[dn] != dimlen
+                        throw(DimensionMismatch(
+                            "dimension $dimname of variable $varname has length $dimlen, " *
+                            "expected $(dimensions[dn])"))
+                    end
+                else
+                    dimensions[dn] = dimlen
+                end
+            end
+        end
+    elseif mode == "c"
+        store = Zarr.DirectoryStore(url)
+        # `Dict{String,Any}` so that attributes of any type can be added later,
+        # even when `attrib` is e.g. a `Dict{String,String}`
+        zg = zgroup(store, "",
+                    attrs = Dict{String,Any}(String(k) => v for (k,v) in attrib))
+        iswritable = true
+    else
+        # fail early with a clear message; `zg` is only defined for the modes above
+        throw(ArgumentError("unsupported mode \"$mode\"; use \"r\" (read) or \"c\" (create)"))
     end
-    ZarrDataset(ds,parentdataset,maskingvalue)
+    ZarrDataset(zg,parentdataset,dimensions,iswritable,maskingvalue)
 end
 
 
diff --git a/src/types.jl b/src/types.jl
index 05e638d..fff77e3 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -7,5 +7,7 @@ end
 struct ZarrDataset{TZ,TP,Tmaskingvalue} <: CDM.AbstractDataset
     zgroup::TZ
     parentdataset::TP
+    dimensions::OrderedDict{Symbol,Int}
+    iswritable::Bool
     maskingvalue::Tmaskingvalue
 end
diff --git a/src/variable.jl b/src/variable.jl
index aaef645..c430431 100644
--- a/src/variable.jl
+++ b/src/variable.jl
@@ -10,8 +10,41 @@ CDM.name(v::ZarrVariable) = Zarr.zname(v.zarray)
 CDM.dimnames(v::ZarrVariable) = Tuple(reverse(v.zarray.attrs["_ARRAY_DIMENSIONS"]))
 CDM.dataset(v::ZarrVariable) = v.parentdataset
 
-CDM.attribnames(v::ZarrVariable) = filter(!=("_ARRAY_DIMENSIONS"),keys(v.zarray.attrs))
-CDM.attrib(v::ZarrVariable,name::SymbolOrString) = v.zarray.attrs[String(name)]
+# The `fill_value` of the array metadata (when it is not `nothing`) is exposed
+# as the attribute `_FillValue`; `_ARRAY_DIMENSIONS` is hidden.
+function CDM.attribnames(v::ZarrVariable)
+    names = filter(!=("_ARRAY_DIMENSIONS"),keys(v.zarray.attrs))
+    if !isnothing(v.zarray.metadata.fill_value)
+        push!(names,"_FillValue")
+    end
+    return names
+end
+
+function CDM.attrib(v::ZarrVariable,name::SymbolOrString)
+    if String(name) == "_FillValue" && !isnothing(v.zarray.metadata.fill_value)
+        return v.zarray.metadata.fill_value
+    end
+    return v.zarray.attrs[String(name)]
+end
+
+# Set the attribute `name` of the variable `v` to `value` and rewrite the
+# `.zattrs` entry of the array as JSON. `_FillValue` is rejected because it is
+# read from the `fill_value` of the array metadata, not from the attributes.
+function CDM.defAttrib(v::ZarrVariable,name::SymbolOrString,value)
+    if !iswritable(dataset(v))
+        throw(ArgumentError("dataset is read-only; cannot define attribute $name"))
+    end
+    if String(name) == "_FillValue"
+        throw(ArgumentError("the attribute _FillValue cannot be set with defAttrib"))
+    end
+
+    v.zarray.attrs[String(name)] = value
+
+    storage = v.zarray.storage
+    io = IOBuffer()
+    JSON.print(io, v.zarray.attrs)
+    storage[v.zarray.path,".zattrs"] = take!(io)
+end
 
 # DiskArray methods
 
@@ -19,3 +52,37 @@ eachchunk(v::ZarrVariable) = eachchunk(v.zarray)
 haschunks(v::ZarrVariable) = haschunks(v.zarray)
 eachchunk(v::CFVariable{T,N,<:ZarrVariable}) where {T,N} = eachchunk(v.var)
 haschunks(v::CFVariable{T,N,<:ZarrVariable}) where {T,N} = haschunks(v.var)
+
+
+# Create the variable `name` with element type `vtype` along the previously
+# defined dimensions `dimensionnames`. Unless `chunksizes` is given, the chunk
+# size equals the full array size. Remaining keyword arguments are passed on
+# to `zcreate`.
+function CDM.defVar(ds::ZarrDataset,name::SymbolOrString,vtype::DataType,dimensionnames; chunksizes=nothing, attrib = Dict(), kwargs...)
+    if !iswritable(ds)
+        throw(ArgumentError("dataset is read-only; cannot define variable $name"))
+    end
+
+    # `Dict{String,Any}` is required to add `_ARRAY_DIMENSIONS` when `attrib`
+    # has a narrower value type (e.g. `Dict("units" => "m/s")`)
+    _attrib = Dict{String,Any}(String(k) => v for (k,v) in attrib)
+    _attrib["_ARRAY_DIMENSIONS"] = reverse(dimensionnames)
+
+    _size = ntuple(length(dimensionnames)) do i
+        ds.dimensions[Symbol(dimensionnames[i])]
+    end
+
+    if isnothing(chunksizes)
+        chunksizes = _size
+    end
+    # normalize `name` to a `String` (the signature also accepts a `Symbol`)
+    zarray = zcreate(
+        vtype, ds.zgroup, String(name), _size...;
+        chunks = chunksizes,
+        attrs = _attrib,
+        kwargs...
+    )
+
+    return ZarrVariable{vtype,ndims(zarray),typeof(zarray),typeof(ds)}(
+        zarray,ds)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 082e1fc..1f79678 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,4 +4,5 @@ using ZarrDatasets
 @testset "ZarrDatasets.jl" begin
     include("test_cdm.jl")
     include("test_multifile.jl")
+    include("test_write.jl")
 end
diff --git a/test/test_cdm.jl b/test/test_cdm.jl
index 7f6274f..7b8956a 100644
--- a/test/test_cdm.jl
+++ b/test/test_cdm.jl
@@ -1,4 +1,9 @@
-using CommonDataModel: iswritable, attribnames, parentdataset, load!, dataset
+using CommonDataModel:
+    attribnames,
+    dataset,
+    iswritable,
+    load!,
+    parentdataset
 using Dates
 using DiskArrays
 using NCDatasets
diff --git a/test/test_write.jl b/test/test_write.jl
new file mode 100644
index 0000000..4a0f4bf
--- /dev/null
+++ b/test/test_write.jl
@@ -0,0 +1,52 @@
+using ZarrDatasets
+using ZarrDatasets:
+    defDim,
+    defVar,
+    defAttrib
+using Zarr
+using DataStructures
+
+data = rand(Int32,3,5)
+
+fname = tempname()
+mkdir(fname)
+gattrib = Dict{String,Any}("title" => "this is the title")
+ds = ZarrDataset(fname,"c",attrib = gattrib)
+
+defDim(ds,"lon",3)
+defDim(ds,"lat",5)
+
+attrib = Dict{String,Any}(
+    "units" => "m/s",
+    "long_name" => "test",
+)
+
+
+varname = "var2"
+dimensionnames = ("lon","lat")
+vtype = Int32
+
+zv = defVar(ds,varname,vtype,dimensionnames, attrib = attrib)
+zv[:,:] = data
+zv.attrib["lala"] = 12
+zv.attrib["standard_name"] = "test"
+ds.attrib["history"] = "test"
+close(ds)
+
+ds = ZarrDataset(fname)
+
+zv = ds[varname]
+
+@test zv.attrib["lala"] == 12
+@test zv.attrib["standard_name"] == "test"
+@test ds.attrib["history"] == "test"
+
+@test zv[:,:] == data
+
+io = IOBuffer()
+show(io,ds)
+str = String(take!(io))
+@test occursin("Global",str)
+
+@test_throws ArgumentError ZarrDataset(fname,"x")
+@test_throws ArgumentError defDim(ds,"lon",3)