From 803a2321713064eff83bc5f0a8062b8702891b1f Mon Sep 17 00:00:00 2001 From: guilhermebodin Date: Mon, 23 Sep 2024 17:22:02 -0300 Subject: [PATCH 1/2] add a fast way to transform a Quiver file into datafram directly --- Project.toml | 2 +- src/metadata.jl | 8 +++++-- src/reader.jl | 46 ++++++++++++++++++++++++++++++++++++- test/test_read_write.jl | 51 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 13a5944..a285802 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Quiver" uuid = "cdbb3f72-2527-4dbd-9d0e-93533a5519ac" authors = ["raphasampaio", "guilhermebodin"] -version = "0.1.5" +version = "0.1.6" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/src/metadata.jl b/src/metadata.jl index 2f53abb..f60537f 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -57,8 +57,8 @@ function Metadata(; return metadata end -function to_toml(metadata::Metadata, filename::String) - dict_metadata = OrderedDict( +function to_ordered_dict(metadata::Metadata) + return OrderedDict( "version" => metadata.version, "dimensions" => String.(metadata.dimensions), "dimension_size" => metadata.dimension_size, @@ -68,6 +68,10 @@ function to_toml(metadata::Metadata, filename::String) "unit" => metadata.unit, "labels" => metadata.labels, ) +end + +function to_toml(metadata::Metadata, filename::String) + dict_metadata = to_ordered_dict(metadata) open(filename, "w") do io TOML.print(io, dict_metadata) end diff --git a/src/reader.jl b/src/reader.jl index 0328568..aec83ff 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -126,7 +126,6 @@ function file_to_array( reader = Reader{I}( filename; labels_to_read, - carrousel = false, # carrousel does not make sense in this implemetations ) metadata = reader.metadata @@ -147,4 +146,49 @@ function file_to_array( Quiver.close!(reader) return data, metadata +end + +function file_to_df( + filename::String, + implementation::Type{I}; + 
labels_to_read::Vector{String} = String[], +) where {I <: Implementation} + reader = Reader{I}( + filename; + labels_to_read, + ) + + metadata = reader.metadata + dimension_names = reverse(metadata.dimensions) + dimension_sizes = reverse(metadata.dimension_size) + + df = DataFrame() + + # Add all columns to the DataFrame + for dim in metadata.dimensions + DataFrames.insertcols!(df, dim => Int[]) + end + for label in reader.labels_to_read + DataFrames.insertcols!(df, label => Float32[]) + end + + for dims in Iterators.product([1:size for size in dimension_sizes]...) + dim_kwargs = OrderedDict(Symbol.(dimension_names) .=> dims) + Quiver.goto!(reader; dim_kwargs...) + if all(isnan.(reader.data)) + continue + end + # Construct the data frame row by row + push!(df, [reverse(dims)...; reader.data...]) + end + + # Add metadata to DataFrame + orderec_dict_metadata = to_ordered_dict(metadata) + for (k, v) in orderec_dict_metadata + DataFrames.metadata!(df, k, v) + end + + Quiver.close!(reader) + + return df end \ No newline at end of file diff --git a/test/test_read_write.jl b/test/test_read_write.jl index 1ea18fb..21b4b4c 100644 --- a/test/test_read_write.jl +++ b/test/test_read_write.jl @@ -2,6 +2,7 @@ module TestWriter using Test using Quiver +using DataFrames using Dates function read_write_1(impl) @@ -1116,6 +1117,55 @@ function read_file_to_array(impl) end end +function read_file_to_df(impl) + filename = joinpath(@__DIR__, "test_read_file_to_df") + + initial_date = DateTime(2006, 1, 1) + num_stages = 4 + dates = collect(initial_date:Dates.Month(1):initial_date + Dates.Month(num_stages - 1)) + num_scenarios = 3 + num_blocks_per_stage = Int32.(Dates.daysinmonth.(dates) .* 24) + num_time_series = 3 + + dimensions = ["stage", "scenario", "block"] + labels = ["agent_$i" for i in 1:num_time_series] + time_dimension = "stage" + dimension_size = [num_stages, num_scenarios, maximum(num_blocks_per_stage)] + + data = zeros(num_time_series, maximum(num_blocks_per_stage), 
num_scenarios, num_stages) + for stage in 1:num_stages + for scenario in 1:num_scenarios + for block in 1:num_blocks_per_stage[stage] + for i in 1:num_time_series + data[i, block, scenario, stage] = stage + scenario + block + i + end + end + end + end + + Quiver.array_to_file( + filename, + data, + impl; + dimensions, + labels, + time_dimension, + dimension_size, + initial_date, + unit = " - " + ) + + df = Quiver.file_to_df(filename, impl) + + # This might be innacurate, if it fails these tests can be removed + @test size(df, 1) == 8928 + @test size(df, 2) == 6 + + @test DataFrames.metadata(df, "time_dimension") == "stage" + @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"] + @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"] +end + function test_read_write_implementations() for impl in Quiver.implementations() @testset "Read and Write $(impl)" begin @@ -1132,6 +1182,7 @@ function test_read_write_implementations() read_filtering_labels(impl) read_write_out_of_order_kwargs(impl) read_file_to_array(impl) + read_file_to_df(impl) if impl == Quiver.csv read_write_goto_csv_1() read_write_goto_csv_2() From 3e2eab8106cfdc6a03049136ccb5e8547967a0eb Mon Sep 17 00:00:00 2001 From: guilhermebodin Date: Mon, 23 Sep 2024 17:35:43 -0300 Subject: [PATCH 2/2] Add more comments in README --- README.md | 18 +++++++++++++----- src/reader.jl | 32 ++++++++++++++++++++++++++++++++ test/test_read_write.jl | 6 ++++++ 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 83c6817..bed20f0 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ Quiver is not the fastest data-structure for time series data, but it is designe is to have a set of dimensions that can be used to index the data and a set of values from the time serires attributes. This allows to have a table-like data-structure that can be used to store time series data. 
-Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata. -CSV files are implemented in a way that the first few lines are used to store the metadata and the rest of the file is used to store the data., i.e. +Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata. The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of the dimension, the maximum value of each dimension, the time dimension and the version of the file. -```csv +The metadata is always stored in a TOML file in the following format: + +```toml version = 1 dimensions = ["stage", "scenario", "block"] dimension_size = [10, 12, 744] @@ -18,11 +19,18 @@ time_dimension = "stage" frequency = "month" unit = "" labels = ["agent_1", "agent_2", "agent_3"] ---- +``` + +And the data is stored in a csv or binary file that contains the values of the time series. The csv format is as follows: +```csv stage,scenario,block,agent_1,agent_2,agent_3 1,1,1,1.0,1.0,1.0 1,1,2,1.0,1.0,1.0 1,1,3,1.0,1.0,1.0 ``` -The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of the dimension, the maximum value of each dimension, the time dimension and the version of the file. \ No newline at end of file +## Installation + +```julia +pkg> add Quiver +``` \ No newline at end of file diff --git a/src/reader.jl b/src/reader.jl index aec83ff..1cd35ce 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -80,6 +80,14 @@ function _move_data_from_buffer_cache_to_data!(reader::Reader) return nothing end +""" + goto!( + reader::Reader; + dims... + ) + +Move the reader to the specified dimensions and return the data. +""" function goto!(reader::Reader; dims...) validate_dimensions(reader.metadata, dims...) _build_dimension_to_read!(reader; dims...) 
@@ -89,12 +97,22 @@ function goto!(reader::Reader; dims...) return reader.data end +""" + next_dimension!(reader::Reader) + +Move the reader to the next dimension and return the data. +""" function next_dimension!(reader::Reader) _quiver_next_dimension!(reader) _move_data_from_buffer_cache_to_data!(reader) return reader.data end +""" + max_index(reader::Reader, dimension::String) + +Return the maximum index of the specified dimension. +""" function max_index(reader::Reader, dimension::String) symbol_dim = Symbol(dimension) index = findfirst(isequal(symbol_dim), reader.metadata.dimensions) @@ -104,6 +122,11 @@ function max_index(reader::Reader, dimension::String) return reader.metadata.dimension_size[index] end +""" + close!(reader::Reader) + +Close the reader. +""" function close!(reader::Reader) _quiver_close!(reader) return nothing @@ -148,6 +171,15 @@ function file_to_array( return data, metadata end +""" + file_to_df( + filename::String, + implementation::Type{I}; + labels_to_read::Vector{String} = String[], + ) where {I <: Implementation} + +Reads a file and returns the data and metadata as a DataFrame. +""" function file_to_df( filename::String, implementation::Type{I}; diff --git a/test/test_read_write.jl b/test/test_read_write.jl index 21b4b4c..21e05f1 100644 --- a/test/test_read_write.jl +++ b/test/test_read_write.jl @@ -1115,6 +1115,9 @@ function read_file_to_array(impl) for i in eachindex(data) @test data[i] == data_read[i] end + + rm("$filename.$(Quiver.file_extension(impl))") + rm("$filename.toml") end function read_file_to_df(impl) @@ -1164,6 +1167,9 @@ function read_file_to_df(impl) @test DataFrames.metadata(df, "time_dimension") == "stage" @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"] @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"] + + rm("$filename.$(Quiver.file_extension(impl))") + rm("$filename.toml") end function test_read_write_implementations()