Commit 71fd3d8

Merge pull request #23 from psrenergy/gb/implement_to_df
Add a fast way to transform a Quiver file into a DataFrame directly
guilhermebodin authored Sep 23, 2024
2 parents f9112c2 + 3e2eab8 commit 71fd3d8
Showing 5 changed files with 154 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Quiver"
uuid = "cdbb3f72-2527-4dbd-9d0e-93533a5519ac"
authors = ["raphasampaio", "guilhermebodin"]
version = "0.1.5"
version = "0.1.6"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
18 changes: 13 additions & 5 deletions README.md
@@ -6,10 +6,11 @@ Quiver is not the fastest data-structure for time series data, but it is designe
is to have a set of dimensions that can be used to index the data and a set of values from the time series attributes. This allows for a
table-like data-structure that can be used to store time series data.

Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata.
CSV files are implemented in a way that the first few lines are used to store the metadata and the rest of the file is used to store the data, i.e.
Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata. The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of dimensions, the maximum value of each dimension, the time dimension and the version of the file.

```csv
The metadata is always stored in a TOML file in the following format:

```toml
version = 1
dimensions = ["stage", "scenario", "block"]
dimension_size = [10, 12, 744]
@@ -18,11 +19,18 @@ time_dimension = "stage"
frequency = "month"
unit = ""
labels = ["agent_1", "agent_2", "agent_3"]
---
```

And the data is stored in a CSV or binary file that contains the values of the time series. The CSV format is as follows:
```csv
stage,scenario,block,agent_1,agent_2,agent_3
1,1,1,1.0,1.0,1.0
1,1,2,1.0,1.0,1.0
1,1,3,1.0,1.0,1.0
```

The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of dimensions, the maximum value of each dimension, the time dimension and the version of the file.
## Installation

```julia
pkg> add Quiver
```
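A file in this layout can be produced with `Quiver.array_to_file`; below is a minimal sketch based on the keyword arguments exercised in this commit's tests (the file name, sizes, and values are illustrative):

```julia
using Quiver, Dates

# Illustrative sizes: 2 stages, 2 scenarios, 3 blocks, 3 agents.
# The array layout is (label, block, scenario, stage), matching the tests in this commit.
data = rand(3, 3, 2, 2)

Quiver.array_to_file(
    "example",   # writes example.toml plus the data file (example.csv for the CSV implementation)
    data,
    Quiver.csv;
    dimensions = ["stage", "scenario", "block"],
    labels = ["agent_1", "agent_2", "agent_3"],
    time_dimension = "stage",
    dimension_size = [2, 2, 3],
    initial_date = DateTime(2006, 1, 1),
    unit = "",
)
```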
8 changes: 6 additions & 2 deletions src/metadata.jl
@@ -57,8 +57,8 @@ function Metadata(;
    return metadata
end

function to_toml(metadata::Metadata, filename::String)
    dict_metadata = OrderedDict(
function to_ordered_dict(metadata::Metadata)
    return OrderedDict(
        "version" => metadata.version,
        "dimensions" => String.(metadata.dimensions),
        "dimension_size" => metadata.dimension_size,
@@ -68,6 +68,10 @@ function to_toml(metadata::Metadata, filename::String)
        "unit" => metadata.unit,
        "labels" => metadata.labels,
    )
end

function to_toml(metadata::Metadata, filename::String)
    dict_metadata = to_ordered_dict(metadata)
    open(filename, "w") do io
        TOML.print(io, dict_metadata)
    end
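Splitting `to_ordered_dict` out of `to_toml` lets the same dictionary back both the TOML writer and the DataFrame-level metadata used by `file_to_df` below. A small sketch, assuming `m` is an already-constructed `Metadata` value:

```julia
# `m` is a hypothetical Metadata value built elsewhere.
dict = Quiver.to_ordered_dict(m)    # OrderedDict of version, dimensions, labels, ...
Quiver.to_toml(m, "example.toml")   # writes that same dictionary via TOML.print
```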
78 changes: 77 additions & 1 deletion src/reader.jl
@@ -80,6 +80,14 @@ function _move_data_from_buffer_cache_to_data!(reader::Reader)
    return nothing
end

"""
goto!(
reader::Reader;
dims...
)
Move the reader to the specified dimensions and return the data.
"""
function goto!(reader::Reader; dims...)
validate_dimensions(reader.metadata, dims...)
_build_dimension_to_read!(reader; dims...)
Expand All @@ -89,12 +97,22 @@ function goto!(reader::Reader; dims...)
return reader.data
end

"""
next_dimension!(reader::Reader)
Move the reader to the next dimension and return the data.
"""
function next_dimension!(reader::Reader)
_quiver_next_dimension!(reader)
_move_data_from_buffer_cache_to_data!(reader)
return reader.data
end

"""
max_index(reader::Reader, dimension::String)
Return the maximum index of the specified dimension.
"""
function max_index(reader::Reader, dimension::String)
symbol_dim = Symbol(dimension)
index = findfirst(isequal(symbol_dim), reader.metadata.dimensions)
Expand All @@ -104,6 +122,11 @@ function max_index(reader::Reader, dimension::String)
return reader.metadata.dimension_size[index]
end

"""
close!(reader::Reader)
Close the reader.
"""
function close!(reader::Reader)
_quiver_close!(reader)
return nothing
@@ -126,7 +149,6 @@ function file_to_array(
    reader = Reader{I}(
        filename;
        labels_to_read,
        carrousel = false, # carrousel does not make sense in this implementation
    )

    metadata = reader.metadata
@@ -147,4 +169,58 @@ Quiver.close!(reader)
    Quiver.close!(reader)

    return data, metadata
end

"""
file_to_df(
filename::String,
implementation::Type{I};
labels_to_read::Vector{String} = String[],
) where {I <: Implementation}
Reads a file and returns the data and metadata as a DataFrame.
"""
function file_to_df(
    filename::String,
    implementation::Type{I};
    labels_to_read::Vector{String} = String[],
) where {I <: Implementation}
    reader = Reader{I}(
        filename;
        labels_to_read,
    )

    metadata = reader.metadata
    dimension_names = reverse(metadata.dimensions)
    dimension_sizes = reverse(metadata.dimension_size)

    df = DataFrame()

    # Add all columns to the DataFrame
    for dim in metadata.dimensions
        DataFrames.insertcols!(df, dim => Int[])
    end
    for label in reader.labels_to_read
        DataFrames.insertcols!(df, label => Float32[])
    end

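    # Note: the reversal above makes Iterators.product (which varies its first argument fastest)
    # emit the last dimension fastest, so rows come out ordered by the leading dimensions;
    # reverse(dims) below restores the original dimension order for each row.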
    for dims in Iterators.product([1:size for size in dimension_sizes]...)
        dim_kwargs = OrderedDict(Symbol.(dimension_names) .=> dims)
        Quiver.goto!(reader; dim_kwargs...)
        if all(isnan.(reader.data))
            continue
        end
        # Construct the data frame row by row
        push!(df, [reverse(dims)...; reader.data...])
    end

    # Add metadata to the DataFrame
    ordered_dict_metadata = to_ordered_dict(metadata)
    for (k, v) in ordered_dict_metadata
        DataFrames.metadata!(df, k, v)
    end

    Quiver.close!(reader)

    return df
end
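A minimal usage sketch of the new function, mirroring the test added below; the file name and the choice of the `Quiver.csv` implementation are illustrative:

```julia
using Quiver, DataFrames

# Assumes "my_file.toml" and the matching data file were written earlier,
# e.g. with Quiver.array_to_file (the path is hypothetical).
df = Quiver.file_to_df("my_file", Quiver.csv)

first(df, 3)                       # dimension columns first, then one Float32 column per label
DataFrames.metadata(df, "labels")  # label names carried over from the TOML metadata
```

The DataFrame carries the Quiver metadata as table-level metadata, so downstream code can recover the unit, labels, and dimensions without reopening the TOML file.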
57 changes: 57 additions & 0 deletions test/test_read_write.jl
Expand Up @@ -2,6 +2,7 @@ module TestWriter

using Test
using Quiver
using DataFrames
using Dates

function read_write_1(impl)
@@ -1114,6 +1115,61 @@ function read_file_to_array(impl)
    for i in eachindex(data)
        @test data[i] == data_read[i]
    end

    rm("$filename.$(Quiver.file_extension(impl))")
    rm("$filename.toml")
end

function read_file_to_df(impl)
    filename = joinpath(@__DIR__, "test_read_file_to_df")

    initial_date = DateTime(2006, 1, 1)
    num_stages = 4
    dates = collect(initial_date:Dates.Month(1):initial_date + Dates.Month(num_stages - 1))
    num_scenarios = 3
    num_blocks_per_stage = Int32.(Dates.daysinmonth.(dates) .* 24)
    num_time_series = 3

    dimensions = ["stage", "scenario", "block"]
    labels = ["agent_$i" for i in 1:num_time_series]
    time_dimension = "stage"
    dimension_size = [num_stages, num_scenarios, maximum(num_blocks_per_stage)]

    data = zeros(num_time_series, maximum(num_blocks_per_stage), num_scenarios, num_stages)
    for stage in 1:num_stages
        for scenario in 1:num_scenarios
            for block in 1:num_blocks_per_stage[stage]
                for i in 1:num_time_series
                    data[i, block, scenario, stage] = stage + scenario + block + i
                end
            end
        end
    end

    Quiver.array_to_file(
        filename,
        data,
        impl;
        dimensions,
        labels,
        time_dimension,
        dimension_size,
        initial_date,
        unit = " - "
    )

    df = Quiver.file_to_df(filename, impl)

    # This might be inaccurate; if it fails, these tests can be removed
    @test size(df, 1) == 8928
    @test size(df, 2) == 6

    @test DataFrames.metadata(df, "time_dimension") == "stage"
    @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"]
    @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"]

    rm("$filename.$(Quiver.file_extension(impl))")
    rm("$filename.toml")
end

function test_read_write_implementations()
@@ -1132,6 +1188,7 @@ function test_read_write_implementations()
        read_filtering_labels(impl)
        read_write_out_of_order_kwargs(impl)
        read_file_to_array(impl)
        read_file_to_df(impl)
        if impl == Quiver.csv
            read_write_goto_csv_1()
            read_write_goto_csv_2()

2 comments on commit 71fd3d8

@guilhermebodin (Member Author)

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/115785

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.6 -m "<description of version>" 71fd3d85ba91402ef484825ccfe66161eff056cd
git push origin v0.1.6
