From 803a2321713064eff83bc5f0a8062b8702891b1f Mon Sep 17 00:00:00 2001
From: guilhermebodin <guilherme.b.moraes@gmail.com>
Date: Mon, 23 Sep 2024 17:22:02 -0300
Subject: [PATCH 1/2] add a fast way to transform a Quiver file into a DataFrame
 directly

---
 Project.toml            |  2 +-
 src/metadata.jl         |  8 +++++--
 src/reader.jl           | 46 ++++++++++++++++++++++++++++++++++++-
 test/test_read_write.jl | 51 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 13a5944..a285802 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Quiver"
 uuid = "cdbb3f72-2527-4dbd-9d0e-93533a5519ac"
 authors = ["raphasampaio", "guilhermebodin"]
-version = "0.1.5"
+version = "0.1.6"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
diff --git a/src/metadata.jl b/src/metadata.jl
index 2f53abb..f60537f 100644
--- a/src/metadata.jl
+++ b/src/metadata.jl
@@ -57,8 +57,8 @@ function Metadata(;
     return metadata
 end
 
-function to_toml(metadata::Metadata, filename::String)
-    dict_metadata = OrderedDict(
+function to_ordered_dict(metadata::Metadata)
+    return OrderedDict(
         "version" => metadata.version,
         "dimensions" => String.(metadata.dimensions),
         "dimension_size" => metadata.dimension_size,
@@ -68,6 +68,10 @@ function to_toml(metadata::Metadata, filename::String)
         "unit" => metadata.unit,
         "labels" => metadata.labels,
     )
+end
+
+function to_toml(metadata::Metadata, filename::String)
+    dict_metadata = to_ordered_dict(metadata)
     open(filename, "w") do io
         TOML.print(io, dict_metadata)
     end
diff --git a/src/reader.jl b/src/reader.jl
index 0328568..aec83ff 100644
--- a/src/reader.jl
+++ b/src/reader.jl
@@ -126,7 +126,6 @@ function file_to_array(
     reader = Reader{I}(
         filename;
         labels_to_read,
-        carrousel = false, # carrousel does not make sense in this implemetations
     )
 
     metadata = reader.metadata
@@ -147,4 +146,49 @@ function file_to_array(
     Quiver.close!(reader)
 
     return data, metadata
+end
+
+function file_to_df(
+    filename::String,
+    implementation::Type{I};
+    labels_to_read::Vector{String} = String[],
+) where {I <: Implementation}
+    reader = Reader{I}(filename; labels_to_read)
+    df = DataFrame()
+
+    # try/finally guarantees the reader is closed even if reading a row throws
+    try
+        metadata = reader.metadata
+
+        # One Int column per dimension followed by one Float32 column per label
+        for dim in metadata.dimensions
+            DataFrames.insertcols!(df, dim => Int[])
+        end
+        for label in reader.labels_to_read
+            DataFrames.insertcols!(df, label => Float32[])
+        end
+
+        # Iterators.product varies its first range fastest, so names and sizes are
+        # reversed to make the last dimension of the file the fastest varying one
+        dimension_names = Symbol.(reverse(metadata.dimensions))
+        dimension_ranges = [1:dim_size for dim_size in reverse(metadata.dimension_size)]
+        for dims in Iterators.product(dimension_ranges...)
+            Quiver.goto!(reader; OrderedDict(dimension_names .=> dims)...)
+            # Skip positions where every value that was read is NaN
+            if all(isnan, reader.data)
+                continue
+            end
+            # reverse(dims) restores the dimension order used by the columns
+            push!(df, [reverse(dims)...; reader.data...])
+        end
+
+        # Attach every metadata entry to the DataFrame under its metadata key
+        for (key, value) in to_ordered_dict(metadata)
+            DataFrames.metadata!(df, key, value)
+        end
+    finally
+        Quiver.close!(reader)
+    end
+
+    return df
 end
\ No newline at end of file
diff --git a/test/test_read_write.jl b/test/test_read_write.jl
index 1ea18fb..21b4b4c 100644
--- a/test/test_read_write.jl
+++ b/test/test_read_write.jl
@@ -2,6 +2,7 @@ module TestWriter
 
 using Test
 using Quiver
+using DataFrames
 using Dates
 
 function read_write_1(impl)
@@ -1116,6 +1117,55 @@ function read_file_to_array(impl)
     end
 end
 
+function read_file_to_df(impl)
+    # Writes a 3-dimensional array with array_to_file and reads it back with file_to_df
+    filename = joinpath(@__DIR__, "test_read_file_to_df")
+
+    initial_date = DateTime(2006, 1, 1)
+    num_stages = 4
+    dates = collect(initial_date:Dates.Month(1):initial_date + Dates.Month(num_stages - 1))
+    num_scenarios = 3
+    num_blocks_per_stage = Int32.(Dates.daysinmonth.(dates) .* 24)
+    num_time_series = 3
+
+    dimensions = ["stage", "scenario", "block"]
+    labels = ["agent_$i" for i in 1:num_time_series]
+    time_dimension = "stage"
+    dimension_size = [num_stages, num_scenarios, maximum(num_blocks_per_stage)]
+
+    # Blocks beyond the number of hours of a stage keep their initial value of zero
+    data = zeros(num_time_series, maximum(num_blocks_per_stage), num_scenarios, num_stages)
+    for stage in 1:num_stages, scenario in 1:num_scenarios
+        for block in 1:num_blocks_per_stage[stage], i in 1:num_time_series
+            data[i, block, scenario, stage] = stage + scenario + block + i
+        end
+    end
+
+    Quiver.array_to_file(
+        filename,
+        data,
+        impl;
+        dimensions,
+        labels,
+        time_dimension,
+        dimension_size,
+        initial_date,
+        unit = " - "
+    )
+
+    df = Quiver.file_to_df(filename, impl)
+
+    # file_to_df only drops rows that are entirely NaN and the data holds no NaN,
+    # so there is one row per (stage, scenario, block) combination and
+    # one column per dimension plus one column per label
+    @test size(df, 1) == num_stages * num_scenarios * maximum(num_blocks_per_stage)
+    @test size(df, 2) == length(dimensions) + length(labels)
+
+    @test DataFrames.metadata(df, "time_dimension") == "stage"
+    @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"]
+    @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"]
+end
+
 function test_read_write_implementations()
     for impl in Quiver.implementations()
         @testset "Read and Write $(impl)" begin
@@ -1132,6 +1182,7 @@ function test_read_write_implementations()
             read_filtering_labels(impl)
             read_write_out_of_order_kwargs(impl)
             read_file_to_array(impl)
+            read_file_to_df(impl)
             if impl == Quiver.csv
                 read_write_goto_csv_1()
                 read_write_goto_csv_2()

From 3e2eab8106cfdc6a03049136ccb5e8547967a0eb Mon Sep 17 00:00:00 2001
From: guilhermebodin <guilherme.b.moraes@gmail.com>
Date: Mon, 23 Sep 2024 17:35:43 -0300
Subject: [PATCH 2/2] Add more comments in README

---
 README.md               | 18 +++++++++++++-----
 src/reader.jl           | 32 ++++++++++++++++++++++++++++++++
 test/test_read_write.jl |  6 ++++++
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 83c6817..bed20f0 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ Quiver is not the fastest data-structure for time series data, but it is designe
 is to have a set of dimensions that can be used to index the data and a set of values from the time serires attributes. This allows to have a
 table-like data-structure that can be used to store time series data. 
 
-Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata.
-CSV files are implemented in a way that the first few lines are used to store the metadata and the rest of the file is used to store the data., i.e.
+Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata. The metadata stores the version of the file, the names of the dimensions, the maximum size of each dimension, the initial date, the time dimension, the frequency of the time series, the unit of the data and the labels of the time series.
 
-```csv
+The metadata is always stored in a TOML file in the following format:
+
+```toml
 version = 1
 dimensions = ["stage", "scenario", "block"]
 dimension_size = [10, 12, 744]
@@ -18,11 +19,18 @@ time_dimension = "stage"
 frequency = "month"
 unit = ""
 labels = ["agent_1", "agent_2", "agent_3"]
---- 
+```
+
+The data is stored in a CSV or binary file that contains the values of the time series. The CSV format is as follows:
+```csv
 stage,scenario,block,agent_1,agent_2,agent_3
 1,1,1,1.0,1.0,1.0
 1,1,2,1.0,1.0,1.0
 1,1,3,1.0,1.0,1.0
 ```
 
-The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of the dimension, the maximum value of each dimension, the time dimension and the version of the file.
\ No newline at end of file
+## Installation
+
+```julia
+pkg> add Quiver
+```
\ No newline at end of file
diff --git a/src/reader.jl b/src/reader.jl
index aec83ff..1cd35ce 100644
--- a/src/reader.jl
+++ b/src/reader.jl
@@ -80,6 +80,14 @@ function _move_data_from_buffer_cache_to_data!(reader::Reader)
     return nothing
 end
 
+"""
+    goto!(
+        reader::Reader;
+        dims...
+    )
+
+Move the reader to the specified dimensions and return the data.
+"""
 function goto!(reader::Reader; dims...)
     validate_dimensions(reader.metadata, dims...)
     _build_dimension_to_read!(reader; dims...)
@@ -89,12 +97,22 @@ function goto!(reader::Reader; dims...)
     return reader.data
 end
 
+"""
+    next_dimension!(reader::Reader)
+
+Move the reader to the next dimension and return the data.
+"""
 function next_dimension!(reader::Reader)
     _quiver_next_dimension!(reader)
     _move_data_from_buffer_cache_to_data!(reader)
     return reader.data
 end
 
+"""
+    max_index(reader::Reader, dimension::String)
+
+Return the maximum index of the specified dimension.
+"""
 function max_index(reader::Reader, dimension::String)
     symbol_dim = Symbol(dimension)
     index = findfirst(isequal(symbol_dim), reader.metadata.dimensions)
@@ -104,6 +122,11 @@ function max_index(reader::Reader, dimension::String)
     return reader.metadata.dimension_size[index]
 end
 
+"""
+    close!(reader::Reader)
+
+Close the reader.
+"""
 function close!(reader::Reader)
     _quiver_close!(reader)
     return nothing
@@ -148,6 +171,15 @@ function file_to_array(
     return data, metadata
 end
 
+"""
+    file_to_df(
+        filename::String,
+        implementation::Type{I};
+        labels_to_read::Vector{String} = String[],
+    ) where {I <: Implementation}
+
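+Read a Quiver file into a `DataFrame` with one column per dimension and one per label, skipping rows whose values are all `NaN`; the file metadata is attached as `DataFrame` metadata.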
+"""
 function file_to_df(
     filename::String,
     implementation::Type{I};
diff --git a/test/test_read_write.jl b/test/test_read_write.jl
index 21b4b4c..21e05f1 100644
--- a/test/test_read_write.jl
+++ b/test/test_read_write.jl
@@ -1115,6 +1115,9 @@ function read_file_to_array(impl)
     for i in eachindex(data)
         @test data[i] == data_read[i]
     end
+    
+    rm("$filename.$(Quiver.file_extension(impl))")
+    rm("$filename.toml")
 end
 
 function read_file_to_df(impl)
@@ -1164,6 +1167,9 @@ function read_file_to_df(impl)
     @test DataFrames.metadata(df, "time_dimension") == "stage"
     @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"]
     @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"]
+
+    rm("$filename.$(Quiver.file_extension(impl))")
+    rm("$filename.toml")
 end
 
 function test_read_write_implementations()