Commit 71fd3d8

Merge pull request #23 from psrenergy/gb/implement_to_df
Add a fast way to transform a Quiver file into a DataFrame directly
guilhermebodin authored Sep 23, 2024
2 parents f9112c2 + 3e2eab8 commit 71fd3d8
Showing 5 changed files with 154 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Quiver"
uuid = "cdbb3f72-2527-4dbd-9d0e-93533a5519ac"
authors = ["raphasampaio", "guilhermebodin"]
version = "0.1.5"
version = "0.1.6"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
18 changes: 13 additions & 5 deletions README.md
@@ -6,10 +6,11 @@ Quiver is not the fastest data-structure for time series data, but it is designe
is to have a set of dimensions that can be used to index the data and a set of values from the time series attributes. This allows for a
table-like data-structure that can be used to store time series data.

Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata.
CSV files are implemented in a way that the first few lines are used to store the metadata and the rest of the file is used to store the data, i.e.
Files that follow the Quiver implementation can be stored in any format that maps directly to a table-like structure with metadata. The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of dimensions, the maximum value of each dimension, the time dimension and the version of the file.

```csv
The metadata is always stored in a TOML file in the following format:

```toml
version = 1
dimensions = ["stage", "scenario", "block"]
dimension_size = [10, 12, 744]
@@ -18,11 +19,18 @@ time_dimension = "stage"
frequency = "month"
unit = ""
labels = ["agent_1", "agent_2", "agent_3"]
---
```

And the data is stored in a CSV or binary file that contains the values of the time series. The CSV format is as follows:
```csv
stage,scenario,block,agent_1,agent_2,agent_3
1,1,1,1.0,1.0,1.0
1,1,2,1.0,1.0,1.0
1,1,3,1.0,1.0,1.0
```

The metadata stores the frequency of the time series, the initial date, the unit of the data, the number of dimensions, the maximum value of each dimension, the time dimension and the version of the file.
## Installation

```julia
pkg> add Quiver
```
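A file in this layout can be produced with `Quiver.array_to_file`; below is a minimal sketch based on the keyword arguments exercised in this commit's tests (the file name, sizes, and values are illustrative):

```julia
using Quiver, Dates

# Illustrative sizes: 2 stages, 2 scenarios, 3 blocks, 3 agents.
# The array layout is (label, block, scenario, stage), matching the tests in this commit.
data = rand(3, 3, 2, 2)

Quiver.array_to_file(
    "example",   # writes example.toml plus the data file (example.csv for the CSV implementation)
    data,
    Quiver.csv;
    dimensions = ["stage", "scenario", "block"],
    labels = ["agent_1", "agent_2", "agent_3"],
    time_dimension = "stage",
    dimension_size = [2, 2, 3],
    initial_date = DateTime(2006, 1, 1),
    unit = "",
)
```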
8 changes: 6 additions & 2 deletions src/metadata.jl
@@ -57,8 +57,8 @@ function Metadata(;
    return metadata
end

function to_toml(metadata::Metadata, filename::String)
    dict_metadata = OrderedDict(
function to_ordered_dict(metadata::Metadata)
    return OrderedDict(
        "version" => metadata.version,
        "dimensions" => String.(metadata.dimensions),
        "dimension_size" => metadata.dimension_size,
@@ -68,6 +68,10 @@ function to_toml(metadata::Metadata, filename::String)
        "unit" => metadata.unit,
        "labels" => metadata.labels,
    )
end

function to_toml(metadata::Metadata, filename::String)
    dict_metadata = to_ordered_dict(metadata)
    open(filename, "w") do io
        TOML.print(io, dict_metadata)
    end
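Splitting `to_ordered_dict` out of `to_toml` lets the same dictionary back both the TOML writer and the DataFrame-level metadata used by `file_to_df` below. A small sketch, assuming `m` is an already-constructed `Metadata` value:

```julia
# `m` is a hypothetical Metadata value built elsewhere.
dict = Quiver.to_ordered_dict(m)    # OrderedDict of version, dimensions, labels, ...
Quiver.to_toml(m, "example.toml")   # writes that same dictionary via TOML.print
```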
78 changes: 77 additions & 1 deletion src/reader.jl
@@ -80,6 +80,14 @@ function _move_data_from_buffer_cache_to_data!(reader::Reader)
    return nothing
end

"""
goto!(
reader::Reader;
dims...
)
Move the reader to the specified dimensions and return the data.
"""
function goto!(reader::Reader; dims...)
validate_dimensions(reader.metadata, dims...)
_build_dimension_to_read!(reader; dims...)
Expand All @@ -89,12 +97,22 @@ function goto!(reader::Reader; dims...)
return reader.data
end

"""
next_dimension!(reader::Reader)
Move the reader to the next dimension and return the data.
"""
function next_dimension!(reader::Reader)
_quiver_next_dimension!(reader)
_move_data_from_buffer_cache_to_data!(reader)
return reader.data
end

"""
max_index(reader::Reader, dimension::String)
Return the maximum index of the specified dimension.
"""
function max_index(reader::Reader, dimension::String)
symbol_dim = Symbol(dimension)
index = findfirst(isequal(symbol_dim), reader.metadata.dimensions)
Expand All @@ -104,6 +122,11 @@ function max_index(reader::Reader, dimension::String)
return reader.metadata.dimension_size[index]
end

"""
close!(reader::Reader)
Close the reader.
"""
function close!(reader::Reader)
_quiver_close!(reader)
return nothing
@@ -126,7 +149,6 @@ function file_to_array(
    reader = Reader{I}(
        filename;
        labels_to_read,
        carrousel = false, # carrousel does not make sense in this implementation
    )

    metadata = reader.metadata
@@ -147,4 +169,58 @@ Quiver.close!(reader)
    Quiver.close!(reader)

    return data, metadata
end

"""
file_to_df(
filename::String,
implementation::Type{I};
labels_to_read::Vector{String} = String[],
) where {I <: Implementation}
Reads a file and returns the data and metadata as a DataFrame.
"""
function file_to_df(
    filename::String,
    implementation::Type{I};
    labels_to_read::Vector{String} = String[],
) where {I <: Implementation}
    reader = Reader{I}(
        filename;
        labels_to_read,
    )

    metadata = reader.metadata
    dimension_names = reverse(metadata.dimensions)
    dimension_sizes = reverse(metadata.dimension_size)

    df = DataFrame()

    # Add all columns to the DataFrame
    for dim in metadata.dimensions
        DataFrames.insertcols!(df, dim => Int[])
    end
    for label in reader.labels_to_read
        DataFrames.insertcols!(df, label => Float32[])
    end

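    # Note: the reversal above makes Iterators.product (which varies its first argument fastest)
    # emit the last dimension fastest, so rows come out ordered by the leading dimensions;
    # reverse(dims) below restores the original dimension order for each row.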
    for dims in Iterators.product([1:size for size in dimension_sizes]...)
        dim_kwargs = OrderedDict(Symbol.(dimension_names) .=> dims)
        Quiver.goto!(reader; dim_kwargs...)
        if all(isnan.(reader.data))
            continue
        end
        # Construct the data frame row by row
        push!(df, [reverse(dims)...; reader.data...])
    end

    # Add metadata to the DataFrame
    ordered_dict_metadata = to_ordered_dict(metadata)
    for (k, v) in ordered_dict_metadata
        DataFrames.metadata!(df, k, v)
    end

    Quiver.close!(reader)

    return df
end
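A minimal usage sketch of the new function, mirroring the test added below; the file name and the choice of the `Quiver.csv` implementation are illustrative:

```julia
using Quiver, DataFrames

# Assumes "my_file.toml" and the matching data file were written earlier,
# e.g. with Quiver.array_to_file (the path is hypothetical).
df = Quiver.file_to_df("my_file", Quiver.csv)

first(df, 3)                       # dimension columns first, then one Float32 column per label
DataFrames.metadata(df, "labels")  # label names carried over from the TOML metadata
```

The DataFrame carries the Quiver metadata as table-level metadata, so downstream code can recover the unit, labels, and dimensions without reopening the TOML file.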
57 changes: 57 additions & 0 deletions test/test_read_write.jl
Expand Up @@ -2,6 +2,7 @@ module TestWriter

using Test
using Quiver
using DataFrames
using Dates

function read_write_1(impl)
@@ -1114,6 +1115,61 @@ function read_file_to_array(impl)
    for i in eachindex(data)
        @test data[i] == data_read[i]
    end

    rm("$filename.$(Quiver.file_extension(impl))")
    rm("$filename.toml")
end

function read_file_to_df(impl)
    filename = joinpath(@__DIR__, "test_read_file_to_df")

    initial_date = DateTime(2006, 1, 1)
    num_stages = 4
    dates = collect(initial_date:Dates.Month(1):initial_date + Dates.Month(num_stages - 1))
    num_scenarios = 3
    num_blocks_per_stage = Int32.(Dates.daysinmonth.(dates) .* 24)
    num_time_series = 3

    dimensions = ["stage", "scenario", "block"]
    labels = ["agent_$i" for i in 1:num_time_series]
    time_dimension = "stage"
    dimension_size = [num_stages, num_scenarios, maximum(num_blocks_per_stage)]

    data = zeros(num_time_series, maximum(num_blocks_per_stage), num_scenarios, num_stages)
    for stage in 1:num_stages
        for scenario in 1:num_scenarios
            for block in 1:num_blocks_per_stage[stage]
                for i in 1:num_time_series
                    data[i, block, scenario, stage] = stage + scenario + block + i
                end
            end
        end
    end

    Quiver.array_to_file(
        filename,
        data,
        impl;
        dimensions,
        labels,
        time_dimension,
        dimension_size,
        initial_date,
        unit = " - "
    )

    df = Quiver.file_to_df(filename, impl)

    # This might be inaccurate; if it fails, these tests can be removed
    @test size(df, 1) == 8928
    @test size(df, 2) == 6

    @test DataFrames.metadata(df, "time_dimension") == "stage"
    @test DataFrames.metadata(df, "dimensions") == ["stage", "scenario", "block"]
    @test DataFrames.metadata(df, "labels") == ["agent_1", "agent_2", "agent_3"]

    rm("$filename.$(Quiver.file_extension(impl))")
    rm("$filename.toml")
end

function test_read_write_implementations()
@@ -1132,6 +1188,7 @@ function test_read_write_implementations()
        read_filtering_labels(impl)
        read_write_out_of_order_kwargs(impl)
        read_file_to_array(impl)
        read_file_to_df(impl)
        if impl == Quiver.csv
            read_write_goto_csv_1()
            read_write_goto_csv_2()

2 comments on commit 71fd3d8

@guilhermebodin (Member Author)

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/115785

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.6 -m "<description of version>" 71fd3d85ba91402ef484825ccfe66161eff056cd
git push origin v0.1.6
