Skip to content

Commit

Permalink
Added support for keyword arguments to agg methods - closes #67
Browse files Browse the repository at this point in the history
  • Loading branch information
ankane committed May 6, 2024
1 parent 223dadc commit 37f8480
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 31 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.10.1 (unreleased)

- Added support for keyword arguments to `agg` methods

## 0.10.0 (2024-05-02)

- Updated Polars to 0.39.2
Expand Down
4 changes: 2 additions & 2 deletions lib/polars/dynamic_group_by.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def initialize(
@start_by = start_by
end

def agg(aggs)
def agg(*aggs, **named_aggs)
@df.lazy
.group_by_dynamic(
@time_column,
Expand All @@ -45,7 +45,7 @@ def agg(aggs)
by: @by,
start_by: @start_by
)
.agg(aggs)
.agg(*aggs, **named_aggs)
.collect(no_optimization: true, string_cache: false)
end
end
Expand Down
111 changes: 88 additions & 23 deletions lib/polars/group_by.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,39 +106,104 @@ def each
# _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
# end

# Use multiple aggregations on columns.
# Compute aggregations for each group of a group by operation.
#
# This can be combined with complete lazy API and is considered idiomatic polars.
#
# @param aggs [Object]
# Single / multiple aggregation expression(s).
# @param aggs [Array]
# Aggregations to compute for each group of the group by operation,
# specified as positional arguments.
# Accepts expression input. Strings are parsed as column names.
# @param named_aggs [Hash]
# Additional aggregations, specified as keyword arguments.
# The resulting columns will be renamed to the keyword used.
#
# @return [DataFrame]
#
# @example
# @example Compute the aggregation of the columns for each group.
# df = Polars::DataFrame.new(
# {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
# {
# "a" => ["a", "b", "a", "b", "c"],
# "b" => [1, 2, 1, 3, 3],
# "c" => [5, 4, 3, 2, 1]
# }
# )
# df.group_by("foo", maintain_order: true).agg(
# [
# Polars.sum("bar").suffix("_sum"),
# Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
# ]
# df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
# # =>
# # shape: (3, 3)
# # ┌─────┬───────────┬───────────┐
# # │ a ┆ b ┆ c │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ list[i64] ┆ list[i64] │
# # ╞═════╪═══════════╪═══════════╡
# # │ a ┆ [1, 1] ┆ [5, 3] │
# # │ b ┆ [2, 3] ┆ [4, 2] │
# # │ c ┆ [3] ┆ [1] │
# # └─────┴───────────┴───────────┘
#
# @example Compute the sum of a column for each group.
# df.group_by("a").agg(Polars.col("b").sum)
# # =>
# # shape: (3, 2)
# # ┌─────┬─────┐
# # │ a ┆ b │
# # │ --- ┆ --- │
# # │ str ┆ i64 │
# # ╞═════╪═════╡
# # │ a ┆ 2 │
# # │ b ┆ 5 │
# # │ c ┆ 3 │
# # └─────┴─────┘
#
# @example Compute multiple aggregates at once by passing a list of expressions.
# df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
# # =>
# # shape: (3, 3)
# # ┌─────┬─────┬─────┐
# # │ a ┆ b ┆ c │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═════╪═════╡
# # │ c ┆ 3 ┆ 1.0 │
# # │ a ┆ 2 ┆ 4.0 │
# # │ b ┆ 5 ┆ 3.0 │
# # └─────┴─────┴─────┘
#
# @example Or use positional arguments to compute multiple aggregations in the same way.
# df.group_by("a").agg(
# Polars.sum("b").name.suffix("_sum"),
# (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
# )
# # =>
# # shape: (2, 3)
# # ┌─────┬─────────┬──────────────┐
# # │ foo ┆ bar_sum ┆ bar_tail_sum │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ i64 │
# # ╞═════╪═════════╪══════════════╡
# # │ one ┆ 9 ┆ 9 │
# # │ two ┆ 6 ┆ 5 │
# # └─────┴─────────┴──────────────┘
def agg(aggs)
# # shape: (3, 3)
# # ┌─────┬───────┬────────────────┐
# # │ a ┆ b_sum ┆ c_mean_squared │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═══════╪════════════════╡
# # │ a ┆ 2 ┆ 17.0 │
# # │ c ┆ 3 ┆ 1.0 │
# # │ b ┆ 5 ┆ 10.0 │
# # └─────┴───────┴────────────────┘
#
# @example Use keyword arguments to easily name your expression inputs.
# df.group_by("a").agg(
# b_sum: Polars.sum("b"),
# c_mean_squared: (Polars.col("c") ** 2).mean
# )
# # =>
# # shape: (3, 3)
# # ┌─────┬───────┬────────────────┐
# # │ a ┆ b_sum ┆ c_mean_squared │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═══════╪════════════════╡
# # │ a ┆ 2 ┆ 17.0 │
# # │ c ┆ 3 ┆ 1.0 │
# # │ b ┆ 5 ┆ 10.0 │
# # └─────┴───────┴────────────────┘
def agg(*aggs, **named_aggs)
@df.lazy
.group_by(@by, maintain_order: @maintain_order)
.agg(aggs)
.agg(*aggs, **named_aggs)
.collect(no_optimization: true)
end

Expand Down
103 changes: 100 additions & 3 deletions lib/polars/lazy_group_by.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,108 @@ def initialize(lgb)
@lgb = lgb
end

# Describe the aggregation that need to be done on a group.
# Compute aggregations for each group of a group by operation.
#
# @param aggs [Array]
# Aggregations to compute for each group of the group by operation,
# specified as positional arguments.
# Accepts expression input. Strings are parsed as column names.
# @param named_aggs [Hash]
# Additional aggregations, specified as keyword arguments.
# The resulting columns will be renamed to the keyword used.
#
# @return [LazyFrame]
def agg(aggs)
rbexprs = Utils.selection_to_rbexpr_list(aggs)
#
# @example Compute the aggregation of the columns for each group.
# ldf = Polars::DataFrame.new(
# {
# "a" => ["a", "b", "a", "b", "c"],
# "b" => [1, 2, 1, 3, 3],
# "c" => [5, 4, 3, 2, 1]
# }
# ).lazy
# ldf.group_by("a").agg(
# [Polars.col("b"), Polars.col("c")]
# ).collect
# # =>
# # shape: (3, 3)
# # ┌─────┬───────────┬───────────┐
# # │ a ┆ b ┆ c │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ list[i64] ┆ list[i64] │
# # ╞═════╪═══════════╪═══════════╡
# # │ a ┆ [1, 1] ┆ [5, 3] │
# # │ b ┆ [2, 3] ┆ [4, 2] │
# # │ c ┆ [3] ┆ [1] │
# # └─────┴───────────┴───────────┘
#
# @example Compute the sum of a column for each group.
# ldf.group_by("a").agg(
# Polars.col("b").sum
# ).collect
# # =>
# # shape: (3, 2)
# # ┌─────┬─────┐
# # │ a ┆ b │
# # │ --- ┆ --- │
# # │ str ┆ i64 │
# # ╞═════╪═════╡
# # │ a ┆ 2 │
# # │ b ┆ 5 │
# # │ c ┆ 3 │
# # └─────┴─────┘
#
# @example Compute multiple aggregates at once by passing a list of expressions.
# ldf.group_by("a").agg(
# [Polars.sum("b"), Polars.mean("c")]
# ).collect
# # =>
# # shape: (3, 3)
# # ┌─────┬─────┬─────┐
# # │ a ┆ b ┆ c │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═════╪═════╡
# # │ c ┆ 3 ┆ 1.0 │
# # │ a ┆ 2 ┆ 4.0 │
# # │ b ┆ 5 ┆ 3.0 │
# # └─────┴─────┴─────┘
#
# @example Or use positional arguments to compute multiple aggregations in the same way.
# ldf.group_by("a").agg(
# Polars.sum("b").name.suffix("_sum"),
# (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
# ).collect
# # =>
# # shape: (3, 3)
# # ┌─────┬───────┬────────────────┐
# # │ a ┆ b_sum ┆ c_mean_squared │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═══════╪════════════════╡
# # │ a ┆ 2 ┆ 17.0 │
# # │ c ┆ 3 ┆ 1.0 │
# # │ b ┆ 5 ┆ 10.0 │
# # └─────┴───────┴────────────────┘
#
# @example Use keyword arguments to easily name your expression inputs.
# ldf.group_by("a").agg(
# b_sum: Polars.sum("b"),
# c_mean_squared: (Polars.col("c") ** 2).mean
# ).collect
# # =>
# # shape: (3, 3)
# # ┌─────┬───────┬────────────────┐
# # │ a ┆ b_sum ┆ c_mean_squared │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ i64 ┆ f64 │
# # ╞═════╪═══════╪════════════════╡
# # │ a ┆ 2 ┆ 17.0 │
# # │ c ┆ 3 ┆ 1.0 │
# # │ b ┆ 5 ┆ 10.0 │
# # └─────┴───────┴────────────────┘
def agg(*aggs, **named_aggs)
  # Positional and keyword aggregations are handed together to
  # Utils.parse_as_list_of_expressions; its result is passed to the
  # underlying group-by object's `agg`, and that result is wrapped
  # via Utils.wrap_ldf before being returned.
  Utils.wrap_ldf(
    @lgb.agg(Utils.parse_as_list_of_expressions(*aggs, **named_aggs))
  )
end

Expand Down
4 changes: 2 additions & 2 deletions lib/polars/rolling_group_by.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def initialize(
@check_sorted = check_sorted
end

def agg(aggs)
def agg(*aggs, **named_aggs)
@df.lazy
.group_by_rolling(
index_column: @time_column, period: @period, offset: @offset, closed: @closed, by: @by, check_sorted: @check_sorted
)
.agg(aggs)
.agg(*aggs, **named_aggs)
.collect(no_optimization: true, string_cache: false)
end
end
Expand Down
2 changes: 1 addition & 1 deletion test/docs_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def assert_examples(method, cls)
end

# non-deterministic output
next if [:sort, :mode, :duration, :_hash, :hash_rows, :flatten, :value_counts].include?(method.name)
next if [:sort, :mode, :duration, :_hash, :hash_rows, :flatten, :value_counts, :agg].include?(method.name)

# check output
lines = code.split("\n")
Expand Down

0 comments on commit 37f8480

Please sign in to comment.