Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

groupby().agg(mean) kinda works; groupby().agg(std) not at all #134

Open
MichaelTiemannOSC opened this issue Oct 7, 2022 · 1 comment
Open

Comments

@MichaelTiemannOSC
Copy link
Collaborator

The following test case shows that mean and std are well-behaved when using groupby().agg() on a dataframe full of floats. That is not the case when the dataframe is made of Quantities: mean can be made to work, but std cannot.

import numpy as np
import pandas as pd
# import uncertainties
# Repro script: compare groupby().agg() with mean/std on plain floats vs. pint Quantities.
from pint import UnitRegistry, set_application_registry
from pint_pandas import PintArray, PintType
from openscm_units import unit_registry
# Use the openscm_units registry for pint-pandas and as pint's application registry.
PintType.ureg = unit_registry
ureg = unit_registry
set_application_registry(ureg)
Q_ = ureg.Quantity  # shorthand constructors/aliases used below
M_ = ureg.Measurement
PA_ = PintArray
ureg.define("CO2e = CO2 = CO2eq = CO2_eq")  # registers CO2e in terms of CO2; presumably CO2eq / CO2_eq are alternate spellings -- confirm against pint's definition syntax
# Control case: the value column (2016) holds plain floats.
aa = pd.DataFrame({'company_id':['C1', 'C1', 'C1', 'C2', 'C2', 'C3'],
                   'metric': ['s1', 's1', 's1', 's1', 's1', 's1'],
                   2016: [1.0, 2.0, 3.0, 10.0, 10.0, 5.0]})
# Both aggregations succeed here per the reported output (std is NaN for the single-row group C3).
print(f"aa = {aa}")
print(f"aa.groupby(mean) = {aa.groupby(by=['company_id', 'metric']).agg(lambda x: x.mean())}")
print(f"aa.groupby(std) = {aa.groupby(by=['company_id', 'metric']).agg(lambda x: x.std())}")
# Same data, but every value in the 2016 column is a pint Quantity in 't CO2'.
xx = pd.DataFrame({'company_id':['C1', 'C1', 'C1', 'C2', 'C2', 'C3'],
                   'metric': ['s1', 's1', 's1', 's1', 's1', 's1'],
                   2016: [Q_(1.0, 't CO2'), Q_(2.0, 't CO2'), Q_(3.0, 't CO2'), Q_(10.0, 't CO2'), Q_(10.0, 't CO2'), Q_(5.0, 't CO2')]})
# Reported result: mean() yields the expected Quantities; std() raises ValueError (could not convert string to float: 'C1').
print(f"xx = {xx}")
print(f"xx.groupby(mean) = {xx.groupby(by=['company_id', 'metric']).agg(lambda x: x.astype('pint[t CO2]').mean())}")
print(f"xx.groupby(std) = {xx.groupby(by=['company_id', 'metric']).agg(lambda x: x.astype('pint[t CO2]').std())}")

Here's the output showing correct mean behavior and bad std behavior:

bash-3.2$ python pint-std.py
aa =   company_id metric  2016
0         C1     s1   1.0
1         C1     s1   2.0
2         C1     s1   3.0
3         C2     s1  10.0
4         C2     s1  10.0
5         C3     s1   5.0
aa.groupby(mean) =                    2016
company_id metric
C1         s1       2.0
C2         s1      10.0
C3         s1       5.0
aa.groupby(std) =                    2016
company_id metric
C1         s1       1.0
C2         s1       0.0
C3         s1       NaN
/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/dtypes/cast.py:1983: UnitStrippedWarning: The unit of the quantity is stripped when downcasting to ndarray.
  result[:] = values
xx =   company_id metric                   2016
0         C1     s1   1.0 CO2 * metric_ton
1         C1     s1   2.0 CO2 * metric_ton
2         C1     s1   3.0 CO2 * metric_ton
3         C2     s1  10.0 CO2 * metric_ton
4         C2     s1  10.0 CO2 * metric_ton
5         C3     s1   5.0 CO2 * metric_ton
xx.groupby(mean) =                                     2016
company_id metric
C1         s1       2.0 CO2 * metric_ton
C2         s1      10.0 CO2 * metric_ton
C3         s1       5.0 CO2 * metric_ton
/Users/michael/Documents/GitHub/ITR-MichaelTiemannOSC/examples/pint-std.py:31: FutureWarning: Dropping invalid columns in DataFrameGroupBy.agg is deprecated. In a future version, a TypeError will be raised. Before calling .agg, select only columns which should be valid for the function.
  print(f"xx.groupby(std) = {xx.groupby(by=['company_id', 'metric']).agg(lambda x: x.astype('pint[t CO2]').std())}")
/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pint_pandas/pint_array.py:194: RuntimeWarning: pint-pandas does not support magnitudes of <class 'str'>. Converting magnitudes to float.
  warnings.warn(
Traceback (most recent call last):
  File "/Users/michael/Documents/GitHub/ITR-MichaelTiemannOSC/examples/pint-std.py", line 31, in <module>
    print(f"xx.groupby(std) = {xx.groupby(by=['company_id', 'metric']).agg(lambda x: x.astype('pint[t CO2]').std())}")
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/groupby/generic.py", line 883, in aggregate
    return self._python_agg_general(func, *args, **kwargs)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1499, in _python_agg_general
    return self._python_apply_general(f, self._selected_obj)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1464, in _python_apply_general
    values, mutated = self.grouper.apply(f, data, self.axis)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/groupby/ops.py", line 761, in apply
    res = f(group)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/groupby/groupby.py", line 1476, in <lambda>
    f = lambda x: func(x, *args, **kwargs)
  File "/Users/michael/Documents/GitHub/ITR-MichaelTiemannOSC/examples/pint-std.py", line 31, in <lambda>
    print(f"xx.groupby(std) = {xx.groupby(by=['company_id', 'metric']).agg(lambda x: x.astype('pint[t CO2]').std())}")
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/generic.py", line 5905, in astype
    results = [
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/generic.py", line 5906, in <listcomp>
    self.iloc[:, i].astype(dtype, copy=copy)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/generic.py", line 5912, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 419, in astype
    return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 304, in apply
    applied = getattr(b, f)(**kwargs)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/internals/blocks.py", line 580, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/dtypes/cast.py", line 1292, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/dtypes/cast.py", line 1237, in astype_array
    values = astype_nansafe(values, dtype, copy=copy)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pandas/core/dtypes/cast.py", line 1108, in astype_nansafe
    return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pint_pandas/pint_array.py", line 446, in _from_sequence
    return cls(scalars, dtype=dtype, copy=copy)
  File "/Users/michael/opt/miniconda3/envs/itr_env/lib/python3.9/site-packages/pint_pandas/pint_array.py", line 198, in __init__
    self._data = np.array(values, float, copy=copy)
ValueError: could not convert string to float: 'C1'
@MichaelTiemannOSC
Copy link
Collaborator Author

This commit bears directly on the above problem, but was committed after the latest (0.2) release. Time for a 0.3 release?

I realize (from other discussions) that there's still an open question about what to do with std, since its unit type is a delta of a base type rather than the base type itself. The aforementioned patch does not try to deal with that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant