""" skmem.MemReducer
Smart memory reduction for pandas.
A transformer to quickly reduce dataframe memory by converting memory-hungry
dtypes to ones needing less memory. Advantages include:
- Fully compatible with scikit-learn. Combine with other transformers
and pipelines with ease.
- Preserves data integrity. Set simple parameters to control
treatment of floats and objects.
- Easy to customize. Use class inheritance or directly change modular
functions as needed.
- Efficient. Save time with vectorized functions that process data
faster than most parallelized solutions.
"""
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import validation
class MemReducer(BaseEstimator, TransformerMixin):
    """Reduce dataframe memory by converting dataframe columns to dtypes
    requiring less memory. Returns a dataframe with memory-efficient
    dtypes where possible.

    Integers, 64-bit floats and objects/strings can be converted.
    Parameters provide control for treatment of floats and objects.

    Parameters
    ----------
    max_unique_pct : float, optional, default=0.5
        Sets maximum threshold for converting object columns to
        categoricals. Threshold is compared to the number of unique values
        as a percent of column length. 0.0 prevents all conversions and
        1.0 allows all conversions.

    Example
    -------
    >>> import skmem
    >>> df = pd.DataFrame({'cats': np.tile(['a', 'b'], 500_000),
    ...                    'true_ints': np.tile(np.arange(-5, 5), 100_000),
    ...                    'floats': np.arange(0., 1_000_000.)})
    >>> mr = skmem.MemReducer(max_unique_pct=0.8)
    >>> df_small = mr.fit_transform(df, float_cols=['floats'])
    >>> print(df_small.dtypes)
    |cats         category
    |true_ints        int8
    |floats        float32
    |dtype: object

    Notes
    -----
    Downcasting to float dtypes below 32 bits (e.g. np.float16)
    is not supported.
    """

    def __init__(self, max_unique_pct=0.5):
        self.max_unique_pct = max_unique_pct

    def fit(self, df, float_cols=None):
        """Identify the dataframe and any float columns to be reduced.

        Parameters
        ----------
        df : pandas DataFrame
            The dataframe used as the basis for conversion.
        float_cols : list, optional, default=None
            A list of column names to be converted from np.float64 to
            np.float32.

        Raises
        ------
        TypeError
            If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            # Single-line message: the original used a backslash
            # continuation inside the string, which baked a run of
            # indentation spaces into the error text.
            raise TypeError(f"'{type(df).__name__}' object is not "
                            "a pandas dataframe.")
        self.float_candidates = float_cols
        return self

    # Helper functions for .transform()

    def reduce_ints(self, df):
        """Downcast integer columns to the smallest dtype that holds them."""
        int_cols = df.select_dtypes('integer').columns
        if len(int_cols) > 0:
            print("Starting integers.")
            # Column minimums decide between unsigned and signed downcasts.
            mins = df[int_cols].min()
            unsigneds = mins.index[mins >= 0]
            df[unsigneds] = df[unsigneds].apply(pd.to_numeric,
                                                downcast='unsigned')
            signeds = mins.index[mins < 0]
            df[signeds] = df[signeds].apply(pd.to_numeric,
                                            downcast='signed')
        return df

    def reduce_floats(self, df, float_cols):
        """Convert the requested np.float64 columns to np.float32."""
        print("Starting floats.")
        if not isinstance(float_cols, list):
            # Fixed string: no backslash continuation, so no stray
            # indentation spaces inside the printed message.
            print(f"'{type(float_cols).__name__}' object is not a list, "
                  "skipping floats.")
        else:
            true_float_cols = df.select_dtypes(np.float64).columns.tolist()
            non_float64s = [f for f in float_cols if f not in true_float_cols]
            if len(non_float64s) > 0:
                print("Skipping columns that are not np.float64")
            convertibles = [f for f in float_cols if f in true_float_cols]
            if len(convertibles) > 0:
                df[convertibles] = df[convertibles].astype(np.float32)
        return df

    def reduce_objs(self, df, max_pct):
        """Convert object columns to numbers where possible, then to
        categoricals when unique values are rare enough.

        Raises
        ------
        ValueError
            If ``max_pct`` is outside [0, 1].
        """
        if (max_pct < 0.) or (max_pct > 1.):
            raise ValueError("max_unique_pct must be between 0 and 1")
        obj_cols = df.select_dtypes('object').columns
        if len(obj_cols) > 0:
            print("Starting objects.")
            for oc in obj_cols:
                try:
                    df[oc] = pd.to_numeric(df[oc], downcast='integer')
                except (ValueError, TypeError):
                    # Narrowed from a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Non-numeric columns are
                    # left for the categorical pass below.
                    pass
                else:
                    print(f"Converting {oc} to numbers.")
            # Re-select: columns converted above are no longer objects.
            obj_cols = df.select_dtypes('object').columns
            # len(df) > 0 guards the division for empty dataframes.
            if len(obj_cols) > 0 and len(df) > 0:
                category_mask = (df[obj_cols].nunique().values
                                 / len(df) <= max_pct)
                cat_cols = obj_cols[category_mask]
                if len(cat_cols) > 0:
                    df[cat_cols] = df[cat_cols].astype('category')
        return df

    def transform(self, df):
        """Convert dataframe columns to dtypes requiring lower memory.

        Parameters
        ----------
        df : pandas DataFrame
            The dataframe to be converted. NOTE(review): columns are
            reassigned on the passed-in frame, so the input is modified
            in place as well as returned.
        """
        validation.check_is_fitted(self, 'float_candidates')
        memory_MB_in = df.memory_usage(deep=True).sum() / (1024 ** 2)
        print(f"Memory in: {memory_MB_in:.2f} MB")
        df = self.reduce_ints(df)
        if self.float_candidates is not None:
            df = self.reduce_floats(df, self.float_candidates)
        df = self.reduce_objs(df, self.max_unique_pct)
        memory_MB_out = df.memory_usage(deep=True).sum() / (1024 ** 2)
        print(f"Memory out: {memory_MB_out:.2f} MB")
        # Guard against division by zero on a zero-memory (empty) frame.
        if memory_MB_in > 0:
            print(f"Reduction: {1 - memory_MB_out/memory_MB_in:.1%}")
        return df
def main(path='../input/house-prices-advanced-regression-techniques/train.csv'):
    """Demonstration / smoke-test entry point for running skmem directly.

    Parameters
    ----------
    path : str, optional
        CSV file to load and reduce. Defaults to the Kaggle house-prices
        training set the script was originally demonstrated on; pass any
        other CSV path to try the reducer on different data.
    """
    df = pd.read_csv(path)
    mr = MemReducer()
    mr.fit_transform(df)


if __name__ == "__main__":
    main()