-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwrangling.py
161 lines (124 loc) · 4.8 KB
/
wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pandas as pd
import numpy as np
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Log line layout: "[file:line - function() ] message", with the function
# name padded to a width of 20 characters.
FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
# NOTE: this configures the root logger as an import-time side effect.
logging.basicConfig(format=FORMAT)
# Only ERROR and above pass this logger, so the info/debug calls in this
# module stay silent unless the level is lowered here.
logger.setLevel(logging.ERROR)
################################
# wrangling methods used in:
# - income_distribution.ipynb
################################
def get_persons_per_income_group(df):
    """Convert cumulative "income of $X and over" counts to per-bracket counts.

    Every row strictly between the first and the last has the following
    row's ``VALUE`` subtracted from it. The first and the last row are left
    unchanged. Frames with fewer than three rows come back unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a numeric ``VALUE`` column.
        Assumes rows are ordered by ascending income threshold -- confirm
        against the caller.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the differenced ``VALUE`` column. The input
        frame is not modified, so repeated calls on the same data give the
        same result.
    """
    result = df.copy()
    # Work on an explicit copy of the column: writing through
    # ``Series.values`` is unreliable because it may be a copy or a
    # read-only view of the underlying data.
    counts = result["VALUE"].to_numpy(copy=True)
    counts[1:-1] = counts[1:-1] - counts[2:]
    result["VALUE"] = counts
    return result
def create_income_bins(y) -> np.ndarray:
    """Collapse the 13 raw income brackets into 8 plotting bins.

    Raw entries 0-4 are summed into the first bin, entries 5-6 into the
    second, and entries 7 through 12 are each kept as their own bin.

    Parameters
    ----------
    y : array-like
        Per-bracket values.

    Returns
    -------
    np.ndarray
        * 8 binned values when ``y`` has exactly 13 entries;
        * 13 NaNs when ``y`` is empty;
        * ``y`` unchanged for any other length.
    """
    raw_income_bins = 13
    # Start index of every output bin, as consumed by np.add.reduceat.
    bin_starts = [0, 5, 7, 8, 9, 10, 11, 12]
    logger.info("create_income_bins()")
    # Lazy %-formatting: the array is only rendered if DEBUG is enabled.
    logger.debug("y: \n %s", y)
    if len(y) == raw_income_bins:
        return np.add.reduceat(y, bin_starts)
    if len(y) == 0:
        # NOTE(review): this placeholder has 13 entries while the binned
        # output above has 8 -- confirm that callers expect this length.
        return np.array([np.nan] * raw_income_bins)
    return y
def add_gaps(y):
    """Insert NaN placeholders into ``y`` to mark discontinuities.

    One NaN is placed in front of the element at index 4 and one in front
    of the element at index 7 (indices refer to ``y`` before insertion),
    so the result is two entries longer than the input.
    """
    gap_positions = [4, 7]
    filler = [np.nan]
    return np.insert(y, gap_positions, filler)
def normalize_plot_data(y)->np.array:
    """Scale ``y`` so that its entries sum to one.

    Each entry is divided by the sum of all entries.
    """
    total = np.sum(y)
    return np.divide(y, total)
def format_hist_data(df)->np.array:
    """Build normalized histogram heights from the ``VALUE`` column.

    The frame is passed through ``get_persons_per_income_group`` and the
    resulting ``VALUE`` array is normalized with ``normalize_plot_data``.
    """
    per_group = get_persons_per_income_group(df)
    return normalize_plot_data(per_group["VALUE"].values)
def preprocess_income_bin_data(df)->tuple:
    """Prepare histogram and cumulative series for plotting.

    Returns
    ---------
    tuple of np.arrays
        ``(y_hist, y_cumulative)``: the binned histogram values and their
        running sum. The running sum is taken before the NaN gaps are
        inserted; both arrays then receive the same gaps.
    """
    binned = create_income_bins(format_hist_data(df))
    running_total = np.cumsum(binned)
    return add_gaps(binned), add_gaps(running_total)
def subset_plot_data_for_income_bins(df)->pd.DataFrame:
    """Used in make_data.py to subset the raw data.

    Keeps only the columns needed for the income-bin plot, then keeps only
    the rows whose ``Persons with income`` label is one of the 13 brackets
    listed below.
    """
    wanted_columns = [
        'REF_DATE',
        'GEO',
        'Sex',
        'Age group',
        'Persons with income',
        'SCALAR_FACTOR',
        'VALUE',
    ]
    wanted_brackets = [
        "Persons with income under $5,000",
        "Persons with income of $5,000 and over",
        "Persons with income of $10,000 and over",
        "Persons with income of $15,000 and over",
        "Persons with income of $20,000 and over",
        "Persons with income of $25,000 and over",
        "Persons with income of $35,000 and over",
        "Persons with income of $50,000 and over",
        "Persons with income of $75,000 and over",
        "Persons with income of $100,000 and over",
        "Persons with income of $150,000 and over",
        "Persons with income of $200,000 and over",
        "Persons with income of $250,000 and over",
    ]
    trimmed = df.loc[:, wanted_columns]
    return get_income_to_plot_for_hist(trimmed, wanted_brackets)
###############################
# wrangling methods used in:
# - median_income.ipynb
###############################
def subset_rows(df, column, value)->np.ndarray:
    """
    Build a boolean row mask for the table from
    https://doi.org/10.25318/1110000801-eng

    df : pd.DataFrame
    column : str
        Name of the column to compare.
    value : str
        Value the column must equal.

    Returns a boolean array with one entry per row of ``df``; an entry is
    True where ``df[column]`` equals ``value``. The rows themselves are not
    selected here.
    """
    is_match = df[column] == value
    return is_match.to_numpy()
def subset_REF_DATE(df, year):
    """Return the boolean row mask where ``REF_DATE`` equals ``year``."""
    return subset_rows(df, column="REF_DATE", value=year)
def subset_GEO(df, geo):
    """Return the boolean row mask where ``GEO`` equals ``geo``."""
    return subset_rows(df, column="GEO", value=geo)
def subset_Sex(df, sex):
    """Return a boolean mask of the rows whose ``Sex`` is one of ``sex``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``Sex`` column.
    sex : str or list-like of str
        Accepted value(s). A single string is treated as a one-element
        list, matching the scalar arguments of the other ``subset_*``
        helpers.

    Returns
    -------
    pd.Series
        Boolean Series indexed like ``df``.
    """
    logger.debug("sex: %s", sex)
    if isinstance(sex, str):
        # Series.isin only accepts list-likes and raises TypeError for a
        # bare string, so wrap it.
        sex = [sex]
    return df["Sex"].isin(sex)
def subset_Age(df, age):
    """Return the boolean row mask where ``Age group`` equals ``age``."""
    return subset_rows(df, column="Age group", value=age)
def subset_year_age_sex_geo(df, year=None, age=None, sex=None, geo=None):
    """Select the rows of ``df`` matching the given year, age, sex and geo.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the column of every dimension that is filtered on
        (``REF_DATE``, ``Age group``, ``Sex``, ``GEO``).
    year, age, geo : scalar or None
        Value the corresponding column must equal.
    sex : list-like or None
        Accepted values of the ``Sex`` column.

    A dimension left at ``None`` is not filtered on. Previously ``None``
    was compared against the column (matching no rows) or, for ``sex``,
    raised ``TypeError``, so the defaults could not be used.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that satisfy every requested condition.
    """
    keep = np.ones(len(df), dtype=bool)
    if year is not None:
        keep &= subset_REF_DATE(df, year)
    if geo is not None:
        keep &= subset_GEO(df, geo)
    if sex is not None:
        # subset_Sex returns a Series; reduce it to a positional mask.
        keep &= np.asarray(subset_Sex(df, sex))
    if age is not None:
        keep &= subset_Age(df, age)
    return df[keep]
def get_income_to_plot_for_hist(df, income_to_plot):
    """Keep the rows whose ``Persons with income`` is in ``income_to_plot``."""
    is_wanted = df["Persons with income"].isin(income_to_plot)
    return df.loc[is_wanted]
def get_income_to_plot_for_scatter(df, income_to_plot):
    """Keep the rows whose ``Statistics`` is in ``income_to_plot``."""
    is_wanted = df["Statistics"].isin(income_to_plot)
    return df.loc[is_wanted]
def subset_plot_data_for_scatter_plot(
        df, year, age, sex, geo,
        income_source, income_to_plot, cols_to_keep):
    """Reduce ``df`` to the rows and columns needed for the scatter plot.

    Steps, in order: keep ``cols_to_keep``; filter by year, age, sex and
    geo; keep rows whose ``Income source`` is in ``income_source``; keep
    rows whose ``Statistics`` is in ``income_to_plot``.
    """
    trimmed = df.loc[:, cols_to_keep]
    selected = subset_year_age_sex_geo(trimmed, year, age, sex, geo)
    from_source = selected[selected["Income source"].isin(income_source)]
    return get_income_to_plot_for_scatter(from_source, income_to_plot)
def subset_for_scatter_plot(df, income_source, income_to_plot, cols_to_keep):
    """Keep ``cols_to_keep`` and the rows for one income source and statistic.

    Rows are kept where ``Income source`` equals ``income_source`` and
    ``Statistics`` equals ``income_to_plot``.
    """
    trimmed = df.loc[:, cols_to_keep]
    from_source = trimmed.loc[trimmed["Income source"] == income_source]
    return from_source.loc[from_source["Statistics"] == income_to_plot]