Skip to content

Commit

Permalink
CATE uplift validation methods (#836)
Browse files Browse the repository at this point in the history
Added functionality to the DRtester validation class to include the AUTOC validation metric, with associated inference methods. Also added the ability to plot uplift curves (both QINI and TOC) and cleaned up handling of multiple treatments.
  • Loading branch information
amarvenu authored Jan 12, 2024
1 parent 7793184 commit 67eef1e
Show file tree
Hide file tree
Showing 5 changed files with 487 additions and 147 deletions.
47 changes: 31 additions & 16 deletions econml/tests/test_drtester.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,15 @@ def test_multi(self):
res = my_dr_tester.evaluate_all(Xval, Xtrain)
res_df = res.summary()

for k in range(3):
if k == 0:
with self.assertRaises(Exception) as exc:
res.plot_cal(k)
self.assertTrue(str(exc.exception) == 'Plotting only supported for treated units (not controls)')
else:
for k in range(4):
if k in [0, 3]:
self.assertRaises(ValueError, res.plot_cal, k)
self.assertRaises(ValueError, res.plot_qini, k)
self.assertRaises(ValueError, res.plot_toc, k)
else: # real treatments, k = 1 or 2
self.assertTrue(res.plot_cal(k) is not None)
self.assertTrue(res.plot_qini(k) is not None)
self.assertTrue(res.plot_toc(k) is not None)

self.assertGreater(res_df.blp_pval.values[0], 0.1) # no heterogeneity
self.assertLess(res_df.blp_pval.values[1], 0.05) # heterogeneity
Expand All @@ -103,6 +105,7 @@ def test_multi(self):
self.assertGreater(res_df.cal_r_squared.values[1], 0) # good R2

self.assertLess(res_df.qini_pval.values[1], res_df.qini_pval.values[0])
self.assertLess(res_df.autoc_pval.values[1], res_df.autoc_pval.values[0])

def test_binary(self):
Xtrain, Dtrain, Ytrain, Xval, Dval, Yval = self._get_data(num_treatments=1)
Expand Down Expand Up @@ -136,17 +139,20 @@ def test_binary(self):
res = my_dr_tester.evaluate_all(Xval, Xtrain)
res_df = res.summary()

for k in range(2):
if k == 0:
with self.assertRaises(Exception) as exc:
res.plot_cal(k)
self.assertTrue(str(exc.exception) == 'Plotting only supported for treated units (not controls)')
else:
for k in range(3):
if k in [0, 2]:
self.assertRaises(ValueError, res.plot_cal, k)
self.assertRaises(ValueError, res.plot_qini, k)
self.assertRaises(ValueError, res.plot_toc, k)
else: # real treatment, k = 1
self.assertTrue(res.plot_cal(k) is not None)
self.assertTrue(res.plot_qini(k) is not None)
self.assertTrue(res.plot_toc(k) is not None)

self.assertLess(res_df.blp_pval.values[0], 0.05) # heterogeneity
self.assertGreater(res_df.cal_r_squared.values[0], 0) # good R2
self.assertLess(res_df.qini_pval.values[0], 0.05) # heterogeneity
self.assertLess(res_df.autoc_pval.values[0], 0.05) # heterogeneity

def test_nuisance_val_fit(self):
Xtrain, Dtrain, Ytrain, Xval, Dval, Yval = self._get_data(num_treatments=1)
Expand Down Expand Up @@ -209,7 +215,7 @@ def test_exceptions(self):
)

# fit nothing
for func in [my_dr_tester.evaluate_blp, my_dr_tester.evaluate_cal, my_dr_tester.evaluate_qini]:
for func in [my_dr_tester.evaluate_blp, my_dr_tester.evaluate_cal, my_dr_tester.evaluate_uplift]:
with self.assertRaises(Exception) as exc:
func()
if func.__name__ == 'evaluate_cal':
Expand All @@ -226,7 +232,7 @@ def test_exceptions(self):
for func in [
my_dr_tester.evaluate_blp,
my_dr_tester.evaluate_cal,
my_dr_tester.evaluate_qini,
my_dr_tester.evaluate_uplift,
my_dr_tester.evaluate_all
]:
with self.assertRaises(Exception) as exc:
Expand All @@ -241,7 +247,7 @@ def test_exceptions(self):

for func in [
my_dr_tester.evaluate_cal,
my_dr_tester.evaluate_qini,
my_dr_tester.evaluate_uplift,
my_dr_tester.evaluate_all
]:
with self.assertRaises(Exception) as exc:
Expand All @@ -252,12 +258,21 @@ def test_exceptions(self):
cal_res = my_dr_tester.evaluate_cal(Xval, Xtrain)
self.assertGreater(cal_res.cal_r_squared[0], 0) # good R2

with self.assertRaises(Exception) as exc:
my_dr_tester.evaluate_uplift(metric='blah')
self.assertTrue(
str(exc.exception) == "Unsupported metric - must be one of ['toc', 'qini']"
)

my_dr_tester = DRtester(
model_regression=reg_y,
model_propensity=reg_t,
cate=cate
).fit_nuisance(
Xval, Dval, Yval, Xtrain, Dtrain, Ytrain
)
qini_res = my_dr_tester.evaluate_qini(Xval, Xtrain)
qini_res = my_dr_tester.evaluate_uplift(Xval, Xtrain)
self.assertLess(qini_res.pvals[0], 0.05)

autoc_res = my_dr_tester.evaluate_uplift(Xval, Xtrain, metric='toc')
self.assertLess(autoc_res.pvals[0], 0.05)
78 changes: 46 additions & 32 deletions econml/validate/drtester.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from statsmodels.api import OLS
from statsmodels.tools import add_constant

from .results import CalibrationEvaluationResults, BLPEvaluationResults, QiniEvaluationResults, EvaluationResults
from .utils import calculate_dr_outcomes, calc_qini_coeff
from .results import CalibrationEvaluationResults, BLPEvaluationResults, UpliftEvaluationResults, EvaluationResults
from .utils import calculate_dr_outcomes, calc_uplift


class DRtester:
Expand Down Expand Up @@ -382,7 +382,7 @@ def evaluate_cal(
self.get_cate_preds(Xval, Xtrain)

cal_r_squared = np.zeros(self.n_treat)
df_plot = pd.DataFrame()
plot_data_dict = dict()
for k in range(self.n_treat):
cuts = np.quantile(self.cate_preds_train_[:, k], np.linspace(0, 1, n_groups + 1))
probs = np.zeros(n_groups)
Expand All @@ -409,15 +409,19 @@ def evaluate_cal(
# Calculate R-square calibration score
cal_r_squared[k] = 1 - (cal_score_g / cal_score_o)

df_plot1 = pd.DataFrame({'ind': np.array(range(n_groups)),
'gate': gate, 'se_gate': se_gate,
'g_cate': g_cate, 'se_g_cate': se_g_cate})
df_plot1['tmt'] = self.treatments[k + 1]
df_plot = pd.concat((df_plot, df_plot1))
df_plot = pd.DataFrame({
'ind': np.array(range(n_groups)),
'gate': gate,
'se_gate': se_gate,
'g_cate': g_cate,
'se_g_cate': se_g_cate
})

plot_data_dict[self.treatments[k + 1]] = df_plot

self.cal_res = CalibrationEvaluationResults(
cal_r_squared=cal_r_squared,
df_plot=df_plot,
plot_data_dict=plot_data_dict,
treatments=self.treatments
)

Expand Down Expand Up @@ -480,12 +484,13 @@ def evaluate_blp(

return self.blp_res

def evaluate_qini(
def evaluate_uplift(
self,
Xval: np.array = None,
Xtrain: np.array = None,
percentiles: np.array = np.linspace(5, 95, 50)
) -> QiniEvaluationResults:
percentiles: np.array = np.linspace(5, 95, 50),
metric: str = 'qini'
) -> UpliftEvaluationResults:
"""
Calculates QINI coefficient for the given model as in Radcliffe (2007), where units are ordered by predicted
CATE values and a running measure of the average treatment effect in each cohort is kept as we progress
Expand All @@ -505,10 +510,12 @@ def evaluate_qini(
percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)``
Array of percentiles over which the uplift curve should be constructed. Defaults to 50 evenly spaced
percentiles between 5% and 95%.
metric: string, default 'qini'
Which type of uplift curve to evaluate. Must be one of ['toc', 'qini']
Returns
-------
QiniEvaluationResults object showing the results of the QINI fit
UpliftEvaluationResults object showing the fitted results
"""
if not hasattr(self, 'dr_val_'):
raise Exception("Must fit nuisances before evaluating")
Expand All @@ -518,39 +525,44 @@ def evaluate_qini(
raise Exception('CATE predictions not yet calculated - must provide both Xval, Xtrain')
self.get_cate_preds(Xval, Xtrain)

curve_data_dict = dict()
if self.n_treat == 1:
qini, qini_err = calc_qini_coeff(
coeff, err, curve_df = calc_uplift(
self.cate_preds_train_,
self.cate_preds_val_,
self.dr_val_,
percentiles
percentiles,
metric
)
qinis = [qini]
errs = [qini_err]
coeffs = [coeff]
errs = [err]
curve_data_dict[self.treatments[1]] = curve_df
else:
qinis = []
coeffs = []
errs = []
for k in range(self.n_treat):
qini, qini_err = calc_qini_coeff(
coeff, err, curve_df = calc_uplift(
self.cate_preds_train_[:, k],
self.cate_preds_val_[:, k],
self.dr_val_[:, k],
percentiles
percentiles,
metric
)
coeffs.append(coeff)
errs.append(err)
curve_data_dict[self.treatments[k + 1]] = curve_df

qinis.append(qini)
errs.append(qini_err)

pvals = [st.norm.sf(abs(q / e)) for q, e in zip(qinis, errs)]
pvals = [st.norm.sf(abs(q / e)) for q, e in zip(coeffs, errs)]

self.qini_res = QiniEvaluationResults(
params=qinis,
self.uplift_res = UpliftEvaluationResults(
params=coeffs,
errs=errs,
pvals=pvals,
treatments=self.treatments
treatments=self.treatments,
curve_data_dict=curve_data_dict
)

return self.qini_res
return self.uplift_res

def evaluate_all(
self,
Expand All @@ -559,8 +571,8 @@ def evaluate_all(
n_groups: int = 4
) -> EvaluationResults:
"""
Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal') and QINI coefficient
(`evaluate_qini') methods.
Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal'), and uplift curve
(`evaluate_uplift', for both the QINI and TOC metrics) methods.
Parameters
----------
Expand All @@ -583,12 +595,14 @@ def evaluate_all(

blp_res = self.evaluate_blp()
cal_res = self.evaluate_cal(n_groups=n_groups)
qini_res = self.evaluate_qini()
qini_res = self.evaluate_uplift(metric='qini')
toc_res = self.evaluate_uplift(metric='toc')

self.res = EvaluationResults(
blp_res=blp_res,
cal_res=cal_res,
qini_res=qini_res
qini_res=qini_res,
toc_res=toc_res
)

return self.res
Loading

0 comments on commit 67eef1e

Please sign in to comment.