Adds sequential Pareto-lognormal estimation
john-p-ryan committed Feb 20, 2024
1 parent 14417ca commit 4768e5f
Showing 1 changed file with 37 additions and 2 deletions.
39 changes: 37 additions & 2 deletions iot/inverse_optimal_tax.py
@@ -136,7 +136,7 @@ def compute_mtr_dist(
)
binned_data = pd.DataFrame(
data[["mtr", income_measure, "z_bin", weight_var]]
.groupby(["z_bin"])
.groupby(["z_bin"], observed=False)
.apply(lambda x: wm(x[["mtr", income_measure]], x[weight_var]))
)
# make column 0 into two columns
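
In the hunk above, passing observed=False keeps every z_bin category in the grouped output (including empty bins) and silences the pandas FutureWarning about the default changing to observed=True. A tiny standalone illustration with made-up data, not the package's own weighted-mean helper:

import pandas as pd

# Made-up data: three observations binned into three categories, one of them empty.
df = pd.DataFrame({"z": [1.0, 2.0, 9.0]})
df["z_bin"] = pd.cut(df["z"], bins=[0, 5, 10, 15])  # the (10, 15] bin has no observations

print(df.groupby("z_bin", observed=False)["z"].count())  # 3 rows; the empty bin is kept with count 0
print(df.groupby("z_bin", observed=True)["z"].count())   # 2 rows; the empty bin is dropped
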
@@ -228,7 +228,7 @@ def compute_income_dist(
/ (z_line**2 * sigma**3 * np.sqrt(2 * np.pi))
)
)
elif dist_type == "log_normal_pareto":
elif dist_type == "log_normal_pareto_mle":
# lognormal before cutoff, pareto after
def lognormal_pareto_pdf(x, mu, sigma, b, cutoff):
# for continuity at cutoff
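
The body of lognormal_pareto_pdf is truncated by the hunk boundary above. Purely as orientation, here is a sketch of a spliced density that is consistent with the continuity constant c = lognorm.pdf(cutoff, ...) * cutoff / b and the renormalizing integral that appear later in the diff; it is an illustration, not the file's hidden code:

import numpy as np
import scipy.integrate
import scipy.stats as st

# Illustrative parameter values only; nothing here comes from the package.
mu, sigma, b, cutoff = 10.5, 0.8, 2.0, 150_000.0

# Pareto scale chosen so the two pieces meet at the cutoff:
# lognorm.pdf(cutoff) = c * b / cutoff  =>  c = lognorm.pdf(cutoff) * cutoff / b
c = st.lognorm.pdf(cutoff, sigma, scale=np.exp(mu)) * cutoff / b
# Unnormalized mass: lognormal mass below the cutoff plus the Pareto tail mass, which equals c.
integral = st.lognorm.cdf(cutoff, sigma, scale=np.exp(mu)) + c

def spliced_pdf(x):
    if x < cutoff:
        return st.lognorm.pdf(x, sigma, scale=np.exp(mu)) / integral
    return c * b * cutoff**b / x ** (b + 1) / integral

print(spliced_pdf(cutoff * (1 - 1e-9)), spliced_pdf(cutoff))  # equal: continuous at the cutoff
mass = (scipy.integrate.quad(spliced_pdf, 0, cutoff)[0]
        + scipy.integrate.quad(spliced_pdf, cutoff, np.inf)[0])
print(mass)  # ~1.0: a proper density after dividing by `integral`
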
@@ -276,6 +276,11 @@ def weighted_log_likelihood(params):
mu, sigma, b = scipy.optimize.minimize(weighted_log_likelihood, params_guess, method='Nelder-Mead').x
cutoff = pareto_cutoff

# assert that mu, sigma, b, and cutoff are all positive
assert mu > 0 and sigma > 0 and b > 0 and cutoff > 0
# assert that sigma, b, and cutoff are not unreasonably large
assert sigma < 25 and b < 25 and cutoff < 1_000_000

c = st.lognorm.pdf(cutoff, sigma, scale=np.exp(mu)) * cutoff / b
integral = st.lognorm.cdf(cutoff, sigma, scale=np.exp(mu)) + c
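
The new assertions reject degenerate Nelder-Mead solutions (non-positive or runaway parameter values) before the estimates are used to build the density, CDF, and density derivative. A self-contained sketch of the kind of weighted fit being checked, on synthetic data and with made-up starting values:

import numpy as np
import scipy.optimize
import scipy.stats as st

rng = np.random.default_rng(0)
z = np.exp(rng.normal(10.5, 0.8, size=5_000))  # synthetic incomes
w = np.ones_like(z)                            # uniform weights for simplicity
cutoff = 150_000.0

def neg_weighted_loglik(params):
    mu, sigma, b = params
    if sigma <= 0 or b <= 0 or sigma > 25 or b > 25:  # mirror the sanity bounds above
        return np.inf
    c = st.lognorm.pdf(cutoff, sigma, scale=np.exp(mu)) * cutoff / b
    integral = st.lognorm.cdf(cutoff, sigma, scale=np.exp(mu)) + c
    pdf = np.where(
        z < cutoff,
        st.lognorm.pdf(z, sigma, scale=np.exp(mu)),
        c * b * cutoff**b / z ** (b + 1),
    ) / integral
    return -np.sum(w * np.log(pdf))

mu, sigma, b = scipy.optimize.minimize(
    neg_weighted_loglik, x0=(10.0, 1.0, 2.0), method="Nelder-Mead"
).x
assert mu > 0 and 0 < sigma < 25 and 0 < b < 25  # same spirit as the asserts in the diff
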

@@ -289,6 +294,36 @@ def weighted_log_likelihood(params):
/ (z_line**2 * sigma**3 * np.sqrt(2 * np.pi))
),
(-c * b * (b+1) * cutoff**b / z_line**(b+2))) / integral
elif dist_type == "log_normal_pareto_sequential":
# requires a user-supplied pareto_cutoff
cutoff = pareto_cutoff
data_pareto = data[data[income_measure] > pareto_cutoff]
b = sum(data_pareto[weight_var]) / sum(data_pareto[weight_var] * np.log(data_pareto[income_measure] / pareto_cutoff))
# weighted lognormal parameters estimated from the full sample
mu = (
np.log(data[income_measure]) * data[weight_var]
).sum() / data[weight_var].sum()
sigma = np.sqrt((
(
((np.log(data[income_measure]) - mu) ** 2)
* data[weight_var]
).values
/ data[weight_var].sum()
).sum())

c = st.lognorm.pdf(cutoff, sigma, scale=np.exp(mu)) * cutoff / b
# for renormalization, derived analytically
integral = st.lognorm.cdf(cutoff, sigma, scale=np.exp(mu)) + c
f = np.where(z_line < cutoff, st.lognorm.pdf(z_line, sigma, scale=np.exp(mu)), c*b*cutoff**b / z_line**(b+1)) / integral
F = np.where(z_line <= cutoff, st.lognorm.cdf(z_line, sigma, scale=np.exp(mu)),
st.lognorm.cdf(cutoff, sigma, scale=np.exp(mu)) + c*(1 - (cutoff / z_line)**b)) / integral
f_prime = np.where(z_line <= cutoff, -1 *
np.exp(-((np.log(z_line) - mu) ** 2) / (2 * sigma**2))
* (
(np.log(z_line) + sigma**2 - mu)
/ (z_line**2 * sigma**3 * np.sqrt(2 * np.pi))
),
(-c * b * (b+1) * cutoff**b / z_line**(b+2))) / integral
elif dist_type == "kde":
# uses the original full data for kde estimation
f_function = st.gaussian_kde(
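
The new log_normal_pareto_sequential branch above sidesteps the joint optimization: it estimates the Pareto shape b with a weighted Hill / maximum-likelihood estimator on observations above pareto_cutoff, takes mu and sigma as the weighted mean and standard deviation of log income over the full sample, and then splices the two pieces with the same continuity constant c and renormalizing integral as the MLE branch. A standalone sketch of those two estimators on synthetic data (column names and values are made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame({
    "income": np.exp(rng.normal(10.5, 0.8, size=10_000)),  # synthetic incomes
    "weight": rng.uniform(0.5, 1.5, size=10_000),           # synthetic sample weights
})
pareto_cutoff = 150_000.0

# Weighted Hill / Pareto MLE of the tail shape, from observations above the cutoff.
tail = data[data["income"] > pareto_cutoff]
b = tail["weight"].sum() / (tail["weight"] * np.log(tail["income"] / pareto_cutoff)).sum()

# Weighted mean and standard deviation of log income over the full sample.
log_z = np.log(data["income"])
mu = (log_z * data["weight"]).sum() / data["weight"].sum()
sigma = np.sqrt((((log_z - mu) ** 2) * data["weight"]).sum() / data["weight"].sum())

print(b, mu, sigma)  # Pareto tail shape, lognormal location and scale

Because b and (mu, sigma) have closed forms, this branch avoids the Nelder-Mead step entirely; continuity at the cutoff is still imposed afterwards through c, just as in the MLE branch.
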
