to ensure backwards compat, put back lower and upper deciles
0xfdf committed Aug 11, 2024
1 parent 470ff25 commit c198715
Showing 2 changed files with 53 additions and 14 deletions.
35 changes: 25 additions & 10 deletions toraniko/model.py
@@ -109,6 +109,8 @@ def estimate_factor_returns(
mkt_cap_col: str = "market_cap",
symbol_col: str = "symbol",
date_col: str = "date",
+ mkt_factor_col: str = "market",
+ res_ret_col: str = "res_asset_returns",
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Estimate factor and residual returns across all time periods using input asset factor scores.
@@ -130,6 +132,8 @@ def estimate_factor_returns(
mkt_cap_col: str name of the column we expect to find market cap values in, defaults to "market_cap"
symbol_col: str name of the column we expect to find symbol names in, defaults to "symbol"
date_col: str name of the column we expect to find time periods in, defaults to "date"
+ mkt_factor_col: str name to use for the column containing returned market factor, defaults to "market"
+ res_ret_col: str name to use for the column containing asset residual returns, defaults to "res_asset_returns"
Returns
-------
@@ -158,11 +162,11 @@ def estimate_factor_returns(
.join(sector_df, on=[date_col, symbol_col])
.join(style_df, on=[date_col, symbol_col])
)
# split the conditional winsorization branch into two functions, so we don't have a conditional
# needlessly evaluated on each iteration of the `.map_groups`
if winsor_factor is not None:

def _estimate_factor_returns(data):
""""""
- dt = data[date_col].head(1).item()
r = winsorize(data[asset_returns_col].to_numpy())
fac, eps = factor_returns_cs(
r,
@@ -171,28 +175,39 @@ def _estimate_factor_returns(data):
data.select(styles).to_numpy(),
residualize_styles,
)
- return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
- pl.lit(dt).alias("date")
+ return (
+ # reshape so we get a row vector instead of a column vector for the DataFrame
+ pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+ # add back the time period group to disambiguate
+ .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+ pl.lit(eps.tolist()).alias(res_ret_col),
+ pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+ )
+ )

else:

def _estimate_factor_returns(data):
""""""
- dt = data[date_col].head(1).item()
fac, eps = factor_returns_cs(
data[asset_returns_col].to_numpy(),
data[mkt_cap_col].to_numpy(),
data.select(sectors).to_numpy(),
data.select(styles).to_numpy(),
residualize_styles,
)
- return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
- pl.lit(dt).alias("date")
+ return (
+ # reshape so we get a row vector instead of a column vector for the DataFrame
+ pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+ # add back the time period group to disambiguate
+ .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+ pl.lit(eps.tolist()).alias(res_ret_col),
+ pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+ )
+ )

- # eps_df = pl.DataFrame(residuals).with_columns(pl.Series(dates).alias(date_col))
- return returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+ fac_df = returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+ eps_df = fac_df[[date_col, symbol_col, res_ret_col]].explode([symbol_col, res_ret_col])
+ return fac_df.drop([symbol_col, res_ret_col]), eps_df
except AttributeError as e:
raise TypeError(
"`returns_df` and `mkt_cap_df` must be Polars DataFrames, but there are missing attributes"
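The reworked return path above packs each date's symbols and residual returns into list columns on the per-date factor rows, then explodes them into a long residual frame before returning the (fac_df, eps_df) tuple. A minimal sketch of that post-processing step, on a toy two-date frame whose tickers and numbers are invented for illustration (only the column names follow the new defaults):

    import datetime
    import polars as pl

    # Toy stand-in for the grouped per-date output: one row per date, holding the market factor
    # return plus list columns with that date's symbols and residual asset returns.
    fac_df = pl.DataFrame(
        {
            "date": [datetime.date(2024, 8, 9), datetime.date(2024, 8, 12)],
            "market": [0.0012, -0.0008],
            "symbol": [["AAA", "BBB"], ["AAA", "BBB"]],
            "res_asset_returns": [[0.003, -0.001], [0.002, 0.0005]],
        }
    )

    # Same post-processing as the new return statement: split the residuals into a long frame
    # keyed by date and symbol, then drop the list columns from the factor frame.
    eps_df = fac_df[["date", "symbol", "res_asset_returns"]].explode(["symbol", "res_asset_returns"])
    fac_df = fac_df.drop(["symbol", "res_asset_returns"])

    print(fac_df)  # | date | market |
    print(eps_df)  # | date | symbol | res_asset_returns |  (one row per date-symbol pair)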
32 changes: 28 additions & 4 deletions toraniko/styles.py
@@ -1,15 +1,20 @@
"""Style factor implementations."""

+ import logging

import numpy as np
import polars as pl
import polars.exceptions as pl_exc

from toraniko.math import (
exp_weights,
center_xsection,
+ percentiles_xsection,
winsorize_xsection,
)

+ logger = logging.getLogger(__name__)

###
# NB: These functions do not try to handle NaN or null resilience for you, nor do they make allowances
# for data having pathological distributions. Garbage in, garbage out. You need to inspect your data
@@ -49,7 +54,7 @@ def factor_mom(
Parameters
----------
- returns_df: Polars DataFrame containing columns: | date | symbol | asset_returns |
+ returns_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `asset_returns_col` |
trailing_days: int look back period over which to measure momentum
half_life: int decay rate for exponential weighting, in days
lag: int number of days to lag the current day's return observation (20 trading days is one month)
@@ -100,6 +105,8 @@ def weighted_cumprod(values: np.ndarray) -> float:

def factor_sze(
mkt_cap_df: pl.DataFrame | pl.LazyFrame,
+ lower_decile: float | None = None,
+ upper_decile: float | None = None,
center: bool = True,
standardize: bool = True,
mkt_cap_col: str = "market_cap",
@@ -113,11 +120,17 @@ def factor_sze(
to their rough order of magnitude, which is the salient feature we care most about. Given that we want to capture
the risk premium of smaller factors over larger ones, we also multiply by -1.
+ You may also optionally implement Fama-French-like "hi - lo" behavior using the `lower_decile` and `upper_decile`
+ arguments. If you pass e.g. `lower_decile=0.3` and `upper_decile=0.7`, only values less than the 30th percentile
+ and greater than the 70th percentile will be considered for the factor. This is supported for backwards
+ compatibility, but not recommended.
In practice, you should center and standardize your factor scores unless you have a very good reason not to.
Parameters
----------
- mkt_cap_df: Polars DataFrame containing columns: | date | symbol | market_cap |
+ mkt_cap_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `mkt_cap_col` |
+ lower_decile: float value
center: boolean indicating whether to center the final size scores before returning
standardize: boolean indicating whether to standardize the final size scores after centering
mkt_cap_col: str name of the column we expect to find the market cap values in, defaults to "market_cap"
@@ -131,6 +144,17 @@ def factor_sze(
"""
try:
df = mkt_cap_df.lazy().with_columns(pl.col(mkt_cap_col).log().alias(score_col))
+ if lower_decile is not None and upper_decile is not None:
+ df = df.with_columns(
+ percentiles_xsection(
+ score_col, date_col, lower_pct=lower_decile, upper_pct=upper_decile, fill_val=0.0
+ ).alias(score_col)
+ )
+ if (lower_decile is not None and upper_decile is None) or (lower_decile is None and upper_decile is not None):
+ logger.warning(
+ "`lower_decile` and `upper_decile` must both be float values to apply cross-sectional percentile limits, "
+ "but one is None. Skipping cross-sectional percentile limiting; please review arguments"
+ )
if center:
df = df.with_columns((center_xsection(score_col, date_col, standardize=standardize)).alias(score_col) * -1)
return df.select(date_col, symbol_col, score_col)
@@ -165,7 +189,7 @@ def factor_val(
Parameters
----------
- value_df: Polars DataFrame containing columns: | date | symbol | book_price | sales_price | cf_price
+ value_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `bp_col` | `sp_col` | `cf_col`
winsor_factor: optional float indicating what percentile to symmetrically winsorize features at, if desired
center: boolean indicating whether to center the final value scores before returning
standardize: boolean indicating whether to standardize the final value scores after centering
@@ -204,7 +228,7 @@ def factor_val(
)
)
if center:
- df = df.with_columns(center_xsection(score_col, date_col, standardize=True).alias(score_col))
+ df = df.with_columns(center_xsection(score_col, date_col, standardize=standardize).alias(score_col))
return df.select(
date_col,
symbol_col,
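A hedged usage sketch for the restored decile arguments in factor_sze: the market cap frame below is invented for illustration, and the column names simply follow the function's documented defaults. Because the function builds its result from mkt_cap_df.lazy(), the sketch collects the returned frame; drop the .collect() if your version returns an eager DataFrame.

    import datetime
    import polars as pl
    from toraniko.styles import factor_sze

    # Toy single-date cross-section; column names follow factor_sze's defaults.
    mkt_cap_df = pl.DataFrame(
        {
            "date": [datetime.date(2024, 8, 9)] * 5,
            "symbol": ["AAA", "BBB", "CCC", "DDD", "EEE"],
            "market_cap": [1e9, 5e9, 2e10, 1e11, 5e11],
        }
    )

    # Fama-French-like "hi - lo" usage restored by this commit: only log market caps below the
    # 30th or above the 70th cross-sectional percentile are kept; the middle of the distribution
    # is filled with 0.0 before the usual centering and standardization.
    scores = factor_sze(mkt_cap_df, lower_decile=0.3, upper_decile=0.7)
    print(scores.collect())

    # Passing only one of the two bounds skips the percentile limiting and logs a warning instead.
    _ = factor_sze(mkt_cap_df, lower_decile=0.3)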
