diff --git a/toraniko/model.py b/toraniko/model.py
index 72c1f2e..c612827 100644
--- a/toraniko/model.py
+++ b/toraniko/model.py
@@ -109,6 +109,8 @@ def estimate_factor_returns(
     mkt_cap_col: str = "market_cap",
     symbol_col: str = "symbol",
     date_col: str = "date",
+    mkt_factor_col: str = "market",
+    res_ret_col: str = "res_asset_returns",
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
     """Estimate factor and residual returns across all time periods using input asset factor scores.
 
@@ -130,6 +132,8 @@
     mkt_cap_col: str name of the column we expect to find market cap values in, defaults to "market_cap"
     symbol_col: str name of the column we expect to find symbol names in, defaults to "symbol"
     date_col: str name of the column we expect to find time periods in, defaults to "date"
+    mkt_factor_col: str name to use for the column containing the returned market factor, defaults to "market"
+    res_ret_col: str name to use for the column containing asset residual returns, defaults to "res_asset_returns"
 
     Returns
     -------
@@ -158,11 +162,11 @@
             .join(sector_df, on=[date_col, symbol_col])
             .join(style_df, on=[date_col, symbol_col])
         )
+        # split the conditional winsorization branch into two functions, so the winsorization check
+        # isn't needlessly re-evaluated on every iteration of `.map_groups`
        if winsor_factor is not None:
 
             def _estimate_factor_returns(data):
-                """"""
-                dt = data[date_col].head(1).item()
                 r = winsorize(data[asset_returns_col].to_numpy())
                 fac, eps = factor_returns_cs(
                     r,
@@ -171,15 +175,19 @@ def _estimate_factor_returns(data):
                     data.select(styles).to_numpy(),
                     residualize_styles,
                 )
-                return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
-                    pl.lit(dt).alias("date")
+                return (
+                    # reshape so we get a row vector instead of a column vector for the DataFrame
+                    pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+                    # add back the time period group to disambiguate
+                    .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+                        pl.lit(eps.tolist()).alias(res_ret_col),
+                        pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+                    )
                 )
 
         else:
 
             def _estimate_factor_returns(data):
-                """"""
-                dt = data[date_col].head(1).item()
                 fac, eps = factor_returns_cs(
                     data[asset_returns_col].to_numpy(),
                     data[mkt_cap_col].to_numpy(),
@@ -187,12 +195,19 @@ def _estimate_factor_returns(data):
                     data.select(styles).to_numpy(),
                     residualize_styles,
                 )
-                return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
-                    pl.lit(dt).alias("date")
+                return (
+                    # reshape so we get a row vector instead of a column vector for the DataFrame
+                    pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+                    # add back the time period group to disambiguate
+                    .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+                        pl.lit(eps.tolist()).alias(res_ret_col),
+                        pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+                    )
                 )
 
-        # eps_df = pl.DataFrame(residuals).with_columns(pl.Series(dates).alias(date_col))
-        return returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+        fac_df = returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+        eps_df = fac_df[[date_col, symbol_col, res_ret_col]].explode([symbol_col, res_ret_col])
+        return fac_df.drop([symbol_col, res_ret_col]), eps_df
     except AttributeError as e:
         raise TypeError(
             "`returns_df` and `mkt_cap_df` must be Polars DataFrames, but there are missing attributes"
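The restructured return path packs each date's residuals and symbols into list columns on the factor frame, then unpacks them with `explode` into a long per-asset frame. A minimal, self-contained sketch of that pattern, using hand-built stand-in data rather than real `map_groups` output:

```python
from datetime import date

import polars as pl

# Stand-in for the frame produced by `.group_by(date_col).map_groups(_estimate_factor_returns)`:
# one row per date, with per-asset residuals and symbols packed into list columns.
fac_df = pl.DataFrame(
    {
        "date": [date(2024, 1, 2), date(2024, 1, 3)],
        "market": [0.004, -0.002],
        "res_asset_returns": [[0.010, -0.008], [0.002, -0.003]],
        "symbol": [["AAPL", "MSFT"], ["AAPL", "MSFT"]],
    }
)

# The same unpacking the patch performs before returning the (factor, residual) tuple:
eps_df = fac_df[["date", "symbol", "res_asset_returns"]].explode(["symbol", "res_asset_returns"])
fac_df = fac_df.drop(["symbol", "res_asset_returns"])
# fac_df: | date | market |     eps_df: | date | symbol | res_asset_returns |
```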
attributes" diff --git a/toraniko/styles.py b/toraniko/styles.py index bce8130..db6e56e 100644 --- a/toraniko/styles.py +++ b/toraniko/styles.py @@ -1,5 +1,7 @@ """Style factor implementations.""" +import logging + import numpy as np import polars as pl import polars.exceptions as pl_exc @@ -7,9 +9,12 @@ from toraniko.math import ( exp_weights, center_xsection, + percentiles_xsection, winsorize_xsection, ) +logger = logging.getLogger(__name__) + ### # NB: These functions do not try to handle NaN or null resilience for you, nor do they make allowances # for data having pathological distributions. Garbage in, garbage out. You need to inspect your data @@ -49,7 +54,7 @@ def factor_mom( Parameters ---------- - returns_df: Polars DataFrame containing columns: | date | symbol | asset_returns | + returns_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `asset_returns_col` | trailing_days: int look back period over which to measure momentum half_life: int decay rate for exponential weighting, in days lag: int number of days to lag the current day's return observation (20 trading days is one month) @@ -100,6 +105,8 @@ def weighted_cumprod(values: np.ndarray) -> float: def factor_sze( mkt_cap_df: pl.DataFrame | pl.LazyFrame, + lower_decile: float | None = None, + upper_decile: float | None = None, center: bool = True, standardize: bool = True, mkt_cap_col: str = "market_cap", @@ -113,11 +120,17 @@ def factor_sze( to their rough order of magnitude, which is the salient feature we care most about. Given that we want to capture the risk premium of smaller factors over larger ones, we also multiply by -1. + You may also optionally implement Fama-French-like "hi - lo" behavior using the `lower_decile` and `upper_decile` + arguments. If you pass e.g. `lower_decile=0.3` and `upper_decile=0.7`, only values less than the 30th percentile + and greater than the 70th percentile will be considered for the factor. This is supported for backwards + compatibility, but not recommended. + In practice, you should center and standardize your factor scores unless you have a very good reason not to. Parameters ---------- - mkt_cap_df: Polars DataFrame containing columns: | date | symbol | market_cap | + mkt_cap_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `mkt_cap_col` | + lower_decile: float value center: boolean indicating whether to center the final size scores before returning standardize: boolean indicating whether to standardize the final size scores after centering mkt_cap_col: str name of the column we expect to find the market cap values in, defaults to "market_cap" @@ -131,6 +144,17 @@ def factor_sze( """ try: df = mkt_cap_df.lazy().with_columns(pl.col(mkt_cap_col).log().alias(score_col)) + if lower_decile is not None and upper_decile is not None: + df = df.with_columns( + percentiles_xsection( + score_col, date_col, lower_pct=lower_decile, upper_pct=upper_decile, fill_val=0.0 + ).alias(score_col) + ) + if (lower_decile is not None and upper_decile is None) or (lower_decile is None and upper_decile is not None): + logger.warning( + "`lower_decile` and `upper_decile` must both be float values to apply cross-sectional percentile limits, " + "but one is None. 
@@ -165,7 +189,7 @@ def factor_val(
     Parameters
     ----------
-    value_df: Polars DataFrame containing columns: | date | symbol | book_price | sales_price | cf_price
+    value_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `bp_col` | `sp_col` | `cf_col`
     winsor_factor: optional float indicating what percentile to symmetrically winsorize features at, if desired
     center: boolean indicating whether to center the final value scores before returning
     standardize: boolean indicating whether to standardize the final value scores after centering
@@ -204,7 +228,7 @@
             )
         )
         if center:
-            df = df.with_columns(center_xsection(score_col, date_col, standardize=True).alias(score_col))
+            df = df.with_columns(center_xsection(score_col, date_col, standardize=standardize).alias(score_col))
         return df.select(
             date_col,
             symbol_col,
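The last hunk fixes `factor_val` ignoring its `standardize` argument: `center_xsection` was called with a hardcoded `standardize=True`. A sketch of the now-respected flag, with illustrative data; it assumes the default `book_price` / `sales_price` / `cf_price` column names implied by the original docstring and a `LazyFrame` return:

```python
from datetime import date

import polars as pl

from toraniko.styles import factor_val

value_df = pl.DataFrame(
    {
        "date": [date(2024, 1, 2)] * 3,
        "symbol": ["AAPL", "MSFT", "XOM"],
        "book_price": [0.02, 0.03, 0.45],
        "sales_price": [0.06, 0.08, 0.90],
        "cf_price": [0.04, 0.05, 0.30],
    }
)

# Before this patch, standardize=False was silently ignored and scores were scaled
# to unit variance anyway; now they are centered per date but keep their raw scale.
val_df = factor_val(value_df, center=True, standardize=False).collect()
```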