to ensure backwards compat, put back lower and upper deciles
0xfdf committed Aug 11, 2024
1 parent 470ff25 commit c198715
Showing 2 changed files with 53 additions and 14 deletions.
35 changes: 25 additions & 10 deletions toraniko/model.py
@@ -109,6 +109,8 @@ def estimate_factor_returns(
mkt_cap_col: str = "market_cap",
symbol_col: str = "symbol",
date_col: str = "date",
+ mkt_factor_col: str = "market",
+ res_ret_col: str = "res_asset_returns",
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Estimate factor and residual returns across all time periods using input asset factor scores.
@@ -130,6 +132,8 @@ def estimate_factor_returns(
mkt_cap_col: str name of the column we expect to find market cap values in, defaults to "market_cap"
symbol_col: str name of the column we expect to find symbol names in, defaults to "symbol"
date_col: str name of the column we expect to find time periods in, defaults to "date"
+ mkt_factor_col: str name to use for the column containing returned market factor, defaults to "market"
+ res_ret_col: str name to use for the column containing asset residual returns, defaults to "res_asset_returns"
Returns
-------
@@ -158,11 +162,11 @@ def estimate_factor_returns(
.join(sector_df, on=[date_col, symbol_col])
.join(style_df, on=[date_col, symbol_col])
)
# split the conditional winsorization branch into two functions, so we don't have a conditional
# needlessly evaluated on each iteration of the `.map_groups`
if winsor_factor is not None:

def _estimate_factor_returns(data):
""""""
- dt = data[date_col].head(1).item()
r = winsorize(data[asset_returns_col].to_numpy())
fac, eps = factor_returns_cs(
r,
@@ -171,28 +175,39 @@ def _estimate_factor_returns(data):
data.select(styles).to_numpy(),
residualize_styles,
)
- return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
- pl.lit(dt).alias("date")
+ return (
+ # reshape so we get a row vector instead of a column vector for the DataFrame
+ pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+ # add back the time period group to disambiguate
+ .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+ pl.lit(eps.tolist()).alias(res_ret_col),
+ pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+ )
+ )

else:

def _estimate_factor_returns(data):
""""""
- dt = data[date_col].head(1).item()
fac, eps = factor_returns_cs(
data[asset_returns_col].to_numpy(),
data[mkt_cap_col].to_numpy(),
data.select(sectors).to_numpy(),
data.select(styles).to_numpy(),
residualize_styles,
)
- return pl.DataFrame(fac.reshape(1, -1), schema=["market"] + sectors + styles).with_columns(
- pl.lit(dt).alias("date")
+ return (
+ # reshape so we get a row vector instead of a column vector for the DataFrame
+ pl.DataFrame(fac.reshape(1, -1), schema=[mkt_factor_col] + sectors + styles)
+ # add back the time period group to disambiguate
+ .with_columns(pl.lit(data[date_col].head(1).item()).cast(pl.Date).alias(date_col)).with_columns(
+ pl.lit(eps.tolist()).alias(res_ret_col),
+ pl.lit(data[symbol_col].to_list()).alias(symbol_col),
+ )
+ )

- # eps_df = pl.DataFrame(residuals).with_columns(pl.Series(dates).alias(date_col))
- return returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+ fac_df = returns_df.group_by(date_col).map_groups(_estimate_factor_returns)
+ eps_df = fac_df[[date_col, symbol_col, res_ret_col]].explode([symbol_col, res_ret_col])
+ return fac_df.drop([symbol_col, res_ret_col]), eps_df
except AttributeError as e:
raise TypeError(
"`returns_df` and `mkt_cap_df` must be Polars DataFrames, but there are missing attributes"
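The reworked return path above packs each date's symbols and residual returns into list columns on the per-date factor rows, then explodes them into a long residual frame before returning the (fac_df, eps_df) tuple. A minimal sketch of that post-processing step, on a toy two-date frame whose tickers and numbers are invented for illustration (only the column names follow the new defaults):

    import datetime
    import polars as pl

    # Toy stand-in for the grouped per-date output: one row per date, holding the market factor
    # return plus list columns with that date's symbols and residual asset returns.
    fac_df = pl.DataFrame(
        {
            "date": [datetime.date(2024, 8, 9), datetime.date(2024, 8, 12)],
            "market": [0.0012, -0.0008],
            "symbol": [["AAA", "BBB"], ["AAA", "BBB"]],
            "res_asset_returns": [[0.003, -0.001], [0.002, 0.0005]],
        }
    )

    # Same post-processing as the new return statement: split the residuals into a long frame
    # keyed by date and symbol, then drop the list columns from the factor frame.
    eps_df = fac_df[["date", "symbol", "res_asset_returns"]].explode(["symbol", "res_asset_returns"])
    fac_df = fac_df.drop(["symbol", "res_asset_returns"])

    print(fac_df)  # | date | market |
    print(eps_df)  # | date | symbol | res_asset_returns |  (one row per date-symbol pair)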
32 changes: 28 additions & 4 deletions toraniko/styles.py
@@ -1,15 +1,20 @@
"""Style factor implementations."""

+ import logging

import numpy as np
import polars as pl
import polars.exceptions as pl_exc

from toraniko.math import (
exp_weights,
center_xsection,
+ percentiles_xsection,
winsorize_xsection,
)

+ logger = logging.getLogger(__name__)

###
# NB: These functions do not try to handle NaN or null resilience for you, nor do they make allowances
# for data having pathological distributions. Garbage in, garbage out. You need to inspect your data
@@ -49,7 +54,7 @@ def factor_mom(
Parameters
----------
- returns_df: Polars DataFrame containing columns: | date | symbol | asset_returns |
+ returns_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `asset_returns_col` |
trailing_days: int look back period over which to measure momentum
half_life: int decay rate for exponential weighting, in days
lag: int number of days to lag the current day's return observation (20 trading days is one month)
@@ -100,6 +105,8 @@ def weighted_cumprod(values: np.ndarray) -> float:

def factor_sze(
mkt_cap_df: pl.DataFrame | pl.LazyFrame,
+ lower_decile: float | None = None,
+ upper_decile: float | None = None,
center: bool = True,
standardize: bool = True,
mkt_cap_col: str = "market_cap",
@@ -113,11 +120,17 @@ def factor_sze(
to their rough order of magnitude, which is the salient feature we care most about. Given that we want to capture
the risk premium of smaller factors over larger ones, we also multiply by -1.
+ You may also optionally implement Fama-French-like "hi - lo" behavior using the `lower_decile` and `upper_decile`
+ arguments. If you pass e.g. `lower_decile=0.3` and `upper_decile=0.7`, only values less than the 30th percentile
+ and greater than the 70th percentile will be considered for the factor. This is supported for backwards
+ compatibility, but not recommended.
In practice, you should center and standardize your factor scores unless you have a very good reason not to.
Parameters
----------
- mkt_cap_df: Polars DataFrame containing columns: | date | symbol | market_cap |
+ mkt_cap_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `mkt_cap_col` |
+ lower_decile: float value
center: boolean indicating whether to center the final size scores before returning
standardize: boolean indicating whether to standardize the final size scores after centering
mkt_cap_col: str name of the column we expect to find the market cap values in, defaults to "market_cap"
@@ -131,6 +144,17 @@ def factor_sze(
"""
try:
df = mkt_cap_df.lazy().with_columns(pl.col(mkt_cap_col).log().alias(score_col))
+ if lower_decile is not None and upper_decile is not None:
+ df = df.with_columns(
+ percentiles_xsection(
+ score_col, date_col, lower_pct=lower_decile, upper_pct=upper_decile, fill_val=0.0
+ ).alias(score_col)
+ )
+ if (lower_decile is not None and upper_decile is None) or (lower_decile is None and upper_decile is not None):
+ logger.warning(
+ "`lower_decile` and `upper_decile` must both be float values to apply cross-sectional percentile limits, "
+ "but one is None. Skipping cross-sectional percentile limiting; please review arguments"
+ )
if center:
df = df.with_columns((center_xsection(score_col, date_col, standardize=standardize)).alias(score_col) * -1)
return df.select(date_col, symbol_col, score_col)
@@ -165,7 +189,7 @@ def factor_val(
Parameters
----------
- value_df: Polars DataFrame containing columns: | date | symbol | book_price | sales_price | cf_price
+ value_df: Polars DataFrame containing columns: | `date_col` | `symbol_col` | `bp_col` | `sp_col` | `cf_col`
winsor_factor: optional float indicating what percentile to symmetrically winsorize features at, if desired
center: boolean indicating whether to center the final value scores before returning
standardize: boolean indicating whether to standardize the final value scores after centering
@@ -204,7 +228,7 @@ def factor_val(
)
)
if center:
- df = df.with_columns(center_xsection(score_col, date_col, standardize=True).alias(score_col))
+ df = df.with_columns(center_xsection(score_col, date_col, standardize=standardize).alias(score_col))
return df.select(
date_col,
symbol_col,
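A hedged usage sketch for the restored decile arguments in factor_sze: the market cap frame below is invented for illustration, and the column names simply follow the function's documented defaults. Because the function builds its result from mkt_cap_df.lazy(), the sketch collects the returned frame; drop the .collect() if your version returns an eager DataFrame.

    import datetime
    import polars as pl
    from toraniko.styles import factor_sze

    # Toy single-date cross-section; column names follow factor_sze's defaults.
    mkt_cap_df = pl.DataFrame(
        {
            "date": [datetime.date(2024, 8, 9)] * 5,
            "symbol": ["AAA", "BBB", "CCC", "DDD", "EEE"],
            "market_cap": [1e9, 5e9, 2e10, 1e11, 5e11],
        }
    )

    # Fama-French-like "hi - lo" usage restored by this commit: only log market caps below the
    # 30th or above the 70th cross-sectional percentile are kept; the middle of the distribution
    # is filled with 0.0 before the usual centering and standardization.
    scores = factor_sze(mkt_cap_df, lower_decile=0.3, upper_decile=0.7)
    print(scores.collect())

    # Passing only one of the two bounds skips the percentile limiting and logs a warning instead.
    _ = factor_sze(mkt_cap_df, lower_decile=0.3)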
