Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project enhancements #1

Merged
merged 16 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: run the test suite on every pull request targeting main.
name: CI

on:
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          # pytest is declared in dev-requirements.txt, not requirements.txt;
          # without this line the "Run tests" step fails with "pytest: command not found".
          pip install -r dev-requirements.txt

      - name: Run tests
        run: |
          pytest tests/
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pytest~=7.4.4
55 changes: 22 additions & 33 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,46 @@
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[project]
[tool.poetry]
name = "toraniko"
version = "1.0.0"
description = "A multi-factor equity risk model for quantitative trading."
authors = [
{ name = "0xfdf", email = "[email protected]" },
]
maintainers = [
{ name = "0xfdf", email = "[email protected]" },
]
requires-python = ">=3.10"
readme = {file = "README.md", content-type = "text/markdown"}
license = {file = "LICENSE"}
authors = ["0xfdf <[email protected]>"]
maintainers = ["0xfdf <[email protected]>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/0xfdf/toraniko"
repository = "https://github.com/0xfdf/toraniko"
keywords = ["risk", "model", "portfolio", "optimization", "factor", "quant", "quantitative", "finance", "trading"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"License :: OSI Approved :: MIT License",
"Topic :: Scientific/Engineering"
]

dependencies = [
"numpy>=1.26",
"polars>=0.20.3"
]
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
numpy = "~1.26.2"
polars = "~1.0.0"

[project.urls]
Homepage = "https://github.com/0xfdf/toraniko"
Repository = "https://github.com/0xfdf/toraniko"
Issues = "https://github.com/0xfdf/toraniko/issues"
Changelog = "https://github.com/0xfdf/toraniko/releases"
[tool.poetry.dev-dependencies]
pytest = "~7.4.4"

[tool.poetry.urls]
homepage = "https://github.com/0xfdf/toraniko"
repository = "https://github.com/0xfdf/toraniko"
issues = "https://github.com/0xfdf/toraniko/issues"
changelog = "https://github.com/0xfdf/toraniko/releases"

[tool.ruff]
line-length = 88
line-length = 120
fix = false
select = ["E", "F", "I", "N", "Q", "R", "S", "T", "U", "W", "Y"]

[tool.black]
line-length = 88

[tool.poetry]
name = "toraniko"
version = "1.0.0"
description = "A multi-factor equity risk model for quantitative trading."
authors = ["0xfdf <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.10"
numpy = "1.26.2"
polars = "^0.20.3"
line-length = 120
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numpy~=1.26.2
polars~=1.0
76 changes: 45 additions & 31 deletions toraniko/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
import polars as pl


def center_xsection(
target_col: str, over_col: str, standardize: bool = False
) -> pl.Expr:
def center_xsection(target_col: str, over_col: str, standardize: bool = False) -> pl.Expr:
"""Cross-sectionally center (and optionally standardize) a Polars DataFrame `target_col` partitioned by `over_col`.

    This returns a Polars expression, so it can be chained in a `select` or `with_columns` invocation
Expand All @@ -22,9 +20,7 @@ def center_xsection(
-------
Polars Expr
"""
expr = pl.col(target_col) - pl.col(target_col).drop_nulls().drop_nans().mean().over(
over_col
)
expr = pl.col(target_col) - pl.col(target_col).drop_nulls().drop_nans().mean().over(over_col)
if standardize:
return expr / pl.col(target_col).drop_nulls().drop_nans().std().over(over_col)
return expr
Expand All @@ -39,27 +35,34 @@ def norm_xsection(
"""Cross-sectionally normalize a Polars DataFrame `target_col` partitioned by `over_col`, with rescaling
to the interval [`lower`, `upper`].

This returns a Polars expression, so it be chained in a `select` or `with_columns` invocation
This returns a Polars expression, so it can be chained in a `select` or `with_columns` invocation
without needing to set a new intermediate DataFrame or materialize lazy evaluation.

NaN values are not propagated in the max and min calculation, but NaN values are preserved for normalization.

Parameters
----------
target_col: str name of the column to normalize
over_col: str name of the column to partition the normalization by
lower: lower bound of the recaling interval, defaults to 0 to construct a percent
lower: lower bound of the rescaling interval, defaults to 0 to construct a percent
upper: upper bound of the rescaling interval, defaults to 1 to construct a percent

Returns
-------
Polars Expr
"""
min_col = pl.col(target_col).drop_nans().min().over(over_col)
max_col = pl.col(target_col).drop_nans().max().over(over_col)

norm_col = (
pl.col(target_col) - pl.col(target_col).drop_nans().min().over(over_col)
) / (
pl.col(target_col).drop_nans().max().over(over_col)
- pl.col(target_col).drop_nans().min().over(over_col)
pl.when(pl.col(target_col).is_nan())
.then(pl.col(target_col)) # Preserve NaN values
.when(max_col != min_col) # Avoid division by zero by making sure min != max
.then((pl.col(target_col) - min_col) / (max_col - min_col) * (upper - lower) + lower)
.otherwise(lower)
)
return norm_col * (upper - lower) + lower

return norm_col


def winsorize(data: np.ndarray, percentile: float = 0.05, axis: int = 0) -> np.ndarray:
Expand All @@ -78,18 +81,17 @@ def winsorize(data: np.ndarray, percentile: float = 0.05, axis: int = 0) -> np.n
-------
numpy array
"""
if not 0 <= percentile <= 1:
raise ValueError("`percentile` must be between 0 and 1")
try:
if not 0 <= percentile <= 1:
raise ValueError("`percentile` must be between 0 and 1")
except AttributeError as e:
raise TypeError("`percentile` must be a numeric type, such as an int or float") from e

fin_data = np.where(np.isfinite(data), data, np.nan)

# compute lower and upper percentiles for each column
lower_bounds = np.nanpercentile(
fin_data, percentile * 100, axis=axis, keepdims=True
)
upper_bounds = np.nanpercentile(
fin_data, (1 - percentile) * 100, axis=axis, keepdims=True
)
lower_bounds = np.nanpercentile(fin_data, percentile * 100, axis=axis, keepdims=True)
upper_bounds = np.nanpercentile(fin_data, (1 - percentile) * 100, axis=axis, keepdims=True)

# clip data to within the bounds
return np.clip(data, lower_bounds, upper_bounds)
Expand Down Expand Up @@ -122,11 +124,17 @@ def winsorize_group(group: pl.DataFrame) -> pl.DataFrame:
group = group.with_columns(pl.Series(col, winsorized_data).alias(col))
return group

grouped = df.groupby(group_col).apply(winsorize_group)
match df:
case pl.DataFrame():
grouped = df.group_by(group_col).map_groups(winsorize_group)
case pl.LazyFrame():
grouped = df.group_by(group_col).map_groups(winsorize_group, schema=df.collect_schema())
case _:
raise TypeError("`df` must be a Polars DataFrame or LazyFrame")
return grouped


def xsection_percentiles(
def percentiles_xsection(
target_col: str,
over_col: str,
lower_pct: float,
Expand Down Expand Up @@ -154,14 +162,8 @@ def xsection_percentiles(
"""
return (
pl.when(
(
pl.col(target_col)
<= pl.col(target_col).drop_nans().quantile(lower_pct).over(over_col)
)
| (
pl.col(target_col)
>= pl.col(target_col).drop_nans().quantile(upper_pct).over(over_col)
)
(pl.col(target_col) <= pl.col(target_col).drop_nans().quantile(lower_pct).over(over_col))
| (pl.col(target_col) >= pl.col(target_col).drop_nans().quantile(upper_pct).over(over_col))
)
.then(pl.col(target_col))
.otherwise(fill_val)
Expand All @@ -180,5 +182,17 @@ def exp_weights(window: int, half_life: int) -> np.ndarray:
-------
numpy array
"""
try:
assert isinstance(window, int)
if not window > 0:
raise ValueError("`window` must be a strictly positive integer")
except (AttributeError, AssertionError) as e:
raise TypeError("`window` must be an integer type") from e
try:
assert isinstance(half_life, int)
if not half_life > 0:
raise ValueError("`half_life` must be a strictly positive integer")
except (AttributeError, AssertionError) as e:
raise TypeError("`half_life` must be an integer type") from e
decay = np.log(2) / half_life
return np.exp(-decay * np.arange(window))[::-1]
31 changes: 10 additions & 21 deletions toraniko/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import polars as pl
import polars.exceptions as pl_exc

from toraniko.math import winsorize

Expand Down Expand Up @@ -42,19 +43,15 @@ def _factor_returns(
# Change of variables to add the constraint
B_sector = beta_sector @ R_sector

V_sector, _, _, _ = np.linalg.lstsq(
B_sector.T @ W @ B_sector, B_sector.T @ W, rcond=None
)
V_sector, _, _, _ = np.linalg.lstsq(B_sector.T @ W @ B_sector, B_sector.T @ W, rcond=None)
# Change of variables to recover all sectors
g = V_sector @ returns
fac_ret_sector = R_sector @ g

sector_resid_returns = returns - (B_sector @ g)

# Estimate style factor returns without constraints
V_style, _, _, _ = np.linalg.lstsq(
style_scores.T @ W @ style_scores, style_scores.T @ W, rcond=None
)
V_style, _, _, _ = np.linalg.lstsq(style_scores.T @ W @ style_scores, style_scores.T @ W, rcond=None)
if residualize_styles:
fac_ret_style = V_style @ sector_resid_returns
else:
Expand Down Expand Up @@ -96,23 +93,15 @@ def estimate_factor_returns(
try:
sectors = sorted(sector_df.select(pl.exclude("date", "symbol")).columns)
except AttributeError as e:
raise TypeError(
"`sector_df` must be a Polars DataFrame, but it's missing required attributes"
) from e
except pl.ColumnNotFoundError as e:
raise ValueError(
"`sector_df` must have columns for 'date' and 'symbol' in addition to each sector"
) from e
raise TypeError("`sector_df` must be a Polars DataFrame, but it's missing required attributes") from e
except pl_exc.ColumnNotFoundError as e:
raise ValueError("`sector_df` must have columns for 'date' and 'symbol' in addition to each sector") from e
try:
styles = sorted(style_df.select(pl.exclude("date", "symbol")).columns)
except AttributeError as e:
raise TypeError(
"`style_df` must be a Polars DataFrame, but it's missing required attributes"
) from e
except pl.ColumnNotFoundError as e:
raise ValueError(
"`style_df` must have columns for 'date' and 'symbol' in addition to each style"
) from e
raise TypeError("`style_df` must be a Polars DataFrame, but it's missing required attributes") from e
except pl_exc.ColumnNotFoundError as e:
raise ValueError("`style_df` must have columns for 'date' and 'symbol' in addition to each style") from e
try:
returns_df = (
returns_df.join(mkt_cap_df, on=["date", "symbol"])
Expand Down Expand Up @@ -140,7 +129,7 @@ def estimate_factor_returns(
raise TypeError(
"`returns_df` and `mkt_cap_df` must be Polars DataFrames, but there are missing attributes"
) from e
except pl.ColumnNotFoundError as e:
except pl_exc.ColumnNotFoundError as e:
raise ValueError(
"`returns_df` must have columns 'date', 'symbol' and 'asset_returns'; "
"`mkt_cap_df` must have 'date', 'symbol' and 'market_cap' columns"
Expand Down
Loading
Loading