Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project enhancements #1

Merged
merged 16 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: run the test suite on every pull request targeting main.
name: CI

on:
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          # pytest is declared in dev-requirements.txt, not requirements.txt;
          # without this line the "Run tests" step fails with "pytest: command not found".
          pip install -r dev-requirements.txt

      - name: Run tests
        run: |
          pytest tests/
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pytest~=7.4.4
55 changes: 22 additions & 33 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,46 @@
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[project]
[tool.poetry]
name = "toraniko"
version = "1.0.0"
description = "A multi-factor equity risk model for quantitative trading."
authors = [
{ name = "0xfdf", email = "[email protected]" },
]
maintainers = [
{ name = "0xfdf", email = "[email protected]" },
]
requires-python = ">=3.10"
readme = {file = "README.md", content-type = "text/markdown"}
license = {file = "LICENSE"}
authors = ["0xfdf <[email protected]>"]
maintainers = ["0xfdf <[email protected]>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/0xfdf/toraniko"
repository = "https://github.com/0xfdf/toraniko"
keywords = ["risk", "model", "portfolio", "optimization", "factor", "quant", "quantitative", "finance", "trading"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"License :: OSI Approved :: MIT License",
"Topic :: Scientific/Engineering"
]

dependencies = [
"numpy>=1.26",
"polars>=0.20.3"
]
[tool.poetry.dependencies]
python = ">=3.10,<4.0"
numpy = "~1.26.2"
polars = "~1.0.0"

[project.urls]
Homepage = "https://github.com/0xfdf/toraniko"
Repository = "https://github.com/0xfdf/toraniko"
Issues = "https://github.com/0xfdf/toraniko/issues"
Changelog = "https://github.com/0xfdf/toraniko/releases"
[tool.poetry.dev-dependencies]
pytest = "~7.4.4"

[tool.poetry.urls]
homepage = "https://github.com/0xfdf/toraniko"
repository = "https://github.com/0xfdf/toraniko"
issues = "https://github.com/0xfdf/toraniko/issues"
changelog = "https://github.com/0xfdf/toraniko/releases"

[tool.ruff]
line-length = 88
line-length = 120
fix = false
select = ["E", "F", "I", "N", "Q", "R", "S", "T", "U", "W", "Y"]

[tool.black]
line-length = 88

[tool.poetry]
name = "toraniko"
version = "1.0.0"
description = "A multi-factor equity risk model for quantitative trading."
authors = ["0xfdf <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.10"
numpy = "1.26.2"
polars = "^0.20.3"
line-length = 120
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numpy~=1.26.2
polars~=1.0
76 changes: 45 additions & 31 deletions toraniko/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
import polars as pl


def center_xsection(
target_col: str, over_col: str, standardize: bool = False
) -> pl.Expr:
def center_xsection(target_col: str, over_col: str, standardize: bool = False) -> pl.Expr:
"""Cross-sectionally center (and optionally standardize) a Polars DataFrame `target_col` partitioned by `over_col`.

    This returns a Polars expression, so it can be chained in a `select` or `with_columns` invocation
Expand All @@ -22,9 +20,7 @@ def center_xsection(
-------
Polars Expr
"""
expr = pl.col(target_col) - pl.col(target_col).drop_nulls().drop_nans().mean().over(
over_col
)
expr = pl.col(target_col) - pl.col(target_col).drop_nulls().drop_nans().mean().over(over_col)
if standardize:
return expr / pl.col(target_col).drop_nulls().drop_nans().std().over(over_col)
return expr
Expand All @@ -39,27 +35,34 @@ def norm_xsection(
"""Cross-sectionally normalize a Polars DataFrame `target_col` partitioned by `over_col`, with rescaling
to the interval [`lower`, `upper`].

This returns a Polars expression, so it be chained in a `select` or `with_columns` invocation
This returns a Polars expression, so it can be chained in a `select` or `with_columns` invocation
without needing to set a new intermediate DataFrame or materialize lazy evaluation.

NaN values are not propagated in the max and min calculation, but NaN values are preserved for normalization.

Parameters
----------
target_col: str name of the column to normalize
over_col: str name of the column to partition the normalization by
lower: lower bound of the recaling interval, defaults to 0 to construct a percent
lower: lower bound of the rescaling interval, defaults to 0 to construct a percent
upper: upper bound of the rescaling interval, defaults to 1 to construct a percent

Returns
-------
Polars Expr
"""
min_col = pl.col(target_col).drop_nans().min().over(over_col)
max_col = pl.col(target_col).drop_nans().max().over(over_col)

norm_col = (
pl.col(target_col) - pl.col(target_col).drop_nans().min().over(over_col)
) / (
pl.col(target_col).drop_nans().max().over(over_col)
- pl.col(target_col).drop_nans().min().over(over_col)
pl.when(pl.col(target_col).is_nan())
.then(pl.col(target_col)) # Preserve NaN values
.when(max_col != min_col) # Avoid division by zero by making sure min != max
.then((pl.col(target_col) - min_col) / (max_col - min_col) * (upper - lower) + lower)
.otherwise(lower)
)
return norm_col * (upper - lower) + lower

return norm_col


def winsorize(data: np.ndarray, percentile: float = 0.05, axis: int = 0) -> np.ndarray:
Expand All @@ -78,18 +81,17 @@ def winsorize(data: np.ndarray, percentile: float = 0.05, axis: int = 0) -> np.n
-------
numpy array
"""
if not 0 <= percentile <= 1:
raise ValueError("`percentile` must be between 0 and 1")
try:
if not 0 <= percentile <= 1:
raise ValueError("`percentile` must be between 0 and 1")
except AttributeError as e:
raise TypeError("`percentile` must be a numeric type, such as an int or float") from e

fin_data = np.where(np.isfinite(data), data, np.nan)

# compute lower and upper percentiles for each column
lower_bounds = np.nanpercentile(
fin_data, percentile * 100, axis=axis, keepdims=True
)
upper_bounds = np.nanpercentile(
fin_data, (1 - percentile) * 100, axis=axis, keepdims=True
)
lower_bounds = np.nanpercentile(fin_data, percentile * 100, axis=axis, keepdims=True)
upper_bounds = np.nanpercentile(fin_data, (1 - percentile) * 100, axis=axis, keepdims=True)

# clip data to within the bounds
return np.clip(data, lower_bounds, upper_bounds)
Expand Down Expand Up @@ -122,11 +124,17 @@ def winsorize_group(group: pl.DataFrame) -> pl.DataFrame:
group = group.with_columns(pl.Series(col, winsorized_data).alias(col))
return group

grouped = df.groupby(group_col).apply(winsorize_group)
match df:
case pl.DataFrame():
grouped = df.group_by(group_col).map_groups(winsorize_group)
case pl.LazyFrame():
grouped = df.group_by(group_col).map_groups(winsorize_group, schema=df.collect_schema())
case _:
raise TypeError("`df` must be a Polars DataFrame or LazyFrame")
return grouped


def xsection_percentiles(
def percentiles_xsection(
target_col: str,
over_col: str,
lower_pct: float,
Expand Down Expand Up @@ -154,14 +162,8 @@ def xsection_percentiles(
"""
return (
pl.when(
(
pl.col(target_col)
<= pl.col(target_col).drop_nans().quantile(lower_pct).over(over_col)
)
| (
pl.col(target_col)
>= pl.col(target_col).drop_nans().quantile(upper_pct).over(over_col)
)
(pl.col(target_col) <= pl.col(target_col).drop_nans().quantile(lower_pct).over(over_col))
| (pl.col(target_col) >= pl.col(target_col).drop_nans().quantile(upper_pct).over(over_col))
)
.then(pl.col(target_col))
.otherwise(fill_val)
Expand All @@ -180,5 +182,17 @@ def exp_weights(window: int, half_life: int) -> np.ndarray:
-------
numpy array
"""
try:
assert isinstance(window, int)
if not window > 0:
raise ValueError("`window` must be a strictly positive integer")
except (AttributeError, AssertionError) as e:
raise TypeError("`window` must be an integer type") from e
try:
assert isinstance(half_life, int)
if not half_life > 0:
raise ValueError("`half_life` must be a strictly positive integer")
except (AttributeError, AssertionError) as e:
raise TypeError("`half_life` must be an integer type") from e
decay = np.log(2) / half_life
return np.exp(-decay * np.arange(window))[::-1]
31 changes: 10 additions & 21 deletions toraniko/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import polars as pl
import polars.exceptions as pl_exc

from toraniko.math import winsorize

Expand Down Expand Up @@ -42,19 +43,15 @@ def _factor_returns(
# Change of variables to add the constraint
B_sector = beta_sector @ R_sector

V_sector, _, _, _ = np.linalg.lstsq(
B_sector.T @ W @ B_sector, B_sector.T @ W, rcond=None
)
V_sector, _, _, _ = np.linalg.lstsq(B_sector.T @ W @ B_sector, B_sector.T @ W, rcond=None)
# Change of variables to recover all sectors
g = V_sector @ returns
fac_ret_sector = R_sector @ g

sector_resid_returns = returns - (B_sector @ g)

# Estimate style factor returns without constraints
V_style, _, _, _ = np.linalg.lstsq(
style_scores.T @ W @ style_scores, style_scores.T @ W, rcond=None
)
V_style, _, _, _ = np.linalg.lstsq(style_scores.T @ W @ style_scores, style_scores.T @ W, rcond=None)
if residualize_styles:
fac_ret_style = V_style @ sector_resid_returns
else:
Expand Down Expand Up @@ -96,23 +93,15 @@ def estimate_factor_returns(
try:
sectors = sorted(sector_df.select(pl.exclude("date", "symbol")).columns)
except AttributeError as e:
raise TypeError(
"`sector_df` must be a Polars DataFrame, but it's missing required attributes"
) from e
except pl.ColumnNotFoundError as e:
raise ValueError(
"`sector_df` must have columns for 'date' and 'symbol' in addition to each sector"
) from e
raise TypeError("`sector_df` must be a Polars DataFrame, but it's missing required attributes") from e
except pl_exc.ColumnNotFoundError as e:
raise ValueError("`sector_df` must have columns for 'date' and 'symbol' in addition to each sector") from e
try:
styles = sorted(style_df.select(pl.exclude("date", "symbol")).columns)
except AttributeError as e:
raise TypeError(
"`style_df` must be a Polars DataFrame, but it's missing required attributes"
) from e
except pl.ColumnNotFoundError as e:
raise ValueError(
"`style_df` must have columns for 'date' and 'symbol' in addition to each style"
) from e
raise TypeError("`style_df` must be a Polars DataFrame, but it's missing required attributes") from e
except pl_exc.ColumnNotFoundError as e:
raise ValueError("`style_df` must have columns for 'date' and 'symbol' in addition to each style") from e
try:
returns_df = (
returns_df.join(mkt_cap_df, on=["date", "symbol"])
Expand Down Expand Up @@ -140,7 +129,7 @@ def estimate_factor_returns(
raise TypeError(
"`returns_df` and `mkt_cap_df` must be Polars DataFrames, but there are missing attributes"
) from e
except pl.ColumnNotFoundError as e:
except pl_exc.ColumnNotFoundError as e:
raise ValueError(
"`returns_df` must have columns 'date', 'symbol' and 'asset_returns'; "
"`mkt_cap_df` must have 'date', 'symbol' and 'market_cap' columns"
Expand Down
Loading
Loading