# import all necessary libraries
+import os
+import warnings
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import yfinance as yf
+import plotly.graph_objects as go
+import statsmodels.api as sm
+import statsmodels.formula.api as smf
+
+from matplotlib import pyplot as plt
+from scipy.stats import ttest_1samp
+from scipy.signal import argrelextrema
+
# global plot/export settings and strategy parameters
dpi = 600                      # resolution for saved figures
transparent = False            # transparent background for saved figures
plot_folder = "stop_loss_10"   # subfolder under ../../plot for this run
stop_loss = -0.1               # monthly stop-loss threshold (-10%)
+
D-A-R Stock Selection Trading Strategy¶
Strategy¶
Strategy Overview¶
Parameter | +Details | +
---|---|
Market | +US Stock Market | +
Type | +Long/Short (Market Neutral) | +
Rebalance Frequency | +Monthly | +
Position Sizing | +Equal Weighted | +
Leverage | +1x | +
Commission | +$0 | +
Start Date | +1980-01 | +
End Date | +2022-12 | +
Predictor | +Dispersions (DISP), Net Operating Assets (NOA), Max Return in 5 Days per Month (RMAX5_21D) | +
Alpha | +2.65% | +
Win Rate | +61.55% | +
VaR | +15.748% | +
Strategy Rules¶
At month $i$, we rank all stocks in the universe using a hierarchical sorting (multi-level sorting) method based on three predictors: DISP, NOA, and RMAX5_21D.
+-
+
- DISP: xxx. We initially sort all stocks according to their DISP values. +
- NOA: xxx. For stocks that fall within the top 20% or bottom 20% based on the DISP value, we further sort them according to their NOA values. +
- RMAX5_21D: Max. For stocks that fall within the top 20% or bottom 20% based on the previous sort, we further sort them according to their RMAX5_21D values. +
Then we long the top 20% stocks and short the bottom 20% stocks, and hold the positions for one month.
+Stop Loss: If the month $i$ return is less than -10%, then stop trading for $i+1$, $i+2$ months.
+++Notes: Top means large value, bottom means small value.
+
Backtest¶
Data¶
The dataset spans from January, 1980, to December, 2022, encompassing 43 years of monthly data. The market data includes market excess return, SMB, HML, and UMD factors. The data was provided by Haojun. SPX data was sourced from Yahoo Finance for regime classification.
+Performance¶
+ +Please click here to find the interactive return visualization.
+ +-
+
Positive Performance Edge?
+-
+
- t-test:
-
+
- Hypothesis: $H_0$: strategy return = 0. +
- Test Statistic: $\displaystyle t = \frac{{\bar{\text{portfolio return in each month}} - 0}}{\text{standard deviation of portfolio return in each month}/\sqrt{n}} \approx 8.66$ +
- P-value: 0 +
- Decision: Since p-value < 0.05, we reject the null hypothesis, indicating a significant performance edge. +
+
+- t-test:
How Big is the Edge?
+-
+
- CAPM: $\alpha$: 2.65% +
- FF3: $\alpha$: 2.59% +
- FF4: $\alpha$: 2.6% +
+How Consistent is the Edge?
+-
+
- Win Rate: 61.55% +
+What is the potential loss for this edge?
+-
+
- VaR (Value at Risk): 15.73% +
+How long can the edge last?
+-
+
- Compare performance from -12 month to 12 month. 0 is where we stand. +
+What is the benefit of this strategy compared with other two-predictor or one-predictor strategies?
+-
+
- Compare the performance of the current strategy with those of strategies based on individual predictors or a combination of two predictors. +
+Performance in different market conditions +Market downturn +Market upturn
+
+What is the relationship between each factor and predicted return?
+-
+
- Fama-MacBeth Regression +
+
def TrendDetecter(data, win = 10, plot_trend = False):
    """Split a price series into alternating downturn and upturn periods.

    Local extrema are found with ``argrelextrema`` and then thinned so that
    maxima and minima strictly alternate; a downturn runs from a kept local
    max to the following kept local min, and upturns fill the gaps between
    downturns.

    Parameters
    ----------
    data : pd.Series
        Price series with a sorted DatetimeIndex (e.g. monthly SPX closes).
    win : int
        Number of points on each side that a point must dominate to count
        as a local extremum (``order`` of ``argrelextrema``).
    plot_trend : bool
        If True, plot downturns in red and upturns in blue and save the
        figure using the module-level ``plot_folder``/``dpi``/``transparent``.

    Returns
    -------
    (downturn, upturn) : tuple[pd.DataFrame, pd.DataFrame]
        Each has a single "period" column whose cells are
        [start_date, end_date] lists.
    """
    # ignore FutureWarning noise (pandas/matplotlib deprecations)
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # identify all local max and min locations according to win
    local_max_index = argrelextrema(data.values, np.greater, order=win)
    local_min_index = argrelextrema(data.values, np.less, order=win)

    # thin consecutive extrema so max and min points strictly alternate
    i = 0
    j = 0
    max_flag = False          # True when the last kept extremum was a max
    max_after_min = []        # kept local-max positions
    min_after_max = []        # kept local-min positions

    while i < len(local_max_index[0]) and j < len(local_min_index[0]):
        # seed with whichever extremum comes first
        if i == 0 and j == 0:
            if local_max_index[0][i] < local_min_index[0][j]:
                max_after_min.append(local_max_index[0][i])
                i += 1
                max_flag = True
            else:
                min_after_max.append(local_min_index[0][j])
                j += 1
                max_flag = False
        else:
            # after a max we expect a min; if another max comes first,
            # keep only the larger of the two consecutive maxima
            if max_flag:
                if local_max_index[0][i] < local_min_index[0][j]:
                    # .iloc: positional access (Series[int] fallback is deprecated)
                    if data.iloc[local_max_index[0][i]] > data.iloc[max_after_min[-1]]:
                        max_after_min.pop()
                        max_after_min.append(local_max_index[0][i])
                    i += 1
                    max_flag = True
                else:
                    min_after_max.append(local_min_index[0][j])
                    j += 1
                    max_flag = False
            # after a min we expect a max; if another min comes first,
            # keep only the smaller of the two consecutive minima
            else:
                if local_min_index[0][j] < local_max_index[0][i]:
                    if data.iloc[local_min_index[0][j]] < data.iloc[min_after_max[-1]]:
                        min_after_max.pop()
                        min_after_max.append(local_min_index[0][j])
                    j += 1
                    max_flag = False
                else:
                    max_after_min.append(local_max_index[0][i])
                    i += 1
                    max_flag = True

    # downturn
    downturn = pd.DataFrame(columns = ["period"])

    # a downturn must start at a max: if the left-most kept extremum is a
    # min, discard that first min.  Bug fix: the original left
    # min_after_max_downturn undefined when the first extremum was a max,
    # raising NameError below.
    if min_after_max[0] < max_after_min[0]:
        min_after_max_downturn = min_after_max[1:]
    else:
        min_after_max_downturn = min_after_max

    # downturn period: each kept max to the following kept min (or the end
    # of the sample when no min follows)
    for i in range(len(max_after_min)):
        if i < len(min_after_max_downturn):
            downturn.loc[i] = [[data.index[max_after_min[i]], data.index[min_after_max_downturn[i]]]]
        else:
            downturn.loc[i] = [[data.index[max_after_min[i]], data.index[-1]]]
    for i in range(len(min_after_max_downturn)):
        if i < len(max_after_min):
            downturn.loc[i + len(max_after_min)] = [[data.index[min_after_max_downturn[i]], data.index[max_after_min[i]]]]
        else:
            downturn.loc[i + len(max_after_min)] = [[data.index[min_after_max_downturn[i]], data.index[-1]]]

    # discard the pairs whose start-date value is below the end-date value
    # (those are rising segments, not downturns)
    for i in range(len(downturn)):
        if data[downturn.loc[i][0][0]] < data[downturn.loc[i][0][1]]:
            downturn = downturn.drop(i)

    downturn = downturn.sort_values(by = "period")
    downturn = downturn.reset_index(drop = True)
    downturn.index.name = "index"
    downturn.index = downturn.index.astype(int)

    # upturn
    upturn = pd.DataFrame(columns = ["period"])

    # first period: from the sample start to the first downturn (unless the
    # sample starts inside a downturn)
    if downturn.loc[0][0][0] == data.index[0]:
        first_entry = pd.DataFrame({"period": [[downturn.loc[0][0][1], downturn.loc[1][0][0]]]})
    else:
        first_entry = pd.DataFrame({"period": [[data.index[0], downturn.loc[0][0][0]]]})

    # last period: from the last downturn end to the sample end (unless the
    # sample ends inside a downturn)
    if downturn.iloc[-1, 0][1] == data.index[-1]:
        last_entry = pd.DataFrame({"period": [[downturn.iloc[-2, 0][1], downturn.iloc[-1, 0][0]]]})
    else:
        last_entry = pd.DataFrame({"period": [[downturn.iloc[-1, 0][1], data.index[-1]]]})

    # periods between consecutive downturns
    for i in range(len(downturn) - 1):
        upturn.loc[i] = [[downturn.loc[i][0][1], downturn.loc[i+1][0][0]]]

    # add the first and last entry
    upturn = pd.concat([first_entry, upturn, last_entry], ignore_index = True)

    upturn = upturn.sort_values(by = "period")
    upturn = upturn.reset_index(drop = True)
    upturn.index.name = "index"
    upturn.index = upturn.index.astype(int)

    if plot_trend:
        start_date = data.index[0].strftime("%Y-%m-%d")
        end_date = data.index[-1].strftime("%Y-%m-%d")

        plt.figure(figsize = (15, 10))
        # downturn red, upturn blue
        for i in downturn.index:
            if i == 0:
                plt.plot(data[:downturn.loc[i][0][1]], color = 'red', alpha = 0.7, label = "Downturn")
            else:
                plt.plot(data[downturn.loc[i][0][0]:downturn.loc[i][0][1]], color = 'red', alpha = 0.7)
        for i in upturn.index:
            if i == 0:
                plt.plot(data[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff', label="Upturn")
            else:
                plt.plot(data[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff')

        plt.xlabel("Date", color = '#595959')
        plt.ylabel(f"SPX 500", color = '#595959')
        plt.title(f"SPX 500 (M) {start_date} to {end_date}", color = '#595959')
        plt.legend(loc='upper left', frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 2)

        # axis color, ticks color, label color, grid
        plt.gca().spines['top'].set_visible(False)
        plt.gca().spines['right'].set_visible(False)
        plt.gca().spines['left'].set_color('#d9d9d9')
        plt.gca().spines['bottom'].set_color('#d9d9d9')
        plt.tick_params(axis='x', colors='#d9d9d9')
        plt.tick_params(axis='y', colors='#d9d9d9')
        plt.grid(axis='y', linestyle='--', color='#d9d9d9')

        plt.xticks(color='#595959')
        plt.yticks(color='#595959')

        # save the plot
        plt.savefig(f"../../plot/{plot_folder}/market_trend.png", dpi = dpi, transparent = transparent)
        plt.show()

    return downturn, upturn
+
# ff3 market data (Fama-French factors, Stata format), indexed by factor date
data_ff3 = pd.read_stata("../../data/ff_factors.dta")
data_ff3 = data_ff3.set_index("dateff")

# spx500 data: use the cached CSV when available, otherwise download from
# Yahoo Finance and cache it for later runs
if os.path.exists("../../data/spx19802022.csv"):
    spxdata = pd.read_csv('../../data/spx19802022.csv', index_col=0, parse_dates=True)
else:
    spxdata = yf.download('^GSPC', start='1980-01-01', end='2023-01-01')
    spxdata.to_csv('../../data/spx19802022.csv')

# spx monthly data: month-end adjusted close
# NOTE(review): the "M" resample alias is deprecated on newer pandas ("ME") -- confirm target version
spx = spxdata['Adj Close'].resample('M').last()

# trend detection: classify the monthly SPX series into downturn/upturn regimes
downturn, upturn = TrendDetecter(spx, win = 10, plot_trend = True)
+
# data: monthly stock-level panel with predictors and lead returns
raw_data = pd.read_csv("../../data/data19802022.csv", header=0)

# drop rows where ALL of the listed columns are missing (how="all")
rows_before = raw_data.shape[0]
raw_data = raw_data.dropna(subset=["date", "gvkey", "permno",
                                   'ret_12_7',
                                   'ret_3_1', 'ret_6_1', 'ret_9_1', 'ret_12_1', 'ret_18_1',
                                   'ret_1_0', 'ret_2_0', 'ret_3_0', 'ret_6_0', 'ret_9_0', 'ret_12_0',
                                   'ret_lead1m', 'ret_lead2m', 'ret_lead3m', 'ret_lead4m', 'ret_lead5m', 'ret_lead6m', 'ret_lead7m', 'ret_lead8m', 'ret_lead9m', 'ret_lead10m', 'ret_lead11m', 'ret_lead12m',
                                   'cum_rawret_1_6m', 'cum_rawret_1_3m', 'cum_rawret_1_12m',
                                   'feps', 'fep', 'disp', 'nsi', 'accrual', 'noa', 'ag', 'ia', 'pead', 'rmax5_21d', 'prc_highprc_252d', 'rmax1_21d'
                                   ], how="all")
print("ROWS DROPPED NA:", rows_before - raw_data.shape[0])

# date to datetime (dates are stored as YYYYMMDD integers)
raw_data["date"] = pd.to_datetime(raw_data["date"], format="%Y%m%d")

# reorganize columns: identifiers first, then predictors, then lead returns
data = raw_data[["date", "gvkey", "permno", "ff49", 'mv', 'be_me', "prc",
                 'ret_12_7',
                 'ret_3_1', 'ret_6_1', 'ret_9_1', 'ret_12_1', 'ret_18_1',
                 'ret_1_0', 'ret_2_0', 'ret_3_0', 'ret_6_0', 'ret_9_0', 'ret_12_0',
                 'feps', 'fep', 'disp', 'nsi', 'accrual', 'noa', 'ag', 'ia', 'pead', 'rmax5_21d', 'prc_highprc_252d', 'rmax1_21d',
                 'cum_rawret_1_6m', 'cum_rawret_1_3m', 'cum_rawret_1_12m',
                 'ret_lead1m', 'ret_lead2m', 'ret_lead3m', 'ret_lead4m', 'ret_lead5m', 'ret_lead6m', 'ret_lead7m', 'ret_lead8m', 'ret_lead9m', 'ret_lead10m', 'ret_lead11m', 'ret_lead12m'
                 ]]

# sort by date, then firm identifiers, then market value
data = data.sort_values(by=["date", "gvkey", "permno", "ff49", "mv"])
data = data.reset_index(drop=True)

# describe data: fields, dimensions, duplicates, missing values, date range
print("FIELDS:", data.columns.values)
print("DIMENSION:", data.shape)
print("DUPLICATED ROWS:", data.duplicated().sum())
print("# OF MISSING VALUES:", data.isnull().sum().sum(), ", PERCENTAGE OF MISSING VALUES:", round(data.isnull().sum(
).sum() / (data.shape[0] * data.shape[1]), 3), ", # OF COLUMNS HAVE MISSING VALUES:", data.isnull().any().sum())
print("START DATE:", data["date"].min(), ", END DATE:", data["date"].max())
+
# distribution of predictors: 5x5 grid of histograms for columns 7..31 (the
# predictor columns), clipped to the [0.1%, 99.9%] quantile range so extreme
# outliers don't flatten the histograms
fig, axs = plt.subplots(5, 5, figsize=(20, 20))
for i in range(7, 32):
    axs[(i - 7) // 5, (i - 7) % 5].hist(data.iloc[:, i], bins = 100, range = (data.iloc[:, i].quantile(0.001), data.iloc[:, i].quantile(0.999)))
    axs[(i - 7) // 5, (i - 7) % 5].tick_params(axis='x', rotation=45)
    axs[(i - 7) // 5, (i - 7) % 5].tick_params(axis='y', rotation=45)
    axs[(i - 7) // 5, (i - 7) % 5].set_title(data.columns.values[i])
    plt.subplots_adjust(hspace = 0.3, wspace = 0.3)
plt.suptitle("Distribution of Each Predictors", fontsize = 20)
plt.subplots_adjust(top = 0.94)

# save plot
plt.savefig(f"../../plot/{plot_folder}/predictors_distribution.png", dpi = dpi, transparent = transparent)
plt.show()

# boxplot of predictors (same 5x5 layout as the histograms)
fig, axs = plt.subplots(5, 5, figsize=(15, 15))
for i in range(7, 32):
    sns.boxplot(data.iloc[:, i], ax = axs[(i - 7) // 5, (i - 7) % 5])
    axs[(i - 7) // 5, (i - 7) % 5].set_title(data.columns.values[i])
    plt.subplots_adjust(hspace = 0.3, wspace = 0.3)
plt.suptitle("Boxplot of Each Predictors", fontsize = 20)
plt.subplots_adjust(top = 0.94)

# save plot
plt.savefig(f"../../plot/{plot_folder}/predictors_boxplot.png", dpi = dpi, transparent = transparent)
plt.show()

# disp, noa, rmax5_21d correlation matrix, heatmap
corr = data[["disp", "noa", "rmax5_21d"]].corr()
plt.figure(figsize = (10, 5))
sns.heatmap(round(corr, 3), annot = True, cmap = "coolwarm", cbar = True, vmin = -1, vmax = 1, center = 0)
plt.title("Correlation Matrix of disp, noa, rmax5_21d")

# save plot
plt.savefig(f"../../plot/{plot_folder}/disp_noa_rmax5_21d_corr.png", dpi = dpi, transparent = transparent)
plt.show()
# display() is the IPython/notebook rich-output helper
display(round(corr, 2))

# all predictors correlation matrix, heatmap
predictors_corr = data.iloc[:, 7:31].corr()
data_corr_heatmap = sns.heatmap(predictors_corr, annot = False, cmap = "coolwarm", cbar = True, vmin=-1, vmax=1, center= 0)

# save plot
plt.savefig(f"../../plot/{plot_folder}/predictors_corr_heatmap.png", dpi = dpi, transparent = transparent)
plt.show()
display(round(predictors_corr, 2))

# save clean data (only once; skip if the file already exists)
if not os.path.exists("../../data/data19802022_clean.csv"):
    data.to_csv("../../data/data19802022_clean.csv", index=False)
+
ROWS DROPPED NA: 0 +FIELDS: ['date' 'gvkey' 'permno' 'ff49' 'mv' 'be_me' 'prc' 'ret_12_7' 'ret_3_1' + 'ret_6_1' 'ret_9_1' 'ret_12_1' 'ret_18_1' 'ret_1_0' 'ret_2_0' 'ret_3_0' + 'ret_6_0' 'ret_9_0' 'ret_12_0' 'feps' 'fep' 'disp' 'nsi' 'accrual' 'noa' + 'ag' 'ia' 'pead' 'rmax5_21d' 'prc_highprc_252d' 'rmax1_21d' + 'cum_rawret_1_6m' 'cum_rawret_1_3m' 'cum_rawret_1_12m' 'ret_lead1m' + 'ret_lead2m' 'ret_lead3m' 'ret_lead4m' 'ret_lead5m' 'ret_lead6m' + 'ret_lead7m' 'ret_lead8m' 'ret_lead9m' 'ret_lead10m' 'ret_lead11m' + 'ret_lead12m'] +DIMENSION: (2960897, 46) +DUPLICATED ROWS: 0 +# OF MISSING VALUES: 17954352 , PERCENTAGE OF MISSING VALUES: 0.132 , # OF COLUMNS HAVE MISSING VALUES: 41 +START DATE: 1980-01-31 00:00:00 , END DATE: 2022-12-30 00:00:00 ++
+ | disp | +noa | +rmax5_21d | +
---|---|---|---|
disp | +1.00 | +-0.0 | +0.05 | +
noa | +-0.00 | +1.0 | +-0.00 | +
rmax5_21d | +0.05 | +-0.0 | +1.00 | +
+ | ret_12_7 | +ret_3_1 | +ret_6_1 | +ret_9_1 | +ret_12_1 | +ret_18_1 | +ret_1_0 | +ret_2_0 | +ret_3_0 | +ret_6_0 | +... | +disp | +nsi | +accrual | +noa | +ag | +ia | +pead | +rmax5_21d | +prc_highprc_252d | +rmax1_21d | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ret_12_7 | +1.00 | +0.01 | +0.01 | +0.27 | +0.58 | +0.40 | +0.00 | +0.01 | +0.01 | +0.01 | +... | +-0.02 | +-0.01 | +0.0 | +-0.00 | +-0.00 | +0.00 | +0.01 | +0.01 | +0.09 | +0.00 | +
ret_3_1 | +0.01 | +1.00 | +0.60 | +0.45 | +0.34 | +0.22 | +0.01 | +0.48 | +0.79 | +0.54 | +... | +-0.00 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.00 | +0.29 | +-0.00 | +
ret_6_1 | +0.01 | +0.60 | +1.00 | +0.75 | +0.57 | +0.38 | +0.01 | +0.28 | +0.47 | +0.88 | +... | +-0.01 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.00 | +0.33 | +0.00 | +
ret_9_1 | +0.27 | +0.45 | +0.75 | +1.00 | +0.78 | +0.55 | +0.01 | +0.22 | +0.35 | +0.66 | +... | +-0.01 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.00 | +0.31 | +0.00 | +
ret_12_1 | +0.58 | +0.34 | +0.57 | +0.78 | +1.00 | +0.71 | +0.01 | +0.16 | +0.27 | +0.50 | +... | +-0.02 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.01 | +0.24 | +0.00 | +
ret_18_1 | +0.40 | +0.22 | +0.38 | +0.55 | +0.71 | +1.00 | +-0.01 | +0.09 | +0.17 | +0.32 | +... | +-0.02 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +0.00 | +0.01 | +0.01 | +0.13 | +0.00 | +
ret_1_0 | +0.00 | +0.01 | +0.01 | +0.01 | +0.01 | +-0.01 | +1.00 | +0.71 | +0.56 | +0.37 | +... | +0.00 | +-0.00 | +-0.0 | +0.00 | +-0.00 | +-0.00 | +0.00 | +0.42 | +0.28 | +0.37 | +
ret_2_0 | +0.01 | +0.48 | +0.28 | +0.22 | +0.16 | +0.09 | +0.71 | +1.00 | +0.81 | +0.54 | +... | +-0.00 | +-0.00 | +-0.0 | +0.00 | +-0.00 | +-0.00 | +0.01 | +0.30 | +0.35 | +0.27 | +
ret_3_0 | +0.01 | +0.79 | +0.47 | +0.35 | +0.27 | +0.17 | +0.56 | +0.81 | +1.00 | +0.68 | +... | +-0.00 | +-0.00 | +0.0 | +0.00 | +-0.00 | +-0.00 | +0.01 | +0.25 | +0.38 | +0.22 | +
ret_6_0 | +0.01 | +0.54 | +0.88 | +0.66 | +0.50 | +0.32 | +0.37 | +0.54 | +0.68 | +1.00 | +... | +-0.01 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.19 | +0.40 | +0.16 | +
ret_9_0 | +0.25 | +0.42 | +0.68 | +0.91 | +0.72 | +0.49 | +0.29 | +0.42 | +0.53 | +0.77 | +... | +-0.01 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.17 | +0.37 | +0.14 | +
ret_12_0 | +0.52 | +0.31 | +0.51 | +0.69 | +0.92 | +0.64 | +0.21 | +0.31 | +0.39 | +0.58 | +... | +-0.02 | +-0.00 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.01 | +0.14 | +0.28 | +0.11 | +
feps | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +... | +-0.00 | +0.04 | +0.0 | +-0.00 | +-0.00 | +-0.00 | +0.00 | +-0.01 | +0.01 | +-0.01 | +
fep | +0.01 | +-0.02 | +-0.02 | +-0.01 | +-0.00 | +0.02 | +-0.02 | +-0.02 | +-0.02 | +-0.02 | +... | +-0.04 | +-0.01 | +0.0 | +0.00 | +0.00 | +-0.00 | +0.03 | +-0.07 | +0.06 | +-0.06 | +
disp | +-0.02 | +-0.00 | +-0.01 | +-0.01 | +-0.02 | +-0.02 | +0.00 | +-0.00 | +-0.00 | +-0.01 | +... | +1.00 | +0.00 | +-0.0 | +-0.00 | +-0.00 | +-0.00 | +-0.01 | +0.05 | +-0.06 | +0.04 | +
nsi | +-0.01 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +... | +0.00 | +1.00 | +-0.0 | +0.00 | +0.00 | +-0.00 | +0.00 | +-0.00 | +-0.01 | +-0.00 | +
accrual | +0.00 | +0.00 | +0.00 | +0.00 | +0.00 | +0.00 | +-0.00 | +-0.00 | +0.00 | +0.00 | +... | +-0.00 | +-0.00 | +1.0 | +0.00 | +0.00 | +-0.00 | +0.00 | +-0.00 | +0.00 | +-0.00 | +
noa | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +0.00 | +0.00 | +0.00 | +-0.00 | +... | +-0.00 | +0.00 | +0.0 | +1.00 | +0.09 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +
ag | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +... | +-0.00 | +0.00 | +0.0 | +0.09 | +1.00 | +0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +
ia | +0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +0.00 | +-0.00 | +-0.00 | +-0.00 | +-0.00 | +... | +-0.00 | +-0.00 | +-0.0 | +-0.00 | +0.00 | +1.00 | +0.00 | +0.00 | +-0.01 | +0.00 | +
pead | +0.01 | +0.01 | +0.01 | +0.01 | +0.01 | +0.01 | +0.00 | +0.01 | +0.01 | +0.01 | +... | +-0.01 | +0.00 | +0.0 | +-0.00 | +-0.00 | +0.00 | +1.00 | +-0.01 | +0.02 | +-0.01 | +
rmax5_21d | +0.01 | +0.00 | +0.00 | +0.00 | +0.01 | +0.01 | +0.42 | +0.30 | +0.25 | +0.19 | +... | +0.05 | +-0.00 | +-0.0 | +-0.00 | +-0.00 | +0.00 | +-0.01 | +1.00 | +-0.03 | +1.00 | +
prc_highprc_252d | +0.09 | +0.29 | +0.33 | +0.31 | +0.24 | +0.13 | +0.28 | +0.35 | +0.38 | +0.40 | +... | +-0.06 | +-0.01 | +0.0 | +-0.00 | +-0.00 | +-0.01 | +0.02 | +-0.03 | +1.00 | +-0.01 | +
rmax1_21d | +0.00 | +-0.00 | +0.00 | +0.00 | +0.00 | +0.00 | +0.37 | +0.27 | +0.22 | +0.16 | +... | +0.04 | +-0.00 | +-0.0 | +-0.00 | +-0.00 | +0.00 | +-0.01 | +1.00 | +-0.01 | +1.00 | +
24 rows × 24 columns
# data preprocessing: sample window
start_date = "1980-01-01"
end_date = "2022-12-31"

# add the market regime to each observation: downturn = 0, upturn = 1
# (periods come from TrendDetecter on the monthly SPX series)
for period in downturn["period"]:
    data.loc[(data["date"] >= period[0]) & (data["date"] <= period[1]), "regime"] = 0
for period in upturn["period"]:
    data.loc[(data["date"] >= period[0]) & (data["date"] <= period[1]), "regime"] = 1

# keep only the observations inside [start_date, end_date]
data = data[(data["date"] >= start_date) & (data["date"] <= end_date)]
spx = spx[(spx.index >= start_date) & (spx.index <= end_date)]
+
Stock Selection¶
def MultiPredictorSort(data, predictors, bin, mv_factor = True, ret_list = ["ret_lead1m"], regime = None):
    """Hierarchically sort stocks into quantile bins and save summary tables.

    Each month, stocks are cut into `bin` quantiles on the first predictor,
    then within each bin on the second predictor, and so on.  Label
    "{predictor}1" marks the smallest values, "{predictor}{bin}" the largest.

    Parameters
    ----------
    data : pd.DataFrame
        Panel with "date", "mv", the predictor columns, the return columns
        in `ret_list`, and (when `regime` is used) a "regime" column.
    predictors : list[str]
        Predictor column names, most important first.
    bin : int
        Number of quantile bins per sorting level.
    mv_factor : bool
        If True, prepend market value ("mv") as the top-level sort key.
    ret_list : list[str]
        Return columns to average; the first one names the output files.
    regime : int | None
        None = full sample, 0 = downturn months only, 1 = upturn months only.

    Returns
    -------
    (freq_table, ret_table, ts_table)
        Bin frequencies, across-time average returns per bin, and the
        per-date average-return time series.  All three are also written to
        CSV under ../../data/{n}_predictor/{predictors}/.
    """
    data_model = data[["date", "mv"] + predictors + ret_list]
    if regime == 0:
        data_model = data_model[data["regime"] == 0]
    elif regime == 1:
        data_model = data_model[data["regime"] == 1]
    # dropna returns a fresh frame, so the column assignments below are safe
    data_model = data_model.dropna(subset = predictors + ret_list, how = "any")
    groupby_list = ["date"]
    pred_name = "_".join(predictors)
    folder_name = f"../../data/{len(predictors)}_predictor/{pred_name}"

    # add mv as the first (most important) predictor
    if mv_factor:
        predictors = ["mv"] + predictors

    # hierarchical sort: each level cuts within the groups formed by the
    # previous levels; {predictor}1 is the smallest bin, {predictor}{bin}
    # the largest
    for i, predictor in enumerate(predictors):
        new_col = f"pred{i + 1}_sort"
        try:
            data_model[new_col] = data_model.groupby(groupby_list)[predictor].transform(pd.qcut, bin, labels = [f"{predictor}{b}" for b in range(1, bin + 1)], duplicates = "drop")
        except ValueError:
            # qcut dropped duplicate bin edges, so the label list no longer
            # matches the bin count -- fall back to default interval labels
            data_model[new_col] = data_model.groupby(groupby_list)[predictor].transform(pd.qcut, bin, duplicates = "drop")
        groupby_list.append(new_col)

    # frequency table and across-time average-return table
    if len(predictors) == 1:
        freq_table = data_model["pred1_sort"].value_counts()
        freq_table = pd.DataFrame(freq_table)

        ret_table = data_model.groupby("pred1_sort")[ret_list].mean()
        ret_table = pd.DataFrame(ret_table)
    else:
        col_list = [data_model[f"pred1_sort"]]
        row_list = [data_model[f"pred{i + 1}_sort"] for i in range(1, len(predictors))]

        freq_table = pd.crosstab(row_list, col_list)
        # NOTE(review): `values` is a one-column DataFrame here; presumably
        # the first return column is intended -- confirm on newer pandas
        ret_table = pd.crosstab(row_list, col_list, values = data_model[ret_list], aggfunc = np.mean)

    # per-date average return for every bin combination (time-series table)
    ts_table = data_model.groupby(groupby_list)[ret_list].mean().reset_index()

    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # output file stem: optional "mv_" prefix plus the regime suffix.
    # Bug fix: the regime suffix used to be added only when mv_factor was
    # True (and then it also dropped the "mv_" prefix), so regime-filtered
    # runs with mv_factor=False were saved under the regime-less name while
    # MultiPredictorLSRet reads the suffixed one.
    if mv_factor:
        file_name = "mv_" + pred_name
    else:
        file_name = pred_name
    if regime is not None:
        file_name = f"{file_name}_{'downturn' if regime == 0 else 'upturn'}"

    # save files (naming goes from the most to the least important predictor)
    freq_table.to_csv(f"{folder_name}/{file_name}_{ret_list[0]}_freq.csv")
    ret_table.to_csv(f"{folder_name}/{file_name}_{ret_list[0]}.csv")
    ts_table.to_csv(f"{folder_name}/{file_name}_{ret_list[0]}_ts.csv")

    return freq_table, ret_table, ts_table
+
# sort on disp, noa, rmax5_21d (5 quantile bins per level); run the full
# sample plus downturn-only and upturn-only versions
predictors = ["disp", "noa", "rmax5_21d"]

_, disp_noa_rmax5_21d_ret_lead1m_table, disp_noa_rmax5_21d_ret_lead1m_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = None)
_, disp_noa_rmax5_21d_ret_lead1m_downturn_table, disp_noa_rmax5_21d_ret_lead1m_downturn_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 0)
_, disp_noa_rmax5_21d_ret_lead1m_upturn_table, disp_noa_rmax5_21d_ret_lead1m_upturn_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 1)
+
Portfolio Construction¶
def FindMaxMin(predictors, file_name_partial, regime = None, ret_list = ["ret_lead1m"]):
    """Locate the coordinates of the extreme average returns in a saved table.

    Reads the return table written by MultiPredictorSort (the file is keyed
    by the first horizon in `ret_list`) and returns the (row, column)
    coordinates of its largest and smallest entries.

    Returns
    -------
    (max_coordinate, min_coordinate) : tuple
        Each is the (row index, column label) pair from the stacked table.
    """
    table = pd.read_csv(f"{file_name_partial}_{ret_list[0]}.csv")

    # skip the leading label column(s): one for a single-predictor table,
    # len(predictors) - 1 for a multi-predictor table
    skip = 1 if len(predictors) == 1 else len(predictors) - 1
    values = table.iloc[:, skip:].stack()

    return values.idxmax(), values.idxmin()
+
def StopLossDate(port_ret_ts, ret_list, stop_loss):
    """Return the dates on which trading is suspended by the stop-loss rule.

    A month t whose return falls below `stop_loss` suspends trading in
    months t+1 and t+2.

    Parameters
    ----------
    port_ret_ts : pd.DataFrame
        Portfolio return time series indexed by date, with the return in
        column ret_list[0].
    ret_list : list[str]
        Return column names; only the first is used.
    stop_loss : float
        Threshold (e.g. -0.1 for -10%).

    Returns
    -------
    np.ndarray
        Sorted, de-duplicated dates on which trading is suspended.
    """
    date = np.sort(port_ret_ts.index.unique())

    # months t whose return breaches the stop loss, mapped to positions
    stop_loss_date = port_ret_ts[port_ret_ts[ret_list[0]] < stop_loss].index
    stop_loss_date_loc = np.searchsorted(date, stop_loss_date)

    # suspend t+1 and t+2 (np.unique also sorts)
    stop_loss_date_loc = np.concatenate([stop_loss_date_loc + 1, stop_loss_date_loc + 2])
    stop_loss_date_loc = np.unique(stop_loss_date_loc)

    # a breach in the final month(s) has no following month to suspend --
    # drop locations past the end of the sample.  Bug fix: the original
    # indexed out of bounds (IndexError) in that case.
    stop_loss_date_loc = stop_loss_date_loc[stop_loss_date_loc < len(date)]

    return date[stop_loss_date_loc]
+
def MultiPredictorLSRet(predictors, mv_factor=False, cond = None, ret_list=["ret_lead1m"], regime=None, type = "ts", stop_loss = None):
    """Compute the long-short portfolio return from saved sort tables.

    Reads the CSV tables written by MultiPredictorSort and either builds
    the monthly long-short return time series (type="ts") or the
    across-time average long/short/portfolio return (type="average").

    Parameters
    ----------
    predictors : list[str]
        Predictor names, most important first (used to locate the files).
    mv_factor : bool
        If True, "mv" is prepended as the top-level sort key (changes the
        file stem when regime is set).
    cond : list | None
        When given: for type="ts", a [max_cond_dict, min_cond_dict] pair of
        {pred_sort_column: bin_label} conditions; for type="average", a
        [max_coordinate, min_coordinate] pair.  When None, the extreme bins
        are located automatically via FindMaxMin.
    ret_list : list[str]
        Return columns; the first names the files and is the return used.
    regime : int | None
        None = full sample, 0 = downturn-only tables, 1 = upturn-only.
    type : str
        "ts" or "average".  NOTE(review): shadows the builtin `type`.
    stop_loss : float | None
        Monthly stop-loss threshold; months flagged by StopLossDate get a
        return of 0 in the output series.

    Returns
    -------
    type="ts": (port_ret_ts, stop_loss_date, long_ret_ts, short_ret_ts)
    type="average": a DataFrame with long_ret, short_ret and port_ret.
    """
    folder_name = f"../../data/{len(predictors)}_predictor/{'_'.join(predictors)}"
    file_name_partial = f"{folder_name}/{'_'.join(predictors)}"

    # add mv as a predictor (affects the regime file stem below)
    if mv_factor:
        predictors = ["mv"] + predictors

    if regime != None:
        file_name_partial = f"{folder_name}/{'_'.join(predictors)}_{'downturn' if regime == 0 else 'upturn'}"
    data_ts = pd.read_csv(f"{file_name_partial}_{ret_list[0]}_ts.csv", index_col=0)

    # max and min condition
    if cond == None:
        # NOTE(review): file name is hard-coded to ret_lead1m here,
        # regardless of ret_list -- confirm intended
        data_ret = pd.read_csv(f"{file_name_partial}_ret_lead1m.csv")

        # find the largest and smallest term and corresponding index
        max_coordinate, min_coordinate = FindMaxMin(predictors, file_name_partial, regime = regime, ret_list = ret_list)

        if type == "ts":
            pred_sort_list = [f"pred{i + 1}_sort" for i in range(len(predictors))]

            # translate the (row, column) coordinate into per-level bin
            # labels: the column label is the first-level bin, the leading
            # row columns hold the remaining levels
            if len(predictors) == 1:
                max_cond_dict = {pred_sort_list[0]: data_ret.iloc[max_coordinate[0], 0]}
                min_cond_dict = {pred_sort_list[0]: data_ret.iloc[min_coordinate[0], 0]}
            else:
                for i in range(len(predictors)):
                    if i == 0:
                        max_cond_dict = {pred_sort_list[i]: max_coordinate[1]}
                        min_cond_dict = {pred_sort_list[i]: min_coordinate[1]}
                    else:
                        max_cond_dict[pred_sort_list[i]] = data_ret.iloc[max_coordinate[0], i - 1]
                        min_cond_dict[pred_sort_list[i]] = data_ret.iloc[min_coordinate[0], i - 1]
    else:
        # caller supplied the long/short bins explicitly
        if type == "ts":
            max_cond_dict = cond[0]
            min_cond_dict = cond[1]
        elif type == "average":
            max_coordinate = cond[0]
            min_coordinate = cond[1]

    if type == "ts":
        # boolean masks selecting the long and short bins in every month
        long_cond = np.all([data_ts[key] == value for key, value in max_cond_dict.items()], axis=0)
        short_cond = np.all([data_ts[key] == value for key, value in min_cond_dict.items()], axis=0)

        # monthly long-short return
        long_ret_ts = data_ts[long_cond][ret_list[0]].reset_index(drop=True)
        short_ret_ts = data_ts[short_cond][ret_list[0]].reset_index(drop=True)
        port_ret_ts = long_ret_ts - short_ret_ts

        # re-attach the dates and convert each series to a one-column frame
        port_ret_ts.index = data_ts[long_cond]["date"]
        port_ret_ts = port_ret_ts.to_frame()
        port_ret_ts.index = pd.to_datetime(port_ret_ts.index, format="%Y-%m-%d")
        long_ret_ts.index = data_ts[long_cond]["date"]
        long_ret_ts = long_ret_ts.to_frame()
        long_ret_ts.index = pd.to_datetime(long_ret_ts.index, format="%Y-%m-%d")
        short_ret_ts.index = data_ts[short_cond]["date"]
        short_ret_ts = short_ret_ts.to_frame()
        short_ret_ts.index = pd.to_datetime(short_ret_ts.index, format="%Y-%m-%d")

        # stop loss: zero out the months suspended by the stop-loss rule
        if stop_loss != None:
            stop_loss_date = StopLossDate(port_ret_ts, ret_list, stop_loss)
            port_ret_ts.loc[port_ret_ts.index.isin(stop_loss_date), ret_list[0]] = 0
        else:
            stop_loss_date = None

        return port_ret_ts, stop_loss_date, long_ret_ts, short_ret_ts

    elif type == "average":
        port_ret = pd.DataFrame(columns = ["long_ret", "short_ret"], index = ret_list)
        data_ret_mean = pd.read_csv(f"{file_name_partial}_{ret_list[0]}.csv")

        # long short return: pick the extreme cells out of the mean table
        if len(predictors) == 1:
            data_ret_mean = data_ret_mean.iloc[:, 1:]
            port_ret.loc[ret_list[0], "long_ret"] = data_ret_mean.iloc[max_coordinate[0], 0]
            port_ret.loc[ret_list[0], "short_ret"] = data_ret_mean.iloc[min_coordinate[0], 0]
        else:
            data_ret_mean = data_ret_mean.iloc[:, len(predictors) - 1:]
            port_ret.loc[ret_list[0], "long_ret"] = data_ret_mean.loc[max_coordinate[0], max_coordinate[1]]
            port_ret.loc[ret_list[0], "short_ret"] = data_ret_mean.loc[min_coordinate[0], min_coordinate[1]]

        # standardize a multi-month cumulative return (column names like
        # "ret_3_0") to a per-month return
        if "_0" in ret_list[0]:
            month = int(ret_list[0].split("_")[1])
            port_ret.loc[ret_list[0], "long_ret"] = port_ret.loc[ret_list[0], "long_ret"] / month
            port_ret.loc[ret_list[0], "short_ret"] = port_ret.loc[ret_list[0], "short_ret"] / month

        # portfolio return = long leg minus short leg
        port_ret["port_ret"] = port_ret["long_ret"] - port_ret["short_ret"]

        return port_ret
+
# port_ret time series: long the (disp1, noa1, rmax5_21d1) corner and short
# the (disp5, noa5, rmax5_21d5) corner, for the full sample and per regime
disp_noa_rmax5_21d_ts_cond = [{"pred1_sort": "disp1", "pred2_sort": "noa1", "pred3_sort": "rmax5_21d1"}, {"pred1_sort": "disp5", "pred2_sort": "noa5", "pred3_sort": "rmax5_21d5"}]
disp_noa_rmax5_21d_ls_ret_lead1m_ts, stop_loss_date, disp_noa_rmax5_21d_l_ret_lead1m_ts, disp_noa_rmax5_21d_s_ret_lead1m_ts = MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, type = "ts", stop_loss=stop_loss)
disp_noa_rmax5_21d_ls_ret_lead1m_downturn_ts, _, _, _= MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, regime = 0, type = "ts", stop_loss=stop_loss)
disp_noa_rmax5_21d_ls_ret_lead1m_upturn_ts, _, _, _ = MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, regime = 1, type = "ts", stop_loss=stop_loss)

# update freq table, ret table, ts table: zero out the returns on the
# stop-loss suspension dates, then redo the sorts on the adjusted data
ret_list = ["ret_lead1m"]
data.loc[data["date"].isin(stop_loss_date), ret_list[0]] = 0

_, disp_noa_rmax5_21d_ret_lead1m_table, disp_noa_rmax5_21d_ret_lead1m_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = None)
_, disp_noa_rmax5_21d_ret_lead1m_downturn_table, disp_noa_rmax5_21d_ret_lead1m_downturn_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 0)
_, disp_noa_rmax5_21d_ret_lead1m_upturn_table, disp_noa_rmax5_21d_ret_lead1m_upturn_ts_table = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 1)

# recompute the long-short series on the adjusted tables
# NOTE(review): stop_loss is hard-coded to -0.1 here; presumably it should
# reference the module-level stop_loss as the two calls below do -- confirm
disp_noa_rmax5_21d_ls_ret_lead1m_ts, stop_loss_date, disp_noa_rmax5_21d_l_ret_lead1m_ts, disp_noa_rmax5_21d_s_ret_lead1m_ts = MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, type = "ts", stop_loss=-0.1)
disp_noa_rmax5_21d_ls_ret_lead1m_downturn_ts, _, _, _= MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, regime = 0, type = "ts", stop_loss=stop_loss)
disp_noa_rmax5_21d_ls_ret_lead1m_upturn_ts, _, _, _ = MultiPredictorLSRet(predictors, cond = disp_noa_rmax5_21d_ts_cond, mv_factor=False, regime = 1, type = "ts", stop_loss=stop_loss)
+
Performance¶
def ThreePredictorPlot(data, title, HTML = False):
    """Visualize the 5x5x5 triple-sorted portfolio returns as a 3-D scatter.

    Parameters
    ----------
    data : DataFrame whose first two columns are sort labels and whose
        remaining cells are the 125 portfolio returns (assumption based on
        the `.iloc[:, 2:]` slice — confirm against MultiPredictorSort output).
    title : str
        Underscore-joined predictor names, e.g. "disp_noa_rmax5_21d"; also
        used as the output file name.
    HTML : bool
        True -> write an interactive plotly HTML figure;
        False -> save and show a static matplotlib PNG.

    Returns None; the figure on disk is the side effect.
    """
    # drop the first two predictor-label columns; keep only return values
    data_model = data.iloc[:, 2:]

    # Recover predictor names from the title.  Multi-underscore predictor
    # names (prc_highprc_252d, rmax5_21d, rmax1_21d) are broken apart by the
    # split("_"), so re-join their fragments and drop the leftover pieces.
    predictors = title.split("_")
    if "prc_highprc_252d" in title:
        predictors = [p.replace("252d", "prc_highprc_252d") for p in predictors]
        excluded_items = ["highprc", "prc"]
        predictors = [p for p in predictors if p not in excluded_items]
    if "rmax5_21d" in title:
        predictors = [p.replace("rmax5", "rmax5_21d") for p in predictors]
        excluded_items = ["21d"]
        predictors = [p for p in predictors if p not in excluded_items]
    if "rmax1_21d" in title:
        predictors = [p.replace("rmax1", "rmax1_21d") for p in predictors]
        excluded_items = ["21d"]
        predictors = [p for p in predictors if p not in excluded_items]

    # reshape to 3d matrix, order is [predictor2, predictor3, predictor1];
    # if the table does not hold exactly 125 cells, silently skip the plot
    # (best-effort behaviour kept deliberately)
    try:
        data_model = data_model.values.reshape((5, 5, 5))
    except ValueError:
        return

    # quintile coordinates 1..5 on every axis
    x, y, z = np.meshgrid(np.arange(1, 1 + data_model.shape[0]), np.arange(1, 1 + data_model.shape[1]), np.arange(1, 1 + data_model.shape[2]))

    # point size grows exponentially with return: exp(r)**500 == exp(500*r).
    # NOTE(review): this overflows to inf for returns above ~1.4 — assumes
    # small monthly magnitudes; confirm upstream.  The minimum-return point
    # is pinned to a fixed size of 500 so it stays visible.
    point_size = np.exp(data_model.flatten()) ** 500
    point_size[np.argmin(data_model)] = 500

    # red-ish for positive cells, green-ish for negative; highlight the max
    # cell in red and the min cell in blue (if positive) or dark green (if not)
    colors = ["#eb9c9e" if value > 0 else "#5fcd53" for value in data_model.flatten()]
    colors[np.argmax(data_model)] = "red"
    if np.min(data_model) > 0:
        colors[np.argmin(data_model)] = "#519aba"
    if np.min(data_model) <= 0:
        colors[np.argmin(data_model)] = "#70a187"

    if not HTML:
        # static matplotlib 3-D scatter
        fig = plt.figure(figsize = (11, 11))
        ax = fig.add_subplot(111, projection='3d')
        sc = ax.scatter(x.flatten(), y.flatten(), z.flatten(), s = point_size, c = colors, alpha = 0.6)

        # title
        ax.set_title(title)

        # make panes transparent and grids light grey
        ax.xaxis.pane.fill = False
        ax.yaxis.pane.fill = False
        ax.zaxis.pane.fill = False
        ax.xaxis._axinfo["grid"]["color"] = "#d9d9d9"
        ax.yaxis._axinfo["grid"]["color"] = "#d9d9d9"
        ax.zaxis._axinfo["grid"]["color"] = "#d9d9d9"

        # axis line color, labels, tick colors and tick values
        ax.xaxis.line.set_color('#c3c3c3')
        ax.yaxis.line.set_color('#c3c3c3')
        ax.zaxis.line.set_color('#c3c3c3')
        ax.set_xlabel(predictors[1])
        ax.set_ylabel(predictors[2])
        ax.set_zlabel(predictors[0])
        ax.tick_params(axis='x', colors='#c3c3c3')
        ax.tick_params(axis='y', colors='#c3c3c3')
        ax.tick_params(axis='z', colors='#c3c3c3')
        ax.set_xticks(np.arange(1, 6))
        ax.set_yticks(np.arange(1, 6))
        ax.set_zticks(np.arange(1, 6))

        # invert x, y axes and set the viewing angle
        ax.invert_xaxis()
        ax.invert_yaxis()
        ax.view_init(elev=25, azim=125)

        # proxy-artist legend explaining the color scheme
        red_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=7, label='Max', alpha = 0.6)
        blue_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#519aba', markersize=7, label='Min & +', alpha = 0.6)
        green_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#70a187', markersize=7, label='Min & -', alpha = 0.6)
        positive_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#eb9c9e', markersize=7, label='+', alpha = 0.6)
        negative_patch = plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#5fcd53', markersize=7, label='-', alpha = 0.6)
        plt.legend(handles=[red_patch, blue_patch, green_patch, positive_patch, negative_patch], loc = 'upper right', bbox_to_anchor=(1, 0.99), ncol = 5, frameon = False, fontsize = 10)

        # save the plot (assumes ../../plot/{plot_folder}/ exists)
        plt.savefig(f"../../plot/{plot_folder}/{title}.png", dpi = dpi, transparent = transparent)
        plt.show()
    else:
        # interactive plotly 3-D scatter
        fig = go.Figure(data=[go.Scatter3d(
            x=x.flatten(),
            y=y.flatten(),
            z=z.flatten(),
            mode='markers',
            marker=dict(
                size=point_size/20,
                color=colors,
                opacity=0.8
            ),
            name = "Return"
        )])

        fig.update_layout(
            title = title,
            title_x = 0.5,
            scene = dict(
                xaxis_title=predictors[1],
                yaxis_title=predictors[2],
                zaxis_title=predictors[0],
                xaxis = dict(tickvals = np.arange(1, 6), ticktext = np.arange(1, 6)),
                yaxis = dict(tickvals = np.arange(1, 6), ticktext = np.arange(1, 6)),
                zaxis = dict(tickvals = np.arange(1, 6), ticktext = np.arange(1, 6)),
            ),
            autosize=True,
            width=800,
            height=800,
            margin=dict(
                l=65,
                r=65,
                b=65,
                t=90
            )
        )

        # dummy traces so the color legend shows in the HTML figure
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', marker=dict(size=10, color='red'), name='Max'))
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', marker=dict(size=10, color='#519aba'), name='Min & +'))
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', marker=dict(size=10, color='#70a187'), name='Min & -'))
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', marker=dict(size=10, color='#eb9c9e'), name='+'))
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', marker=dict(size=10, color='#5fcd53'), name='-'))
        # TODO(review): trace[0] ("Return") still appears in the legend

        # save the plot (assumes ../../data/3_predictor/{title}/ exists)
        fig.write_html(f"../../data/3_predictor/{title}/{title}.html", config = dict(displayModeBar = False))
+
# return visualization: 3-D scatter of the 5x5x5 sorted-portfolio returns,
# exported once as interactive HTML and once as a static PNG
title = "_".join(predictors)

_ = ThreePredictorPlot(disp_noa_rmax5_21d_ret_lead1m_table.reset_index(), title, HTML=True)
_ = ThreePredictorPlot(disp_noa_rmax5_21d_ret_lead1m_table.reset_index(), title, HTML=False)
+
def LSRetPlot(port_ret, predictors, downturn = None, upturn = None, spx_data = None, start_date = None, end_date = None, mv_factor = False, plot_type = "ts", var = False, spx = False):
    """Plot the long-short portfolio return as a time series or a histogram.

    Parameters
    ----------
    port_ret : DataFrame with a DatetimeIndex and a "ret_lead1m" column.
    predictors : list of predictor names, joined to build the plot/file name.
    downturn, upturn : optional tables of regime episodes, accessed as
        .loc[i][0][0] / .loc[i][0][1] (start / end date per episode —
        confirm shape against the upstream regime-classification code).
    spx_data : SPX price series, drawn on a secondary axis when spx=True.
    start_date, end_date : optional "YYYY-MM-DD" strings; default to the
        full index range of port_ret.
    mv_factor : bool — prefix the name with "mv_" for value-weighted runs.
    plot_type : "ts" (time series) or "dist" (histogram).
    var : in "dist" mode, also draw the 99% VaR line.
    spx : in "ts" mode, overlay SPX with regime coloring; requires
        downturn, upturn and spx_data to be supplied.

    Side effects: saves a PNG under ../../plot/{plot_folder}/ and shows it.
    """
    plot_name = "_".join(predictors)
    if mv_factor:
        plot_name = "mv_" + plot_name
    # fix: compare with `is None` (PEP 8) instead of `== None`
    if start_date is None:
        start_date = port_ret.index[0]
        start_date = start_date.strftime("%Y-%m-%d")
    if end_date is None:
        end_date = port_ret.index[-1]
        end_date = end_date.strftime("%Y-%m-%d")

    # restrict to the requested window
    ret = port_ret["ret_lead1m"][start_date:end_date]

    plt.figure(figsize = (15, 10))

    if plot_type == "ts":
        if downturn is None:
            plt.plot(ret, color = '#4a8cff', label = "Portfolio Return")
            plt.title(f"Portfolio Return (M) {plot_name} {start_date} to {end_date}", color = '#595959')
        if downturn is not None:
            # draw each regime episode as its own segment; label only the
            # first episode so the legend gets one entry per regime
            for i in downturn.index:
                if i == 0:
                    plt.plot(ret[downturn.loc[i][0][0]:downturn.loc[i][0][1]], color = 'red', alpha = 0.4, label = "Portfolio Downturn Return")
                else:
                    plt.plot(ret[downturn.loc[i][0][0]:downturn.loc[i][0][1]], color = 'red', alpha = 0.4)
            for i in upturn.index:
                if i == 0:
                    plt.plot(ret[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff', alpha = 0.4, label = "Portfolio Upturn Return")
                else:
                    plt.plot(ret[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff', alpha = 0.4)

        plt.axhline(y=0, color='r', linestyle='--', label = "0", alpha = 0.4)
        plt.xlabel("Date", color = '#595959')
        plt.ylabel(f"{plot_name} Return", color = '#595959')
        plt.legend(loc='upper left', frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 4)
        if spx:
            # axis color, tick color, label color, grid
            plt.gca().spines['top'].set_visible(False)
            plt.gca().spines['left'].set_color('#d9d9d9')
            plt.gca().spines['right'].set_color('#d9d9d9')
            plt.gca().spines['bottom'].set_color('#d9d9d9')
            plt.tick_params(axis='x', colors='#d9d9d9')
            plt.tick_params(axis='y', colors='#d9d9d9')
            plt.xticks(color='#595959')
            plt.yticks(color='#595959')

            # overlay SPX for the same period on a secondary y-axis:
            # left axis is portfolio return, right axis is the SPX level.
            # NOTE(review): spx=True assumes downturn/upturn are not None.
            ax2 = plt.gca().twinx()
            for i in downturn.index:
                if i == 0:
                    plt.plot(spx_data[downturn.loc[i][0][0]:downturn.loc[i][0][1]], color = 'red', alpha = 1, label = "Downturn")
                else:
                    plt.plot(spx_data[downturn.loc[i][0][0]:downturn.loc[i][0][1]], color = 'red', alpha = 1)
            for i in upturn.index:
                if i == 0:
                    plt.plot(spx_data[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff', label="Upturn")
                else:
                    plt.plot(spx_data[upturn.loc[i][0][0]:upturn.loc[i][0][1]], color = '#4a8cff')
            plt.legend(loc='upper right', frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 4)
            plt.title(f"Portfolio Return (M) {plot_name} & SPX500 {start_date} to {end_date}", color = '#595959')
            ax2.set_ylabel("SPX 500", color = '#595959')

    if plot_type == "dist":
        plt.hist(ret, bins = 200, color = '#4a8cff', edgecolor = 'grey', label = "Portfolio Return")
        if var:
            plt.axvline(np.percentile(ret.dropna(), 1), color = "purple", linestyle = "--", label = "99% VaR")
        plt.axvline(0, color='b', linestyle='--', label = "0")
        plt.axvline(ret.mean(), color='r', linestyle='--', label = "Mean")
        plt.xlabel(f"{plot_name} Return", color = '#595959')
        plt.ylabel("Frequency", color = '#595959')
        plt.title(f"Portfolio Return (M) {plot_name} Distribution", color = '#595959')
        plt.legend(frameon=False, ncol = 1, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959')

    # common styling: axis color, tick color, label color, grid.
    # NOTE(review): after twinx() above, gca() is the secondary axis, so in
    # the spx branch this styles ax2 rather than the primary axis — kept
    # as-is to preserve the original appearance.
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_color('#d9d9d9')
    plt.gca().spines['bottom'].set_color('#d9d9d9')
    plt.tick_params(axis='x', colors='#d9d9d9')
    plt.tick_params(axis='y', colors='#d9d9d9')
    plt.grid(axis='y', linestyle='--', color='#d9d9d9')

    plt.xticks(color='#595959')
    plt.yticks(color='#595959')

    # save plot (assumes ../../plot/{plot_folder}/ exists)
    plt.savefig(f"../../plot/{plot_folder}/Return_{plot_name}_{start_date}_{end_date}_{plot_type}.png", dpi = dpi, transparent = transparent)

    plt.show()
+
predictors = ["disp", "noa", "rmax5_21d"]

# drop months with a return of exactly 0 (months skipped by the stop loss)
# so they do not distort the distribution / time-series plots
disp_noa_rmax5_21d_ls_ret_lead1m_ts_plot = disp_noa_rmax5_21d_ls_ret_lead1m_ts[disp_noa_rmax5_21d_ls_ret_lead1m_ts["ret_lead1m"] != 0]

# NOTE(review): `downturn`, `upturn` and `spx` come from the regime section
# earlier in the notebook
LSRetPlot(disp_noa_rmax5_21d_ls_ret_lead1m_ts_plot, predictors, plot_type = "dist", var=True)
LSRetPlot(disp_noa_rmax5_21d_ls_ret_lead1m_ts_plot, predictors, downturn = downturn, upturn = upturn, spx_data = spx, mv_factor = False, plot_type = "ts", spx = True)
+
def MultiPredictorP2FRet(predictors, ret_list, cond, type = "ts"):
    """Build the past-to-future (P2F) long-short return panel.

    For every return horizon in `ret_list` (look-back horizons such as
    "ret_12_0" and look-ahead horizons such as "ret_lead3m"), fetch the
    long-short portfolio return from MultiPredictorLSRet and collect them
    as columns of one DataFrame.

    Parameters
    ----------
    predictors : list of predictor names for the hierarchical sort.
    ret_list : list of return-column names, one per horizon.
    cond : sort-condition specification forwarded to MultiPredictorLSRet.
    type : "ts" for the monthly time series, "average" for the averaged
        portfolio return (note: shadows the builtin `type`; kept for
        backward compatibility with existing callers).

    Returns
    -------
    DataFrame indexed like the first fetched series, one column per horizon.

    Raises
    ------
    ValueError
        If `type` is neither "ts" nor "average" (the original fell through
        to a NameError on `port_ret`).
    """
    port_ret_p2f_ts = pd.DataFrame(columns = ret_list)

    for i, ret in enumerate(ret_list):
        if type == "ts":
            port_ret, _, _, _ = MultiPredictorLSRet(predictors, ret_list=[ret], cond = cond, type = "ts", mv_factor=False)
        elif type == "average":
            port_ret = MultiPredictorLSRet(predictors, ret_list=[ret], cond = cond, type = "average", mv_factor=False).T
        else:
            raise ValueError(f"unknown type {type!r}: expected 'ts' or 'average'")

        port_ret_p2f_ts[ret] = port_ret[ret]

        # align the panel index once, from the first fetched series
        if i == 0:
            port_ret_p2f_ts.index = port_ret.index

        # look-back columns (e.g. ret_12_0) hold cumulative returns over
        # `month` months — standardize them to a per-month figure
        if "_0" in ret:
            month = int(ret.split("_")[1])
            port_ret_p2f_ts[ret] = port_ret_p2f_ts[ret] / month

    return port_ret_p2f_ts
+
# One-sample t-test of every return horizon against a zero mean
def TTest(predictors, ret_list, cond = None):
    """Run a one-sample t-test (H0: mean return = 0) per horizon column."""
    panel = MultiPredictorP2FRet(predictors, ret_list, cond, type = "ts")

    # object-dtype frame built row by row, then transposed so the horizons
    # end up as columns and the two statistics as rows
    t_test = pd.DataFrame(columns = ["t-statistic", "p-value"], index = panel.columns)
    for name in panel.columns:
        sample = panel[name].dropna(how="any")
        t_test.loc[name, "t-statistic"], t_test.loc[name, "p-value"] = ttest_1samp(sample, 0)

    return t_test.T
+
# past (cumulative look-back) and future (monthly look-ahead) horizons
ret_list = ['ret_12_0', 'ret_9_0', 'ret_6_0', 'ret_3_0', 'ret_2_0', 'ret_1_0',
            'ret_lead1m', 'ret_lead2m', 'ret_lead3m', 'ret_lead4m', 'ret_lead5m', 'ret_lead6m', 'ret_lead7m', 'ret_lead8m', 'ret_lead9m', 'ret_lead10m', 'ret_lead11m', 'ret_lead12m']

# pre-generate the sorted-portfolio ts files for every horizon and regime
# (side effect of MultiPredictorSort; the returned tables are discarded)
for ret in ret_list:
    _, _, _ = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=[ret], regime = None)
    _, _, _ = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=[ret], regime = 0)
    _, _, _ = MultiPredictorSort(data, predictors, 5, mv_factor = False, ret_list=[ret], regime = 1)

# `display` is the IPython notebook built-in
display(TTest(predictors, ret_list, cond = disp_noa_rmax5_21d_ts_cond))
+
+ | ret_12_0 | +ret_9_0 | +ret_6_0 | +ret_3_0 | +ret_2_0 | +ret_1_0 | +ret_lead1m | +ret_lead2m | +ret_lead3m | +ret_lead4m | +ret_lead5m | +ret_lead6m | +ret_lead7m | +ret_lead8m | +ret_lead9m | +ret_lead10m | +ret_lead11m | +ret_lead12m | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
t-statistic | +0.279719 | +1.397854 | +0.483051 | +-5.322571 | +-10.240991 | +-22.538607 | +8.655697 | +5.820639 | +4.265612 | +5.277713 | +4.21871 | +3.429607 | +3.876551 | +3.056768 | +3.343559 | +3.133205 | +3.476274 | +2.998557 | +
p-value | +0.779805 | +0.162759 | +0.629265 | +0.0 | +0.0 | +0.0 | +0.0 | +0.0 | +0.000024 | +0.0 | +0.000029 | +0.000654 | +0.00012 | +0.002355 | +0.000888 | +0.00183 | +0.000552 | +0.002847 | +
def TestAlpha(port_ret_ts, data_ff3, ret_list, model = "CAPM"):
    """Estimate the portfolio alpha against a factor model.

    Parameters
    ----------
    port_ret_ts : DataFrame of portfolio returns, indexed by date.
    data_ff3 : DataFrame with factor columns "mktrf", "smb", "hml", "umd"
        and the risk-free rate "rf", indexed by date.
    ret_list : list whose first element names the return column to test.
    model : "CAPM" (market only), "ff3" (+ SMB, HML) or "ff4" (+ UMD).

    Returns
    -------
    (alpha, alpha_p, alpha_t) : the OLS intercept, its p-value and t-stat.

    Raises
    ------
    ValueError
        For an unknown `model` (the original fell through all branches and
        hit an UnboundLocalError on `alpha`).
    """
    # one factor-column map replaces the three copy-pasted OLS branches
    model_factors = {
        "CAPM": ["mktrf"],
        "ff3": ["mktrf", "smb", "hml"],
        "ff4": ["mktrf", "smb", "hml", "umd"],
    }
    if model not in model_factors:
        raise ValueError(f"unknown model {model!r}: expected one of {sorted(model_factors)}")

    # align portfolio returns with the factor data by date
    ret_reg = port_ret_ts.merge(data_ff3, left_index = True, right_index = True)

    # portfolio excess return over the risk-free rate
    ret_reg["port_exc_ret"] = ret_reg[ret_list[0]] - ret_reg["rf"]
    ret_reg = ret_reg.dropna(subset = ["port_exc_ret", "mktrf"], how = "any")

    # OLS of excess return on the chosen factors plus a constant;
    # the constant is the alpha
    Y = ret_reg["port_exc_ret"]
    X = sm.add_constant(ret_reg[model_factors[model]])
    results = sm.OLS(Y, X).fit()

    alpha = results.params["const"]
    alpha_p = results.pvalues["const"]
    alpha_t = results.tvalues["const"]

    return alpha, alpha_p, alpha_t
+
# Alpha estimates for the combined L/S portfolio and each leg, under CAPM,
# FF3 and FF4 (data_ff3 also carries the UMD column used by "ff4").
disp_noa_rmax5_21d_CAPM_alpha, disp_noa_rmax5_21d_CAPM_alpha_p_value, disp_noa_rmax5_21d_CAPM_alpha_t = TestAlpha(disp_noa_rmax5_21d_ls_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "CAPM")
disp_noa_rmax5_21d_ff3_alpha, disp_noa_rmax5_21d_ff3_alpha_p_value, disp_noa_rmax5_21d_ff3_alpha_t = TestAlpha(disp_noa_rmax5_21d_ls_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff3")
disp_noa_rmax5_21d_l_CAPM_alpha, disp_noa_rmax5_21d_l_CAPM_alpha_p_value, disp_noa_rmax5_21d_l_CAPM_alpha_t = TestAlpha(disp_noa_rmax5_21d_l_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "CAPM")
disp_noa_rmax5_21d_l_ff3_alpha, disp_noa_rmax5_21d_l_ff3_alpha_p_value, disp_noa_rmax5_21d_l_ff3_alpha_t = TestAlpha(disp_noa_rmax5_21d_l_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff3")
disp_noa_rmax5_21d_s_CAPM_alpha, disp_noa_rmax5_21d_s_CAPM_alpha_p_value, disp_noa_rmax5_21d_s_CAPM_alpha_t = TestAlpha(disp_noa_rmax5_21d_s_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "CAPM")
disp_noa_rmax5_21d_s_ff3_alpha, disp_noa_rmax5_21d_s_ff3_alpha_p_value, disp_noa_rmax5_21d_s_ff3_alpha_t = TestAlpha(disp_noa_rmax5_21d_s_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff3")
disp_noa_rmax5_21d_ff4_alpha, disp_noa_rmax5_21d_ff4_alpha_p_value, disp_noa_rmax5_21d_ff4_alpha_t = TestAlpha(disp_noa_rmax5_21d_ls_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff4")
disp_noa_rmax5_21d_l_ff4_alpha, disp_noa_rmax5_21d_l_ff4_alpha_p_value, disp_noa_rmax5_21d_l_ff4_alpha_t = TestAlpha(disp_noa_rmax5_21d_l_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff4")
disp_noa_rmax5_21d_s_ff4_alpha, disp_noa_rmax5_21d_s_ff4_alpha_p_value, disp_noa_rmax5_21d_s_ff4_alpha_t = TestAlpha(disp_noa_rmax5_21d_s_ret_lead1m_ts, data_ff3, ["ret_lead1m"], model = "ff4")

# summary table: rows = portfolio legs, columns = model x statistic
disp_noa_rmax5_21d_alpha_table = pd.DataFrame(columns = ["CAPM Alpha", "CAPM p-value", "CAPM t-statistic", "FF3 Alpha", "FF3 p-value", "FF3 t-statistic", "FF4 Alpha", "FF4 p-value", "FF4 t-statistic"], index = ["All", "Long", "Short"])
disp_noa_rmax5_21d_alpha_table.loc["All", :] = [disp_noa_rmax5_21d_CAPM_alpha, disp_noa_rmax5_21d_CAPM_alpha_p_value, disp_noa_rmax5_21d_CAPM_alpha_t, disp_noa_rmax5_21d_ff3_alpha, disp_noa_rmax5_21d_ff3_alpha_p_value, disp_noa_rmax5_21d_ff3_alpha_t, disp_noa_rmax5_21d_ff4_alpha, disp_noa_rmax5_21d_ff4_alpha_p_value, disp_noa_rmax5_21d_ff4_alpha_t]
disp_noa_rmax5_21d_alpha_table.loc["Long", :] = [disp_noa_rmax5_21d_l_CAPM_alpha, disp_noa_rmax5_21d_l_CAPM_alpha_p_value, disp_noa_rmax5_21d_l_CAPM_alpha_t, disp_noa_rmax5_21d_l_ff3_alpha, disp_noa_rmax5_21d_l_ff3_alpha_p_value, disp_noa_rmax5_21d_l_ff3_alpha_t, disp_noa_rmax5_21d_l_ff4_alpha, disp_noa_rmax5_21d_l_ff4_alpha_p_value, disp_noa_rmax5_21d_l_ff4_alpha_t]
disp_noa_rmax5_21d_alpha_table.loc["Short", :] = [disp_noa_rmax5_21d_s_CAPM_alpha, disp_noa_rmax5_21d_s_CAPM_alpha_p_value, disp_noa_rmax5_21d_s_CAPM_alpha_t, disp_noa_rmax5_21d_s_ff3_alpha, disp_noa_rmax5_21d_s_ff3_alpha_p_value, disp_noa_rmax5_21d_s_ff3_alpha_t, disp_noa_rmax5_21d_s_ff4_alpha, disp_noa_rmax5_21d_s_ff4_alpha_p_value, disp_noa_rmax5_21d_s_ff4_alpha_t]

display(disp_noa_rmax5_21d_alpha_table)
+
+ | CAPM Alpha | +CAPM p-value | +CAPM t-statistic | +FF3 Alpha | +FF3 p-value | +FF3 t-statistic | +FF4 Alpha | +FF4 p-value | +FF4 t-statistic | +
---|---|---|---|---|---|---|---|---|---|
All | +0.026483 | +0.0 | +7.961592 | +0.025897 | +0.0 | +7.743233 | +0.026034 | +0.0 | +7.644343 | +
Long | +0.012179 | +0.0 | +6.509637 | +0.012426 | +0.0 | +6.597473 | +0.012761 | +0.0 | +6.659471 | +
Short | +-0.017588 | +0.000024 | +-4.259169 | +-0.016739 | +0.000063 | +-4.03399 | +-0.016498 | +0.000107 | +-3.904767 | +
# 99% VaR in return, win rate
def VaRWin(port_ret, var = 0.99):
    """Compute the Value-at-Risk and win rate of a monthly return series.

    Parameters
    ----------
    port_ret : pd.Series of monthly returns; NaN entries are ignored.
    var : confidence level; 0.99 gives the 1st-percentile return (99% VaR).

    Returns
    -------
    (var_value, win_rate) : the (1 - var) percentile of the returns and the
        share of strictly positive months among non-NaN observations.

    Raises IndexError/ZeroDivisionError if the series is empty after
    dropping NaNs (unchanged from the original behaviour for empty input).
    """
    # drop NaNs once so the VaR and the win rate use the same sample;
    # the original divided the win count by len(port_ret) INCLUDING NaNs,
    # which understated the win rate whenever the series had missing months
    clean_ret = port_ret.dropna()
    var99 = np.percentile(clean_ret, (1 - var) * 100)
    win_rate = len(clean_ret[clean_ret > 0]) / len(clean_ret)
    return var99, win_rate
+
# headline risk statistics for the 3-predictor L/S portfolio
disp_noa_rmax5_21d_ls_ret_lead1m_var99, disp_noa_rmax5_21d_ls_ret_lead1m_win_rate = VaRWin(disp_noa_rmax5_21d_ls_ret_lead1m_ts["ret_lead1m"])

# one-row summary table for notebook display
disp_noa_rmax5_21d_var_win_table = pd.DataFrame(columns = ["99% VaR", "Win Rate"], index = ["Portfolio"])
disp_noa_rmax5_21d_var_win_table.loc["Portfolio", :] = [disp_noa_rmax5_21d_ls_ret_lead1m_var99, disp_noa_rmax5_21d_ls_ret_lead1m_win_rate]
display(disp_noa_rmax5_21d_var_win_table)
+
+ | 99% VaR | +Win Rate | +
---|---|---|
Portfolio | +-0.157376 | +0.615534 | +
def PlotP2F(predictors_list, port_ret, ret_list, plot_type = "LSP", color_scheme = ["red", "#e0a794", "#a5d3eb", "#3ca7ed", "#56bb74", "#c1a3d7", "#b19d94", "#361e6a"]):
    """Plot past-to-future (-12M..+12M) portfolio returns.

    plot_type "LSP": one subplot per row type (long/short/portfolio),
    comparing every predictor set in predictors_list.
    plot_type "port": a single panel with the long, short and combined
    return of one predictor set.

    port_ret is the vertically concatenated average-return table, assumed
    to hold 3 rows (long_ret, short_ret, port_ret) per predictor set —
    confirm against MultiPredictorLSRet's "average" output.
    """
    # horizon axis: "ret_12_0" -> -12, "ret_lead3m" -> +3
    x = []
    for i, ret in enumerate(ret_list):
        if "_0" in ret:
            x.append(-int(ret.split("_")[1]))
        elif "lead" in ret:
            x.append(int(ret.split("lead")[1][:-1]))

    legend_list = ["_".join(predictors) for predictors in predictors_list]

    if plot_type == "LSP":
        plt.figure(figsize=(15, 10))
        # NOTE(review): rows-per-group is derived as (nrows + 1) // ngroups;
        # with 12 rows and 4 groups this yields 3, but the `+ 1` looks like
        # a leftover — verify for other group counts.
        for i in range(int((port_ret.shape[0] + 1) / len(predictors_list))):
            # subplot per row type (long / short / portfolio)
            plt.subplot(int((port_ret.shape[0] + 1) / len(predictors_list)), 1, i + 1)
            for j in range(len(predictors_list)):
                # TODO(review): the stride 3 hard-codes 3 rows per predictor
                # group; keep in sync with the concat order of the caller
                plt.plot(x, port_ret.iloc[i + j * 3, :], color = color_scheme[j], label = legend_list[j])

            # axis color, tick color, label color, grid
            plt.gca().spines['top'].set_visible(False)
            plt.gca().spines['right'].set_visible(False)
            plt.gca().spines['left'].set_color('#d9d9d9')
            plt.gca().spines['bottom'].set_color('#d9d9d9')
            plt.tick_params(axis='x', colors='#d9d9d9')
            plt.tick_params(axis='y', colors='#d9d9d9')
            plt.grid(axis='y', linestyle='--', color='#d9d9d9')
            plt.locator_params(axis='y', nbins=5)
            plt.xticks(color='#595959')
            plt.yticks(color='#595959')

            plt.xlabel("Time (M)", color = '#595959')
            plt.ylabel(f"{port_ret.index[i]}", color = '#595959')

            # HACK: legend position per subplot is hand-tuned; revisit
            position = "lower right"
            if i == 1:
                position = "upper left"
            plt.legend(loc=position, frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 2)

        plt.subplots_adjust(top=0.95)
        plt.suptitle(f"Comparison Between Predictors(-12M to 12M)", color = '#595959')

        # save plot (assumes ../../plot/{plot_folder}/ exists)
        plt.savefig(f"../../plot/{plot_folder}/lsp_{legend_list[0]}.png", dpi = dpi, transparent = transparent)

    if plot_type == "port":
        plt.figure(figsize = (15, 8))
        plt.plot(x, port_ret.loc["long_ret", :], color = 'red', alpha = 0.9, linestyle = "--", label = "Long")
        plt.plot(x, port_ret.loc["short_ret", :], color = 'red', alpha = 0.5, linestyle = ":", label = "Short")
        plt.plot(x, port_ret.loc["port_ret", :], color = '#4a8cff', label = "Portfolio")

        # axis color, tick color, label color, grid
        plt.gca().spines['top'].set_visible(False)
        plt.gca().spines['right'].set_visible(False)
        plt.gca().spines['left'].set_color('#d9d9d9')
        plt.gca().spines['bottom'].set_color('#d9d9d9')
        plt.tick_params(axis='x', colors='#d9d9d9')
        plt.tick_params(axis='y', colors='#d9d9d9')
        plt.grid(axis='y', linestyle='--', color='#d9d9d9')

        # x-axis spans -12..12 months
        plt.locator_params(axis='x', nbins=25)
        plt.xticks(color='#595959')
        plt.yticks(color='#595959')

        plt.xlabel("Time (M)", color = '#595959')
        plt.ylabel(f"Return", color = '#595959')
        plt.title(f"Source of Return Comparison {legend_list[0]}", color = '#595959')
        plt.legend(loc='upper left', frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 3)

        # save plot (assumes ../../plot/{plot_folder}/ exists)
        plt.savefig(f"../../plot/{plot_folder}/port_{legend_list[0]}.png", dpi = dpi, transparent = transparent)

    plt.show()
+
# -12m to 12m performance comparison across predictor sets
# NOTE(review): ret_list_future is defined here but the calls below use the
# 18-horizon `ret_list` from the t-test section — confirm that is intended.
ret_list_future = ['ret_lead1m', 'ret_lead2m', 'ret_lead3m', 'ret_lead4m', 'ret_lead5m', 'ret_lead6m', 'ret_lead7m', 'ret_lead8m', 'ret_lead9m', 'ret_lead10m', 'ret_lead11m', 'ret_lead12m']
predictors_list = [["disp"], ["disp", "rmax5_21d"], ["disp", "noa"], ["disp", "noa", "rmax5_21d"]]
# average-sort conditions: [bottom-cell index/label, top-cell index/label];
# presumably 24 is the flattened top corner of the 5x5 second-level sort —
# verify against MultiPredictorLSRet's "average" convention
disp_noa_rmax5_21d_average_cond = [[0, "disp1"], [24, "disp5"]]
disp_noa_average_cond = [[0, "disp1"], [4, "disp5"]]
disp_rmax5_21d_average_cond = [[0, "disp1"], [4, "disp5"]]
disp_average_cond = [[0], [4]]

# calculate average return over all horizons for each predictor portfolio
disp_noa_rmax5_21d_ret_lead1m_p2f = MultiPredictorP2FRet(predictors, ret_list, cond = disp_noa_rmax5_21d_average_cond, type = "average")
disp_noa_ret_lead1m_p2f = MultiPredictorP2FRet(["disp", "noa"], ret_list, cond = disp_noa_average_cond, type = "average")
disp_rmax5_21d_ret_lead1m_p2f = MultiPredictorP2FRet(["disp", "rmax5_21d"], ret_list, cond = disp_rmax5_21d_average_cond, type = "average")
disp_ret_lead1m_p2f = MultiPredictorP2FRet(["disp"], ret_list, cond = disp_average_cond, type = "average")

# stacked comparison across predictor sets, then the single-set breakdown
PlotP2F(predictors_list, pd.concat([disp_ret_lead1m_p2f, disp_rmax5_21d_ret_lead1m_p2f, disp_noa_ret_lead1m_p2f, disp_noa_rmax5_21d_ret_lead1m_p2f], axis=0), ret_list, plot_type = "LSP")
PlotP2F([predictors], disp_noa_rmax5_21d_ret_lead1m_p2f, ret_list, plot_type = "port")
+
def RegimeBarPlot(ret_comp, predictors_list):
    """Draw a grouped bar chart comparing portfolio returns across regimes.

    ret_comp: DataFrame whose rows are regime groups and whose columns are
    the underscore-joined predictor names; predictors_list supplies one bar
    series (and legend entry) per predictor set.
    """
    labels = ["_".join(p) for p in predictors_list]
    palette = ["#3366d6", "#4a8cff", "#71a3f7", "#a0c2fa"]

    plt.figure(figsize = (15, 10))

    bar_width = 0.2
    base = np.arange(len(ret_comp.index))
    # one bar series per predictor set, each shifted right by one bar width
    for pos, label in enumerate(labels):
        plt.bar(base + pos * bar_width, ret_comp[label], color = palette[pos], width = bar_width, edgecolor = palette[pos], label = label)

    plt.xticks([g + bar_width for g in range(len(ret_comp.index))], ret_comp.index, color='#595959')
    plt.yticks(color='#595959')

    plt.xlabel("Group", color = '#595959')
    plt.ylabel("Return", color = '#595959')
    plt.title("Return Comparison", color = '#595959')
    plt.legend(loc='upper left', frameon=False, fontsize = 12, facecolor = 'none', edgecolor = 'none', labelcolor = '#595959', ncol = 2)

    # de-emphasise the axes: hide top/right spines, grey out the rest
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#d9d9d9')
    ax.spines['bottom'].set_color('#d9d9d9')
    ax.tick_params(axis='x', colors='#d9d9d9')
    ax.tick_params(axis='y', colors='#d9d9d9')
    ax.grid(axis='y', linestyle='--', color='#d9d9d9')

    # export then display
    plt.savefig(f"../../plot/{plot_folder}/ret_comp_{labels[0]}.png", dpi = dpi, transparent = transparent)

    plt.show()
+
# generate the sorted-portfolio files for the 2- and 1-predictor baselines
# (side effect of MultiPredictorSort; only the tables we use are kept)
_, disp_noa_ret_lead1m_table, _ = MultiPredictorSort(data, ["disp", "noa"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = None)
_, disp_noa_ret_lead1m_downturn_table, _ = MultiPredictorSort(data, ["disp", "noa"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 0)
_, disp_noa_ret_lead1m_upturn_table, _ = MultiPredictorSort(data, ["disp", "noa"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 1)

_, disp_rmax5_21d_ret_lead1m_table, _ = MultiPredictorSort(data, ["disp", "rmax5_21d"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = None)
_, disp_rmax5_21d_ret_lead1m_downturn_table, _ = MultiPredictorSort(data, ["disp", "rmax5_21d"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 0)
_, disp_rmax5_21d_ret_lead1m_upturn_table, _ = MultiPredictorSort(data, ["disp", "rmax5_21d"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 1)

_, disp_ret_lead1m_table, _ = MultiPredictorSort(data, ["disp"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = None)
_, disp_ret_lead1m_downturn_table, _ = MultiPredictorSort(data, ["disp"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 0)
_, disp_ret_lead1m_upturn_table, _ = MultiPredictorSort(data, ["disp"], 5, mv_factor = False, ret_list=["ret_lead1m"], regime = 1)

# average long/short/portfolio returns per predictor set: all months,
# downturn-only (regime = 0) and upturn-only (regime = 1)
disp_noa_rmax5_21d_ret_lead1m_average = MultiPredictorLSRet(predictors, mv_factor = False, cond = disp_noa_rmax5_21d_average_cond, type = "average")
disp_noa_ret_lead1m_average = MultiPredictorLSRet(["disp", "noa"], mv_factor = False, cond = disp_noa_average_cond, type = "average")
disp_rmax5_21d_ret_lead1m_average = MultiPredictorLSRet(["disp", "rmax5_21d"], mv_factor = False, cond = disp_rmax5_21d_average_cond, type = "average")
disp_ret_lead1m_average = MultiPredictorLSRet(["disp"], mv_factor = False, cond = disp_average_cond, type = "average")

disp_noa_rmax5_21d_ret_lead1m_downturn_average = MultiPredictorLSRet(predictors, mv_factor = False, cond = disp_noa_rmax5_21d_average_cond, type = "average", regime = 0)
disp_noa_ret_lead1m_downturn_average = MultiPredictorLSRet(["disp", "noa"], mv_factor = False, cond = disp_noa_average_cond, type = "average", regime = 0)
disp_rmax5_21d_ret_lead1m_downturn_average = MultiPredictorLSRet(["disp", "rmax5_21d"], mv_factor = False, cond = disp_rmax5_21d_average_cond, type = "average", regime = 0)
disp_ret_lead1m_downturn_average = MultiPredictorLSRet(["disp"], mv_factor = False, cond = disp_average_cond, type = "average", regime = 0)

disp_noa_rmax5_21d_ret_lead1m_upturn_average = MultiPredictorLSRet(predictors, mv_factor = False, cond = disp_noa_rmax5_21d_average_cond, type = "average", regime = 1)
disp_noa_ret_lead1m_upturn_average = MultiPredictorLSRet(["disp", "noa"], mv_factor = False, cond = disp_noa_average_cond, type = "average", regime = 1)
disp_rmax5_21d_ret_lead1m_upturn_average = MultiPredictorLSRet(["disp", "rmax5_21d"], mv_factor = False, cond = disp_rmax5_21d_average_cond, type = "average", regime = 1)
disp_ret_lead1m_upturn_average = MultiPredictorLSRet(["disp"], mv_factor = False, cond = disp_average_cond, type = "average", regime = 1)

# Stack the four predictor sets.  The stacking order here (noa before
# rmax5_21d) differs from predictors_list's order; that is harmless because
# RegimeBarPlot looks columns up by name, not by position.
multi_predictor_ret_lead1m_average = pd.concat([disp_ret_lead1m_average, disp_noa_ret_lead1m_average, disp_rmax5_21d_ret_lead1m_average, disp_noa_rmax5_21d_ret_lead1m_average], axis = 0)
multi_predictor_ret_lead1m_downturn_average = pd.concat([disp_ret_lead1m_downturn_average, disp_noa_ret_lead1m_downturn_average, disp_rmax5_21d_ret_lead1m_downturn_average, disp_noa_rmax5_21d_ret_lead1m_downturn_average], axis = 0)
multi_predictor_ret_lead1m_upturn_average = pd.concat([disp_ret_lead1m_upturn_average, disp_noa_ret_lead1m_upturn_average, disp_rmax5_21d_ret_lead1m_upturn_average, disp_noa_rmax5_21d_ret_lead1m_upturn_average], axis = 0)

# regimes as rows, predictor sets as columns (after the transpose)
multi_predictor_ret_lead1m_average_comp = pd.concat([multi_predictor_ret_lead1m_average["port_ret"], multi_predictor_ret_lead1m_downturn_average["port_ret"], multi_predictor_ret_lead1m_upturn_average["port_ret"]], axis = 1)
multi_predictor_ret_lead1m_average_comp.columns = ["all", "downturn", "upturn"]
multi_predictor_ret_lead1m_average_comp.reset_index(inplace = True, drop = True)
multi_predictor_ret_lead1m_average_comp.index = ["disp", "disp_noa", "disp_rmax5_21d", "disp_noa_rmax5_21d"]
multi_predictor_ret_lead1m_average_comp = multi_predictor_ret_lead1m_average_comp.T

RegimeBarPlot(multi_predictor_ret_lead1m_average_comp, predictors_list)
+
# Fama-MacBeth regression
# step 1: for every date, cross-sectional OLS of next-month return on the
# three predictors — yields a time series of slope coefficients per date
data_fmb = data[["date", "ret_lead1m"] + predictors].copy()
data_fmb = data_fmb.dropna(subset = ["ret_lead1m"] + predictors, how = "any")
predictor_exposure = data_fmb.groupby("date").apply(lambda x: sm.OLS(x["ret_lead1m"], sm.add_constant(x[predictors].dropna(subset = predictors, how = "any"))).fit().params).reset_index()

# step 2: time-series regression of the per-date mean return on the per-date
# coefficient estimates.
# NOTE(review): this is not the canonical Fama-MacBeth second stage (which
# averages the step-1 coefficients and t-tests them) — confirm intent.
# NOTE(review): the concat relies on POSITIONAL alignment after the two
# reset_index calls; both frames come from the same groupby so the dates
# should line up, but verify.
Y = data_fmb.groupby("date")["ret_lead1m"].mean().reset_index(drop=True)
predictor_exposure = pd.concat([predictor_exposure, Y], axis = 1)

# nested model comparison: disp only -> + rmax5_21d / + noa -> full model
model1 = smf.ols("ret_lead1m ~ disp", data = predictor_exposure).fit()
model2 = smf.ols("ret_lead1m ~ disp + rmax5_21d", data = predictor_exposure).fit()
model3 = smf.ols("ret_lead1m ~ disp + noa", data = predictor_exposure).fit()
model4 = smf.ols("ret_lead1m ~ disp + noa + rmax5_21d", data = predictor_exposure).fit()

# F-test of model3 vs the full model (computed; only model4 is printed)
anova_results = sm.stats.anova_lm(model3, model4)
# print(model1.summary())
# print(model2.summary())
# print(model3.summary())
print(model4.summary())
# print(sm.stats.anova_lm(model4, typ = 2))
# print(sm.stats.anova_lm(model1, model4))
+
OLS Regression Results +============================================================================== +Dep. Variable: ret_lead1m R-squared: 0.442 +Model: OLS Adj. R-squared: 0.438 +Method: Least Squares F-statistic: 134.7 +Date: Wed, 17 Apr 2024 Prob (F-statistic): 2.69e-64 +Time: 13:09:16 Log-Likelihood: 928.37 +No. Observations: 515 AIC: -1849. +Df Residuals: 511 BIC: -1832. +Df Model: 3 +Covariance Type: nonrobust +============================================================================== + coef std err t P>|t| [0.025 0.975] +------------------------------------------------------------------------------ +Intercept 0.0179 0.002 9.556 0.000 0.014 0.022 +disp 0.6227 0.424 1.470 0.142 -0.209 1.455 +noa 0.2795 0.179 1.558 0.120 -0.073 0.632 +rmax5_21d 0.0446 0.002 18.390 0.000 0.040 0.049 +============================================================================== +Omnibus: 104.845 Durbin-Watson: 1.822 +Prob(Omnibus): 0.000 Jarque-Bera (JB): 513.784 +Skew: -0.797 Prob(JB): 2.71e-112 +Kurtosis: 7.626 Cond. No. 248. +============================================================================== + +Notes: +[1] Standard Errors assume that the covariance matrix of the errors is correctly specified. ++
# return visualization of disp, disp_noa, disp_rmax5_21d, disp_noa_rmax5_21d
# baselines: regenerate the ts tables and L/S series, then plot each
ret_list = ["ret_lead1m"]
predictors1 = ["disp"]
predictors2 = ["disp", "noa"]
predictors3 = ["disp", "rmax5_21d"]

# all-sample sorted-portfolio ts tables (side-effect files + returned tables)
_, _, disp_ret_lead1m_ts_table = MultiPredictorSort(data, predictors1, 5, mv_factor = False, ret_list=ret_list, regime = None)
_, _, disp_noa_ret_lead1m_ts_table = MultiPredictorSort(data, predictors2, 5, mv_factor = False, ret_list=ret_list, regime = None)
_, _, disp_rmax5_21d_ret_lead1m_ts_table = MultiPredictorSort(data, predictors3, 5, mv_factor = False, ret_list=ret_list, regime = None)

# long-short return time series per baseline (no explicit cond / stop loss
# here — these use MultiPredictorLSRet's defaults)
disp_ls_ret_lead1m_ts, _, _, _ = MultiPredictorLSRet(predictors1, ret_list = ret_list, mv_factor = False, type="ts")
disp_noa_ls_ret_lead1m_ts, _, _, _ = MultiPredictorLSRet(predictors2, ret_list = ret_list, mv_factor = False, type="ts")
disp_rmax5_21d_ls_ret_lead1m_ts, _, _, _ = MultiPredictorLSRet(predictors3, ret_list = ret_list, mv_factor = False, type="ts")

# drop zero-return (stop-loss) months before plotting
disp_ls_ret_lead1m_ts_plot = disp_ls_ret_lead1m_ts[disp_ls_ret_lead1m_ts["ret_lead1m"] != 0]
disp_noa_ls_ret_lead1m_ts_plot = disp_noa_ls_ret_lead1m_ts[disp_noa_ls_ret_lead1m_ts["ret_lead1m"] != 0]
disp_rmax5_21d_ls_ret_lead1m_ts_plot = disp_rmax5_21d_ls_ret_lead1m_ts[disp_rmax5_21d_ls_ret_lead1m_ts["ret_lead1m"] != 0]

# distribution with 99% VaR marker
LSRetPlot(disp_ls_ret_lead1m_ts_plot, predictors1, plot_type = "dist", var=True)
LSRetPlot(disp_noa_ls_ret_lead1m_ts_plot, predictors2, plot_type = "dist", var=True)
LSRetPlot(disp_rmax5_21d_ls_ret_lead1m_ts_plot, predictors3, plot_type = "dist", var=True)

# time series with regime coloring and the SPX overlay
LSRetPlot(disp_ls_ret_lead1m_ts_plot, predictors1, downturn = downturn, upturn = upturn, spx_data = spx, mv_factor = False, plot_type = "ts", spx = True)
LSRetPlot(disp_noa_ls_ret_lead1m_ts_plot, predictors2, downturn = downturn, upturn = upturn, spx_data = spx, mv_factor = False, plot_type = "ts", spx = True)
LSRetPlot(disp_rmax5_21d_ls_ret_lead1m_ts_plot, predictors3, downturn = downturn, upturn = upturn, spx_data = spx, mv_factor = False, plot_type = "ts", spx = True)

# plain time series without regime overlay
LSRetPlot(disp_ls_ret_lead1m_ts_plot, predictors1, mv_factor = False, plot_type = "ts")
LSRetPlot(disp_noa_ls_ret_lead1m_ts_plot, predictors2, mv_factor = False, plot_type = "ts")
LSRetPlot(disp_rmax5_21d_ls_ret_lead1m_ts_plot, predictors3, mv_factor = False, plot_type = "ts")
+