From 97c9db1af8b2961afd1721c6cd3a1bdfccdb2f5a Mon Sep 17 00:00:00 2001 From: Dominik Stiller Date: Wed, 13 Nov 2024 14:09:37 -0800 Subject: [PATCH] Ignore nans in correlation for duplicate finding --- cfr/proxy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfr/proxy.py b/cfr/proxy.py index 32aa79e..a07c4bf 100644 --- a/cfr/proxy.py +++ b/cfr/proxy.py @@ -1373,7 +1373,7 @@ def find_duplicates(self, r_thresh=0.9, time_period=[0, 2000]): mask = (df_proxy.index>=time_period[0]) & (df_proxy.index<=time_period[-1]) df_proxy = df_proxy[mask] pid_list = df_proxy.columns.values - R = np.triu(np.corrcoef(df_proxy.values.T), k=1) + R = np.triu(df_proxy.corr(), k=1) R[R==0] = np.nan di, dj = np.where(R >= r_thresh) dup_pids = [] @@ -1972,4 +1972,4 @@ def count_availability(self, year=np.arange(2001)): for ptype in df_count.keys(): df_count[ptype]['Sum'] = df_count[ptype].sum(axis=1).astype(int) - return df_count \ No newline at end of file + return df_count