From 97c9db1af8b2961afd1721c6cd3a1bdfccdb2f5a Mon Sep 17 00:00:00 2001
From: Dominik Stiller <dstiller@uw.edu>
Date: Wed, 13 Nov 2024 14:09:37 -0800
Subject: [PATCH] Ignore NaNs in correlation when finding duplicates

---
 cfr/proxy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cfr/proxy.py b/cfr/proxy.py
index 32aa79e..a07c4bf 100644
--- a/cfr/proxy.py
+++ b/cfr/proxy.py
@@ -1373,7 +1373,7 @@ def find_duplicates(self, r_thresh=0.9, time_period=[0, 2000]):
         mask = (df_proxy.index>=time_period[0]) & (df_proxy.index<=time_period[-1])
         df_proxy = df_proxy[mask]
         pid_list = df_proxy.columns.values
-        R = np.triu(np.corrcoef(df_proxy.values.T), k=1) 
+        R = np.triu(df_proxy.corr(), k=1)
         R[R==0] = np.nan
         di, dj = np.where(R >= r_thresh)
         dup_pids = []
@@ -1972,4 +1972,4 @@ def count_availability(self, year=np.arange(2001)):
         for ptype in df_count.keys():
             df_count[ptype]['Sum'] = df_count[ptype].sum(axis=1).astype(int)
 
-        return df_count
\ No newline at end of file
+        return df_count