# library_lead_lag_analysis.py
import os
from datetime import datetime, timedelta
from glob import glob

import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
from matplotlib import patches, path, pyplot as plt
from statsmodels.stats.multitest import multipletests

import lead_lag
from library_pyRMT import clipped
def calculate_llr(trade_dir_path, directory_path2, date, min_precision=0.1):
    """Calculates lead-lag ratios between two time series of log returns.
    Arguments:
    trade_dir_path (string) : path to the directory holding time series 1
    directory_path2 (string) : path to the directory holding time series 2
    date : date of the time series to consider
    min_precision (float) : minimum time resolution of the lead-lag calculation, in seconds"""
file_key1 = os.path.join(trade_dir_path, f"*{date}*.parquet")
matching_files1 = glob(file_key1)
file_key2 = os.path.join(directory_path2, f"*{date}*.parquet")
matching_files2 = glob(file_key2)
df_1 = pd.read_parquet(matching_files1,columns=['time_exchange','log_returns'])
df_2 = pd.read_parquet(matching_files2,columns=['time_exchange','log_returns'])
df_1['time_exchange'] = pd.to_datetime(df_1['time_exchange'])
df_2['time_exchange'] = pd.to_datetime(df_2['time_exchange'])
timeseries1 = df_1.dropna().groupby('time_exchange').sum().round(16)
timeseries2 = df_2.dropna().groupby('time_exchange').sum().round(16)
ll = lead_lag.LeadLag(
verbose=False,
ts1=timeseries1['log_returns'],
ts2=timeseries2['log_returns'],
max_lag=60, # [-X seconds, +X seconds]
min_precision=min_precision # in seconds.
)
ll.run_inference()
ll_rev = lead_lag.LeadLag(
verbose=False,
ts1=timeseries2['log_returns'],
ts2=timeseries1['log_returns'],
max_lag=60, # [-X seconds, +X seconds]
min_precision=min_precision # in seconds.
)
ll_rev.run_inference()
result = {
'date': date,
'max_contrast_time' : ll.lag_range[ll.contrasts.argmax()]*min_precision,
'max_contrast' : ll.contrasts[ll.contrasts.argmax()],
'llr': ll.llr,
'max_contrast_time_rev' : ll_rev.lag_range[ll_rev.contrasts.argmax()]*min_precision,
'max_contrast_rev' : ll_rev.contrasts[ll_rev.contrasts.argmax()],
'llr_rev': ll_rev.llr,
}
return result
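# A minimal usage sketch for calculate_llr. The directory names and date below are
# hypothetical placeholders; any pair of per-day parquet folders containing
# 'time_exchange' and 'log_returns' columns should work the same way.
def _example_calculate_llr():
    result = calculate_llr(
        trade_dir_path="data/clean/trades/binanceBTC",    # hypothetical path
        directory_path2="data/clean/trades/bitstampBTC",  # hypothetical path
        date="2021-05-19",                                # hypothetical date
        min_precision=0.1,
    )
    print(result["llr"], result["llr_rev"], result["max_contrast_time"])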
def calculate_llr_google_trends(trade_dir_path, google_trends_series, keyword, date, min_precision=0.1):
    """Calculates lead-lag ratios between a time series of log returns and a Google Trends time series.
    Arguments:
    trade_dir_path (string) : path to the directory holding the log-return time series
    google_trends_series (pd.Series) : Google Trends time series, passed directly to LeadLag
    keyword (string) : Google Trends keyword, for naming purposes
    date : date of the time series, for naming purposes
    min_precision (float) : minimum time resolution of the lead-lag calculation, in seconds"""
file_key1 = os.path.join(trade_dir_path, f"*{date}*.parquet")
matching_files1 = glob(file_key1)
print(matching_files1)
df_1 = pd.read_parquet(matching_files1,columns=['time_exchange','log_returns'])
df_1['time_exchange'] = pd.to_datetime(df_1['time_exchange'])
timeseries1 = df_1.dropna().groupby('time_exchange').sum().round(16)
    min_precision = 1  # Note: overrides the min_precision argument with a fixed 1-second resolution.
ll = lead_lag.LeadLag(
ts1=timeseries1['log_returns'],
ts2=google_trends_series,
max_lag=60, # [-X seconds, +X seconds]
min_precision=min_precision # in seconds.
)
ll.run_inference()
ll_rev = lead_lag.LeadLag(
ts1=google_trends_series,
ts2=timeseries1['log_returns'],
max_lag=60, # [-X seconds, +X seconds]
min_precision=min_precision, # in seconds
)
ll_rev.run_inference()
result = {
'date': date,
'max_contrast_time' : ll.lag_range[ll.contrasts.argmax()]*min_precision,
'max_contrast' : ll.contrasts[ll.contrasts.argmax()],
'llr': ll.llr,
'max_contrast_time_rev' : ll_rev.lag_range[ll_rev.contrasts.argmax()]*min_precision,
'max_contrast_rev' : ll_rev.contrasts[ll_rev.contrasts.argmax()],
'llr_rev': ll_rev.llr,
}
return result
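# A minimal usage sketch for calculate_llr_google_trends. The path, keyword and series
# below are hypothetical; google_trends_series just needs to be a pandas Series indexed
# by timestamps covering the same day as the trade data.
def _example_calculate_llr_google_trends():
    index = pd.date_range("2021-05-19", periods=24 * 60, freq="min")  # hypothetical day, minute resolution
    trends = pd.Series(np.random.rand(len(index)), index=index)       # hypothetical Google Trends values
    result = calculate_llr_google_trends(
        trade_dir_path="data/clean/trades/binanceBTC",  # hypothetical path
        google_trends_series=trends,
        keyword="btc",
        date="2021-05-19",
    )
    print(result["llr"], result["llr_rev"])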
### Marsili-Giada Clustering
### This function was provided as code for the class FIN-525 "Financial Big Data"
def expand_grid_unique(x, y, include_equals=False):
x = list(set(x))
y = list(set(y))
def g(i):
z = [val for val in y if val not in x[:i - include_equals]]
if z:
return [x[i - 1]] + z
combinations = [g(i) for i in range(1, len(x) + 1)]
return [combo for combo in combinations if combo]
### This function was provided as code for the class FIN-525 "Financial Big Data"
def max_likelihood(c, n):
    """Marsili-Giada log-likelihood term for a cluster of n elements whose internal
    correlation mass is c: log(n/c) + (n-1)*log((n^2-n)/(n^2-c)); 0 for singletons."""
    if n > 1:
        return np.log(n / c) + (n - 1) * np.log((n * n - n) / (n * n - c))
    else:
        return 0
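# A tiny illustrative check of the likelihood term: a strongly correlated pair
# (correlation mass c close to its maximum n^2 = 4) scores much higher than a
# nearly uncorrelated one (c just above n = 2). The values are made up.
def _example_max_likelihood():
    print(max_likelihood(3.6, 2))  # strongly correlated pair, clearly positive
    print(max_likelihood(2.1, 2))  # nearly uncorrelated pair, close to zero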
### This function was provided as code for the class FIN-525 "Financial Big Data"
def max_likelihood_list(cs, ns):
    """Applies the max_likelihood term to every current cluster, keyed by cluster representative."""
    Lc = {}
    for x in cs.keys():
        if ns[x] > 1:
            Lc[x] = np.log(ns[x] / cs[x]) + (ns[x] - 1) * np.log((ns[x] * ns[x] - ns[x]) / (ns[x] * ns[x] - cs[x]))
        else:
            Lc[x] = 0
    return Lc
### This function was provided as code for the class FIN-525 "Financial Big Data"
def find_max_improving_pair(C, cs, ns, i_s):
    """Finds the pair of current clusters whose merge gives the largest log-likelihood improvement."""
    Lc_old = max_likelihood_list(cs, ns)
names_cs = list(cs.keys())
max_impr = -1e10
pair_max_improv = []
for i in names_cs[:-1]:
names_cs_j = names_cs[names_cs.index(i) + 1:]
for j in names_cs_j:
ns_new = ns[i] + ns[j]
i_s_new = i_s[i] + i_s[j]
cs_new = np.sum(C[np.ix_(i_s_new, i_s_new)])
max_likelihood_new = max_likelihood(cs_new, ns_new)
improvement = max_likelihood_new - Lc_old[i] - Lc_old[j]
if improvement > max_impr:
max_impr = improvement
pair_max_improv = [i, j]
Lc_max_impr = max_likelihood_new
return {"pair": pair_max_improv, "Lc_new": Lc_max_impr, "Lc_old": [Lc_old[x] for x in pair_max_improv]}
### This function was provided as code for the class FIN-525 "Financial Big Data"
def aggregate_clusters(C, only_log_likelihood_improving_merges=False):
    """Greedy Marsili-Giada clustering: repeatedly merges the pair of clusters with the largest
    log-likelihood improvement, stopping when the best merge no longer improves on its parts."""
    N = C.shape[0]
    cs = {i: 1 for i in range(N)}
    s_i = {i: [i] for i in range(N)}
    ns = {i: 1 for i in range(N)}
    i_s = {i: [i] for i in range(N)}
    clusters = []
for i in range(1, N): # hierarchical merging
improvement = find_max_improving_pair(C, cs, ns, i_s)
Lc_old = improvement['Lc_old']
Lc_new = improvement['Lc_new']
        if Lc_new < sum(Lc_old):
            print("Half cluster: Lc_new < sum(Lc_old)")
        if Lc_new <= max(Lc_old):
            print("Lc_new <= max(Lc_old), exiting")
break
pair = improvement['pair']
s_i = [pair[0] if x == pair[1] else x for x in s_i]
cluster1 = pair[0]
cluster2 = pair[1]
i_s[cluster1].extend(i_s[cluster2]) # merge the elements of the two clusters
del i_s[cluster2] # removes reference to merged cluster2
ns[cluster1] += ns[cluster2]
del ns[cluster2]
cs[cluster1] = np.sum(C[i_s[cluster1]][:, i_s[cluster1]]) # sums C over the elements of cluster1
del cs[cluster2]
cs_vec = list(cs.values())
ns_vec = list(ns.values())
clusters.append({
'Lc': max_likelihood_list(cs, ns),
'pair_merged': pair,
's_i': s_i,
'i_s': i_s,
'cs': cs,
'ns': ns
})
    # If no merge improved the likelihood, fall back to the initial singleton partition.
    last_clusters = clusters[-1] if clusters else {
        'Lc': max_likelihood_list(cs, ns), 'pair_merged': None,
        's_i': s_i, 'i_s': i_s, 'cs': cs, 'ns': ns}
    return last_clusters
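# A minimal sketch of aggregate_clusters on a synthetic correlation matrix with two
# planted blocks of five correlated series each. The factor model and noise level are
# illustrative only; the recovered clusters should roughly match the planted blocks.
def _example_aggregate_clusters(seed=0):
    rng = np.random.default_rng(seed)
    membership = np.repeat([0, 1], 5)            # two planted clusters of five series each
    factors = rng.normal(size=(2, 500))          # one common factor per cluster
    noise = rng.normal(size=(10, 500))
    returns = factors[membership] + 0.5 * noise  # within-block correlation around 0.8
    clusters = aggregate_clusters(np.corrcoef(returns))
    print(clusters['i_s'])                       # cluster representative -> member indices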
def plot_giada_marsili_clustering(RMT_corrected=True):
    """This function generates Fig. 20 and 21 of our report.
    Arguments:
    RMT_corrected (bool) : True if the correlation matrix of the networks is cleaned with RMT before clustering"""
llrs = pd.read_csv("data/clean/lead_lag_graphs/all_graphs.csv",header=None).values
if RMT_corrected:
#pyRMT cleaned correlation matrix
correlation_matrix = clipped(llrs)
else :
correlation_matrix = np.corrcoef(llrs)
#Apply clustering from course code.
clusters = aggregate_clusters(correlation_matrix)
all_clusters = clusters['i_s']
inverted_clusters = {day: cluster for cluster, days in all_clusters.items() for day in days}
cluster_reps = list(set(inverted_clusters.values()))
rows = 12
columns = 31
matrix = np.full((rows, columns), np.nan)
# Assign cluster colors to the matrix
start_date = datetime(2021, 1, 1)
index_dict = {element: index for index, element in enumerate(cluster_reps)}
for day in range(0, 365):
current_date = start_date + timedelta(days=day)
month, date = current_date.month - 1, current_date.day - 1
        matrix[month, date] = index_dict.get(inverted_clusters.get(day, np.nan), np.nan)  # day index -> cluster representative; missing days stay NaN
    # Create the figure
    _, ax = plt.subplots()
    # Mask NaN cells (days that do not exist in the given month)
    mask = np.isnan(matrix)
    # Heatmap with one colour per cluster and black lines between cells
    sns.heatmap(matrix, cmap=sns.color_palette("Spectral", len(all_clusters)), xticklabels=True, yticklabels=True, ax=ax, mask=mask, linewidths=1, linecolor='black')
# Set y-axis ticks to month names
month_names = [datetime(2021, i + 1, 1).strftime('%B') for i in range(12)]
ax.set_yticks(np.arange(rows) + 0.5)
ax.set_yticklabels(month_names, rotation=0)
# Set x-axis ticks and labels
ax.set_xticks(np.arange(columns) + 0.5)
ax.set_xticklabels(np.arange(1, columns + 1))
# Set minor ticks for both x and y axes
ax.set_xticks(np.arange(31), minor=True)
ax.set_yticks(np.arange(12), minor=True)
# Customize the colorbar
vmap = {i: chr(65 + i) for i in range(len(all_clusters))}
n = len(vmap)
colorbar = ax.collections[0].colorbar
# Calculate positions to place the labels evenly distributed across the colorbar
r = colorbar.vmax - colorbar.vmin
colorbar.set_ticks([colorbar.vmin + 0.5 * r / n + r * i / n for i in range(n)])
colorbar.set_ticklabels(list(vmap.values()))
# Customize the grid
ax.grid(True, which="minor", color="w", linewidth=2)
ax.tick_params(which="minor", left=False, bottom=False)
    # Save, then show the plot (saving after plt.show() would write an empty figure)
    plt.savefig("figures/clustering.png")
    plt.show()
### Validating Statistical Significance of Lead-Lag Ratios
def empirical_cdf(x, data, bins=100):
"""Generates an empirical cdf of the data, used to calculate p-values from a null-model."""
hist, bin_edges = np.histogram(data, bins=bins, density=True)
cdf = np.cumsum(hist * np.diff(bin_edges))
# Linear interpolation to find the CDF at x
cdf_at_x = np.interp(x, bin_edges[1:], cdf)
return cdf_at_x
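# A minimal sketch of how empirical_cdf turns a null distribution into a one-sided
# p-value; the standard-normal null sample and the observed value are illustrative only.
def _example_empirical_cdf(seed=0):
    rng = np.random.default_rng(seed)
    null_sample = rng.normal(size=10_000)
    observed = 2.0
    p_value = 1 - empirical_cdf(observed, null_sample)
    print(f"one-sided p-value ~ {p_value:.3f}")  # roughly 0.02 under a standard normal null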
def remove_outliers(data):
    """Removes values lying more than 1.5 times the 1%-99% inter-quantile range outside those
    quantiles, to avoid artifacts in the LLR calculation."""
q1 = data.quantile(0.01)
q3 = data.quantile(0.99)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
return data[(data >= lower_bound) & (data <= upper_bound)]
def llr_plot(df_result, date, threshold):
    """This function generates Fig. 19 of our report: the directed lead-lag graph for one date,
    keeping only edges whose LLR exceeds the statistical-significance threshold."""
    # Polygon patch with rounded corners. Code originally from scleronomic at https://stackoverflow.com/questions/19270673/matplotlib-radius-in-polygon-edges-is-it-possible
class RoundedPolygon(patches.PathPatch):
def __init__(self, xy, pad, **kwargs):
p = path.Path(*self.__round(xy=xy, pad=pad))
super().__init__(path=p, **kwargs)
def __round(self, xy, pad):
n = len(xy)
for i in range(0, n):
x0, x1, x2 = np.atleast_1d(xy[i - 1], xy[i], xy[(i + 1) % n])
d01, d12 = x1 - x0, x2 - x1
d01, d12 = d01 / np.linalg.norm(d01), d12 / np.linalg.norm(d12)
x00 = x0 + pad * d01
x01 = x1 - pad * d01
x10 = x1 + pad * d12
x11 = x2 - pad * d12
if i == 0:
verts = [x00, x01, x1, x10]
else:
verts += [x01, x1, x10]
codes = [path.Path.MOVETO] + n*[path.Path.LINETO, path.Path.CURVE3, path.Path.CURVE3]
return np.atleast_1d(verts, codes)
test = {}
row = df_result.loc[date]
# Loop through the columns and extract nodeX, nodeY, and the corresponding llrs value, to plot directed edges
for column in df_result.columns:
if 'llr_' in column:
if 'keyword' in column:
if 'rev' in column:
_, _, nodeY, _, nodeX = column.split('_')
else:
_, nodeX, _, nodeY = column.split('_')
llrs_value = row[column]
else:
if 'rev' in column:
_, _, nodeY, nodeX, _ = column.split('_')
else:
_, nodeX, nodeY, _ = column.split('_')
llrs_value = row[column]
if nodeX not in test:
test[nodeX] = {}
test[nodeX][nodeY] = llrs_value
G = nx.DiGraph((k, v, {'weight': weight}) for k, vs in test.items() for v, weight in vs.items())
adjacency_matrix = nx.to_pandas_adjacency(G).values.flatten()
# Graph nodes
ADA_nodes = ['binanceADA']
ADA_keyword_nodes = ['Cardano']
BTC_nodes = ['binanceBTC', 'bitstampBTC']
BTC_keyword_nodes = ['btc']
ETH_nodes = ['binanceETH','bitstampETH']
ETH_keyword_nodes = ['Ethereum']
LTC_nodes = ['binanceLTC','bitstampLTC']
LTC_keyword_nodes = ['Litecoin']
trading_node = ['trading']
musk_node = ['Musk']
FED_node = ['FED']
node_list = ADA_nodes + BTC_nodes + ETH_nodes + LTC_nodes + musk_node + FED_node + ADA_keyword_nodes + BTC_keyword_nodes + ETH_keyword_nodes + LTC_keyword_nodes
    def filter_edges(graph, threshold, all_keywords=False):
        """Filters out edges that are not statistically significant.
        Arguments:
        graph (nx.DiGraph) : the graph whose edges are filtered
        threshold (float) : minimum LLR an edge must exceed to be kept as statistically significant
        all_keywords (bool) : if True, consider edges involving any node; if False, keep only edges between nodes listed in node_list."""
filtered_edges = []
for edge in graph.edges(data=True):
source, target, data = edge
if all_keywords or (source in node_list and target in node_list):
reverse_edge_exists = graph.has_edge(target, source)
if not reverse_edge_exists and data['weight'] > threshold:
filtered_edges.append((source, target, data))
else:
reverse_edge_weight = graph.get_edge_data(target, source).get('weight', 0)
original_edge_weight = data.get('weight', 0)
if original_edge_weight > reverse_edge_weight and data['weight'] > threshold:
filtered_edges.append((source, target, data))
return filtered_edges
filtered_edges = sorted(filter_edges(G,threshold), key=lambda x: x[2]['weight'], reverse=True)
#Generate a new graph with the filtered edge list.
G2 = nx.DiGraph()
G2.add_edges_from(filtered_edges)
#Node positions in the plot.
pos = {'binanceETH': (0,2),
'binanceLTC': (1,0),
'binanceADA': (-1,0),
'binanceBTC': (0,-2),
'bitstampETH': (2,2),
'bitstampBTC': (2,-2),
'bitstampLTC': (2,0),
'Cardano' : (2,3),
'Ethereum' : (1,3),
'btc' : (0,3),
'Litecoin' : (-1,3),
'Musk' : (-1,4),
'trading' : (0,4),
'FED' : (1,4)}
#Create legend handles to label nodes correctly.
nodes_handle1 = nx.draw_networkx_nodes(G2, pos, nodelist=ADA_nodes,label='ADA coin', node_color='#33a02c',node_size=15**2)
ADA_keyword_nodes = nx.draw_networkx_nodes(G2, pos, nodelist=ADA_keyword_nodes,label='Cardano keyword', node_shape = 's', node_color='#33a02c',node_size=15**2)
nodes_handle2 = nx.draw_networkx_nodes(G2, pos, nodelist=BTC_nodes,label='BTC coin', node_color='#1f78b4', node_size=15**2)
BTC_keyword_nodes = nx.draw_networkx_nodes(G2, pos, nodelist=BTC_keyword_nodes,label='BTC keyword', node_shape = 's', node_color='#1f78b4', node_size=15**2)
nodes_handle3 = nx.draw_networkx_nodes(G2, pos, nodelist=ETH_nodes,label='ETH coin', node_color='#e31a1c', node_size=15**2)
ETH_keyword_nodes = nx.draw_networkx_nodes(G2, pos, nodelist=ETH_keyword_nodes,label='Ethereum keyword', node_shape = 's', node_color='#e31a1c', node_size=15**2)
nodes_handle4 = nx.draw_networkx_nodes(G2, pos, nodelist=LTC_nodes,label='LTC coin', node_color='#6a3d9a', node_size=15**2)
LTC_keyword_nodes = nx.draw_networkx_nodes(G2, pos, nodelist=LTC_keyword_nodes,label='Litecoin keyword', node_shape = 's', node_color='#6a3d9a', node_size=15**2)
nodes_handle5 = nx.draw_networkx_nodes(G2, pos, nodelist=musk_node,label='Musk keyword', node_shape = 's', node_color='#525252', node_size=15**2)
nodes_handle6 = nx.draw_networkx_nodes(G2, pos, nodelist=trading_node,label='trading keyword', node_shape = 's', node_color='#737373', node_size=15**2)
nodes_handle7 = nx.draw_networkx_nodes(G2, pos, nodelist=FED_node,label='FED keyword', node_shape = 's', linewidths=1,node_color='#bdbdbd', node_size=15**2)
    weights = np.array(list(nx.get_edge_attributes(G2, 'weight').values()))  # edge widths must match the filtered graph G2
    # Vertices of the rounded background boxes for the exchanges and Google Trends.
binance_vertices = np.array([[-1.5, 2.4], [1.5, 2.4], [1.5, -2.4], [-1.5, -2.4]])
bitstamp_vertices = np.array([[1.6, 2.4], [2.4, 2.4], [2.4, -2.4], [1.6, -2.4]])
GT_vertices = np.array([[-1.5, 4.4], [2.4, 4.4], [2.4, 2.5], [-1.5, 2.5]])
#Position of patches corresponding to exchanges and Google Trends in the plot.
binance_patch = RoundedPolygon(xy=binance_vertices, pad= 0.3, facecolor='white', edgecolor='black', linestyle='-', linewidth=2,joinstyle='round', alpha=1)
bitstamp_patch = RoundedPolygon(xy=bitstamp_vertices, pad= 0.3, facecolor='white', edgecolor='black', linestyle='-', linewidth=2,joinstyle='round', alpha=0.7)
GT_patch = RoundedPolygon(xy=GT_vertices, pad= 0.3, facecolor='white', edgecolor='black', linestyle='-', linewidth=2,joinstyle='round', alpha=0.4)
#Plot configs.
plt.axis("off")
plt.legend(handles=[nodes_handle1,nodes_handle2,nodes_handle3,nodes_handle4, nodes_handle5, nodes_handle6, nodes_handle7, ADA_keyword_nodes, BTC_keyword_nodes,ETH_keyword_nodes,LTC_keyword_nodes], loc="lower left",bbox_to_anchor=(1.05,0))
plt.gca().add_patch(binance_patch)
plt.gca().add_patch(bitstamp_patch)
plt.gca().add_patch(GT_patch)
nx.draw_networkx_edges(G2,pos,width=weights,connectionstyle='arc3, rad = 0.1')
plt.text(-1.7, 3.6, 'Google Trends', fontsize=14, color='black', ha='center', va='center',rotation=90,alpha=0.4)
plt.text(-1.7, 1.8, 'Binance', fontsize=14, color='black', ha='center', va='center',rotation=90)
    plt.text(2.6, 1.6, 'Bitstamp', fontsize=14, color='black', ha='center', va='center', rotation=270, alpha=0.7)
plt.ylim([-2.5,4.5])
plt.xlim([-4,2.5])
plt.tight_layout()
plt.savefig(f"figures/lead_lag_graph_{date}")
plt.show()
def degrees_to_dataframe_row(graph):
"""This function is used to calculate centrality metrics of the graph for Table 7 in the report."""
data = {}
for node in graph.nodes:
in_degree = graph.in_degree(node)
out_degree = graph.out_degree(node)
data[f"In-Degree_{node}"] = in_degree
data[f"Out-Degree_{node}"] = out_degree
return pd.DataFrame([data])
return adjacency_matrix, filtered_edges, node_list, degrees_to_dataframe_row(G2)
def longest_running_link(array):
    """Returns the length of the longest run of consecutive non-zero entries; used to
    calculate lead-lag persistence metrics of the graph for Table 7 in the report."""
    max_len = 0
    current_len = 0
    for value in array:
        if value == 0:
            max_len = max(max_len, current_len)
            current_len = 0
        else:
            current_len += 1
    max_len = max(max_len, current_len)
    return max_len
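# A quick illustrative check of longest_running_link on a hand-made link vector:
# the longest run of consecutive non-zero entries below has length 3.
def _example_longest_running_link():
    assert longest_running_link([0, 1, 1, 0, 1, 1, 1, 0]) == 3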
def fetch_lead_lag_ratios():
    """Fetches an intermediate result to avoid duplicate computation: the lead-lag ratios of all
    365 daily networks are stored as vectors so that their correlations can be computed and clustered."""
directory = "data/clean/lead_lag_ratios"
empirical_distribution_null_model = pd.read_csv(os.path.join(directory,"null_model.csv"))
files = os.listdir(directory)
llrs_files = [file for file in files if file.endswith('.csv') and 'null_model' not in file]
df_result = pd.read_csv(os.path.join(directory,llrs_files[0]), index_col='date')
df_result.rename(lambda x : x + "_"+ os.path.splitext(llrs_files[0])[0],inplace=True,axis=1)
for file in llrs_files[1:]:
df_temp = pd.read_csv(os.path.join(directory,file), index_col='date')
df_temp.rename(lambda x : x + "_"+ os.path.splitext(file)[0],inplace=True,axis=1)
df_result = df_result.join(df_temp)
llr_columns = [col for col in df_result.columns if col.startswith('llr')]
llr_df = df_result[llr_columns]
flattened_data = pd.Series(llr_df.values.flatten())
filtered_llrs = remove_outliers(flattened_data)
    # Compute one p-value per LLR against the null model, then correct for multiple testing jointly
    # (correcting one p-value at a time would leave the individual tests effectively uncorrected).
    p_values = np.array([1 - empirical_cdf(llr, empirical_distribution_null_model['LLR'].values)
                         for llr in filtered_llrs])
    reject, _, _, _ = multipletests(p_values)
    statistically_significant_llr = filtered_llrs.values[reject]
    minimum_statistically_significant_llr = np.min(statistically_significant_llr)
    return llr_df, minimum_statistically_significant_llr
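# A minimal sketch tying the pieces together: load the precomputed LLR vectors and the
# significance threshold, then draw the lead-lag graph for one day. It assumes the
# intermediate CSVs under data/clean/lead_lag_ratios exist; the chosen date is arbitrary.
def _example_pipeline():
    llr_df, llr_threshold = fetch_lead_lag_ratios()
    first_date = llr_df.index[0]  # any date present in the index would do
    llr_plot(llr_df, date=first_date, threshold=llr_threshold)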