-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path: ks_2samp_sparksql.py
59 lines (49 loc) · 1.54 KB
/
ks_2samp_sparksql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
import pyspark.sql.functions as funcs
from pyspark.sql.window import Window
from scipy.stats import distributions
# Column names for the raw empirical CDF of each sample.
CDF_1 = 'cdf_1'
CDF_2 = 'cdf_2'
# Column names for the forward-filled CDFs after the outer join,
# where gaps (values present in only one sample) have been filled.
FILLED_CDF_1 = 'filled_cdf_1'
FILLED_CDF_2 = 'filled_cdf_2'
def get_cdf(df, variable, col_name):
    """Return the empirical CDF of `variable` in `df`.

    Produces a two-column DataFrame: the (non-null) values of `variable`
    and, in `col_name`, the fraction of rows with a value <= that value
    (via `cume_dist` over the value ordering).
    """
    non_null = df.select(variable).na.drop()
    ecdf = non_null.withColumn(
        col_name,
        funcs.cume_dist().over(Window.orderBy(variable))
    )
    # Tied values yield duplicate (value, cdf) rows; keep one per value.
    return ecdf.distinct()
def ks_2samp(df1, var1, df2, var2):
    """Two-sample Kolmogorov-Smirnov test on Spark DataFrame columns.

    Computes the KS statistic (max absolute difference between the two
    empirical CDFs) and the asymptotic two-sided p-value, mirroring
    `scipy.stats.ks_2samp`.

    Parameters
    ----------
    df1, df2 : pyspark.sql.DataFrame
        The two samples. Nulls in the tested columns are dropped.
    var1, var2 : str
        Column name to test in `df1` / `df2` respectively.

    Returns
    -------
    (ks_stat, prob) : tuple of floats
    """
    # Full outer join of the two ECDFs on the sample values, so every
    # observed value from either sample appears as a row.
    joined = get_cdf(df1, var1, CDF_1).join(
        get_cdf(df2, var2, CDF_2),
        on=df1[var1] == df2[var2],
        how='outer'
    )
    # The row's value, whichever side of the outer join supplied it.
    value = funcs.coalesce(df1[var1], df2[var2])
    # Forward-fill must run in value order; without an orderBy the window
    # row order is unspecified and the fill (and KS statistic) would be
    # nondeterministic.
    fill_window = Window.orderBy(value).rowsBetween(
        Window.unboundedPreceding, Window.currentRow
    )
    ks_stat = joined.withColumn(
        FILLED_CDF_1,
        # Below a sample's minimum its ECDF is 0, not null; coalescing keeps
        # those rows from being silently dropped by the max() below.
        funcs.coalesce(
            funcs.last(funcs.col(CDF_1), ignorenulls=True).over(fill_window),
            funcs.lit(0.0)
        )
    ).withColumn(
        FILLED_CDF_2,
        funcs.coalesce(
            funcs.last(funcs.col(CDF_2), ignorenulls=True).over(fill_window),
            funcs.lit(0.0)
        )
    ).select(
        funcs.max(
            funcs.abs(
                funcs.col(FILLED_CDF_1) - funcs.col(FILLED_CDF_2)
            )
        )
    ).collect()[0][0]
    # Asymptotic p-value, adapted from scipy.stats.ks_2samp.
    n1 = df1.select(var1).na.drop().count()
    n2 = df2.select(var2).na.drop().count()
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * ks_stat)
    except Exception:
        # Mirror scipy's fallback, but never swallow KeyboardInterrupt /
        # SystemExit the way a bare `except:` would.
        prob = 1.0
    return ks_stat, prob