From 0823f4361850145152a94e9086bede6a000d8a4a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 15 Apr 2024 10:08:21 -0500 Subject: [PATCH] gh-115532: Minor tweaks to kde() (gh-117897) --- Doc/library/statistics.rst | 3 ++- Lib/statistics.py | 34 +++++++++++++++++++++++----------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 197c123f8356d8..873ccd650f45cd 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1163,7 +1163,7 @@ accurately approximated inverse cumulative distribution function. .. testcode:: from random import choice, random, seed - from math import sqrt, log, pi, tan, asin + from math import sqrt, log, pi, tan, asin, cos, acos from statistics import NormalDist kernel_invcdfs = { @@ -1172,6 +1172,7 @@ accurately approximated inverse cumulative distribution function. 'sigmoid': lambda p: log(tan(p * pi/2)), 'rectangular': lambda p: 2*p - 1, 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), + 'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3), 'cosine': lambda p: 2*asin(2*p - 1)/pi, } diff --git a/Lib/statistics.py b/Lib/statistics.py index 58fb31def8896e..fc00891b083dc3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -919,13 +919,13 @@ def kde(data, h, kernel='normal', *, cumulative=False): sqrt2pi = sqrt(2 * pi) sqrt2 = sqrt(2) K = lambda t: exp(-1/2 * t * t) / sqrt2pi - I = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) + W = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) support = None case 'logistic': # 1.0 / (exp(t) + 2.0 + exp(-t)) K = lambda t: 1/2 / (1.0 + cosh(t)) - I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) + W = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) support = None case 'sigmoid': @@ -933,39 +933,39 @@ def kde(data, h, kernel='normal', *, cumulative=False): c1 = 1 / pi c2 = 2 / pi K = lambda t: c1 / cosh(t) - I = lambda t: c2 * atan(exp(t)) + W = lambda t: c2 * atan(exp(t)) support = None case 'rectangular' | 'uniform': K = lambda t: 1/2 - I = lambda t: 1/2 * t + 1/2 + W = lambda t: 1/2 * t + 1/2 support = 1.0 case 'triangular': K = lambda t: 1.0 - abs(t) - I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 + W = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 support = 1.0 case 'parabolic' | 'epanechnikov': K = lambda t: 3/4 * (1.0 - t * t) - I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 + W = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 support = 1.0 case 'quartic' | 'biweight': K = lambda t: 15/16 * (1.0 - t * t) ** 2 - I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 + W = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 support = 1.0 case 'triweight': K = lambda t: 35/32 * (1.0 - t * t) ** 3 - I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 + W = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 support = 1.0 case 'cosine': c1 = pi / 4 c2 = pi / 2 K = lambda t: c1 * cos(c2 * t) - I = lambda t: 1/2 * sin(c2 * t) + 1/2 + W = lambda t: 1/2 * sin(c2 * t) + 1/2 support = 1.0 case _: @@ -974,10 +974,14 @@ def kde(data, h, kernel='normal', *, cumulative=False): if support is None: def pdf(x): + n = len(data) return sum(K((x - x_i) / h) for x_i in data) / (n * h) def cdf(x): - return sum(I((x - x_i) / h) for x_i in data) / n + + n = len(data) + return sum(W((x - x_i) / h) for x_i in data) / n + else: @@ -985,16 +989,24 @@ def cdf(x): bandwidth = h * support def pdf(x): + nonlocal n, sample + if len(data) != n: + sample = sorted(data) + n = len(data) i = bisect_left(sample, x - bandwidth) j = bisect_right(sample, x + bandwidth) supported = sample[i : j] return sum(K((x - x_i) / h) for x_i in supported) / (n * h) def cdf(x): + nonlocal n, sample + if len(data) != n: + sample = sorted(data) + n = len(data) i = bisect_left(sample, x - bandwidth) j = bisect_right(sample, x + bandwidth) supported = sample[i : j] - return sum((I((x - x_i) / h) for x_i in supported), i) / n + return sum((W((x - x_i) / h) for x_i in supported), i) / n if cumulative: cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'