From 72a2d74c7c9992c7e1ab6eb8ad19902ec3cca3ea Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 25 Mar 2024 10:14:16 -0500 Subject: [PATCH 01/14] kde_random() recipe --- Doc/library/statistics.rst | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 8cd43c2d6305d8..d95611f7359eb3 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1147,6 +1147,53 @@ The final prediction goes to the largest posterior. This is known as the 'female' +Sampling from an estimated kernel density function +************************************************** + +The *kde()* function creates a continuous probability density function +from discrete samples. Some applications need a way to make random +selections from that distribution. + +Since the estimated function is just the sum of kernels centered +at each data point, the problem can be reduced to sampling the +kernel function and recentering the result around a randomly chosen +data point. This works for kernels that have a known or accurately +approximated inverse cumulative distribution functions. + +.. testcode:: + + from math import log, tan, sqrt, asin + from random import choice, random, seed + + kernel_invcdfs = { + 'normal': NormalDist().inv_cdf, + 'logisitic': lambda p: log(p / (1 - p)), + 'sigmoid': lambda p: log(tan(p * pi/2)), + 'rectangular': lambda p: 2*p - 1, + 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), + 'cosine': lambda p: 2*asin(2*p - 1)/pi, + } + kernel_invcdfs['gauss'] = kernel_invcdfs['normal'] + kernel_invcdfs['uniform'] = kernel_invcdfs['rectangular'] + + def kde_random(data, h, kernel='normal'): + 'Generate random a sample from an estimated kernel density function.' + kernel_invcdf = kernel_invcdfs[kernel] + def rand(): + return choice(data) + h * kernel_invcdf(random()) + return rand + +For example: + +.. doctest:: + + >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> rand = kde_random(discrete_samples, h=1.5) + >>> seed(8675309) + >>> [round(rand(), 1) for i in range(10)] + [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] + + .. # This modelines must appear within the last ten lines of the file. kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8; From 4117a614cd5cac69e61df99947b56b94cb8a10ac Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 25 Mar 2024 10:17:54 -0500 Subject: [PATCH 02/14] Tweak markup and wording --- Doc/library/statistics.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index d95611f7359eb3..27772245c3a9e6 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1150,20 +1150,20 @@ The final prediction goes to the largest posterior. This is known as the Sampling from an estimated kernel density function ************************************************** -The *kde()* function creates a continuous probability density function -from discrete samples. Some applications need a way to make random -selections from that distribution. +The :func:`kde()` function creates a continuous probability density +function from discrete samples. Some applications need a way to make +random selections from that distribution. -Since the estimated function is just the sum of kernels centered -at each data point, the problem can be reduced to sampling the -kernel function and recentering the result around a randomly chosen -data point. This works for kernels that have a known or accurately -approximated inverse cumulative distribution functions. +Since the estimated function is just the sum of kernels centered at each +data point, the problem can be reduced to sampling the kernel function +and recentering the result around a randomly chosen data point. This +works for kernels that have a known or accurately approximated inverse +cumulative distribution functions. .. testcode:: - from math import log, tan, sqrt, asin from random import choice, random, seed + from math import log, tan, sqrt, asin kernel_invcdfs = { 'normal': NormalDist().inv_cdf, From 4bd19c9d383c868f09862747a0c834de95718e94 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 09:39:22 -0500 Subject: [PATCH 03/14] The aliases are visual clutter and not essential to the recipe. --- Doc/library/statistics.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 27772245c3a9e6..d5f5e34c1b5663 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1157,7 +1157,7 @@ random selections from that distribution. Since the estimated function is just the sum of kernels centered at each data point, the problem can be reduced to sampling the kernel function and recentering the result around a randomly chosen data point. This -works for kernels that have a known or accurately approximated inverse +works for kernels that have known or accurately approximated inverse cumulative distribution functions. .. testcode:: @@ -1173,8 +1173,6 @@ cumulative distribution functions. 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), 'cosine': lambda p: 2*asin(2*p - 1)/pi, } - kernel_invcdfs['gauss'] = kernel_invcdfs['normal'] - kernel_invcdfs['uniform'] = kernel_invcdfs['rectangular'] def kde_random(data, h, kernel='normal'): 'Generate random a sample from an estimated kernel density function.' From 211836c9369447ff7729cf941dca49b501a76b86 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 10:07:44 -0500 Subject: [PATCH 04/14] Add invcdf() / cdf() round trip tests --- Doc/library/statistics.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index d5f5e34c1b5663..ba898e2fd27d1e 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1163,11 +1163,12 @@ cumulative distribution functions. .. testcode:: from random import choice, random, seed - from math import log, tan, sqrt, asin + from math import sqrt, log, pi, tan, asin + from statistics import NormalDist kernel_invcdfs = { 'normal': NormalDist().inv_cdf, - 'logisitic': lambda p: log(p / (1 - p)), + 'logistic': lambda p: log(p / (1 - p)), 'sigmoid': lambda p: log(tan(p * pi/2)), 'rectangular': lambda p: 2*p - 1, 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), @@ -1191,6 +1192,18 @@ For example: >>> [round(rand(), 1) for i in range(10)] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] +.. testcode:: + :hide: + + from statistics import kde + from math import isclose + + # Verify that cdf / invcdf will round trip + xarr = [i/100 for i in range(-100, 101)] + for kernel, invcdf in kernel_invcdfs.items(): + cdf = kde([0], h=1.0, kernel=kernel, cumulative=True) + for x in xarr: + assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9) .. # This modelines must appear within the last ten lines of the file. From 6f0aafc956be947301818260b49faef686716b0a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 10:10:19 -0500 Subject: [PATCH 05/14] . --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index ba898e2fd27d1e..65b6970170d55c 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1201,7 +1201,7 @@ For example: # Verify that cdf / invcdf will round trip xarr = [i/100 for i in range(-100, 101)] for kernel, invcdf in kernel_invcdfs.items(): - cdf = kde([0], h=1.0, kernel=kernel, cumulative=True) + cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True) for x in xarr: assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9) From e2c8d1c92affcc74d0188e2226bd73c7c6834757 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 16:20:50 -0500 Subject: [PATCH 06/14] Improve docstring --- Doc/library/statistics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 65b6970170d55c..cc66b1cab9435f 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1154,7 +1154,7 @@ The :func:`kde()` function creates a continuous probability density function from discrete samples. Some applications need a way to make random selections from that distribution. -Since the estimated function is just the sum of kernels centered at each +Since the smoothed probability density function is just the sum of kernels centered at each data point, the problem can be reduced to sampling the kernel function and recentering the result around a randomly chosen data point. This works for kernels that have known or accurately approximated inverse @@ -1176,7 +1176,7 @@ cumulative distribution functions. } def kde_random(data, h, kernel='normal'): - 'Generate random a sample from an estimated kernel density function.' + 'Generate a random sample from a smoothed probability density function.' kernel_invcdf = kernel_invcdfs[kernel] def rand(): return choice(data) + h * kernel_invcdf(random()) From 7ac2b835e42d6a5aa734494f8d50ace4707d9852 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 18:38:14 -0500 Subject: [PATCH 07/14] More direct description of the recipe --- Doc/library/statistics.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index f416f0a4e15a4b..9a31c31d3e3b7f 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1155,11 +1155,11 @@ The :func:`kde()` function creates a continuous probability density function from discrete samples. Some applications need a way to make random selections from that distribution. -Since the smoothed probability density function is just the sum of kernels centered at each -data point, the problem can be reduced to sampling the kernel function -and recentering the result around a randomly chosen data point. This -works for kernels that have known or accurately approximated inverse -cumulative distribution functions. +The technique is to pick a sample from the kernel function, scale the +result by the bandwidth, and then recenter the result our around a +randomly chosen point from the input data. This can be done with any +kernel that has a known or accurately approximated inverse cumulative +distribution function. .. testcode:: @@ -1180,7 +1180,7 @@ cumulative distribution functions. 'Generate a random sample from a smoothed probability density function.' kernel_invcdf = kernel_invcdfs[kernel] def rand(): - return choice(data) + h * kernel_invcdf(random()) + return kernel_invcdf(random()) * h + choice(data) return rand For example: From 4398863f9be75dc93ead8e6d56673152ce284434 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 18:47:35 -0500 Subject: [PATCH 08/14] Tighter wording --- Doc/library/statistics.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 9a31c31d3e3b7f..d0d6c73e3b3ecd 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1155,11 +1155,10 @@ The :func:`kde()` function creates a continuous probability density function from discrete samples. Some applications need a way to make random selections from that distribution. -The technique is to pick a sample from the kernel function, scale the -result by the bandwidth, and then recenter the result our around a -randomly chosen point from the input data. This can be done with any -kernel that has a known or accurately approximated inverse cumulative -distribution function. +The technique is to pick a sample from a bandwidth scaled kernel +function and recenter the result our around a randomly chosen point from +the input data. This can be done with any kernel that has a known or +accurately approximated inverse cumulative distribution function. .. testcode:: @@ -1177,10 +1176,10 @@ distribution function. } def kde_random(data, h, kernel='normal'): - 'Generate a random sample from a smoothed probability density function.' + 'Return a function that samples from kde() smoothed data.' kernel_invcdf = kernel_invcdfs[kernel] def rand(): - return kernel_invcdf(random()) * h + choice(data) + return h * kernel_invcdf(random()) + choice(data) return rand For example: From e827d8a549da6566b2862c7a7d086b73a8582fa0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 18:54:23 -0500 Subject: [PATCH 09/14] Separate core logic from formatting in the example --- Doc/library/statistics.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index d0d6c73e3b3ecd..0a80532b3db730 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1187,9 +1187,10 @@ For example: .. doctest:: >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> rand = kde_random(discrete_samples, h=1.5) >>> seed(8675309) - >>> [round(rand(), 1) for i in range(10)] + >>> rand = kde_random(discrete_samples, h=1.5) + >>> selections = [rand() for i in range(10)] + >>> [round(x, 1) for x in selections)] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] .. testcode:: From d30e554feffc5f479d901504149d4ef8b0d06055 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 18:55:21 -0500 Subject: [PATCH 10/14] Moving seeding to just before calling rand() --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 0a80532b3db730..a31bb993e8c4b1 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1187,8 +1187,8 @@ For example: .. doctest:: >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> seed(8675309) >>> rand = kde_random(discrete_samples, h=1.5) + >>> seed(8675309) >>> selections = [rand() for i in range(10)] >>> [round(x, 1) for x in selections)] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] From 2a570d971f0ca9bc1b302c3b8b810c92c3e4c069 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 19:11:32 -0500 Subject: [PATCH 11/14] Remove spurious closing paren --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index a31bb993e8c4b1..89267942c7b66c 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1190,7 +1190,7 @@ For example: >>> rand = kde_random(discrete_samples, h=1.5) >>> seed(8675309) >>> selections = [rand() for i in range(10)] - >>> [round(x, 1) for x in selections)] + >>> [round(x, 1) for x in selections] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] .. testcode:: From d17e6989c54a4ba9d0a09df093ef23433b6500b6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 19:22:40 -0500 Subject: [PATCH 12/14] More brief section title --- Doc/library/statistics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 89267942c7b66c..d9c94cf523bfb8 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1148,8 +1148,8 @@ The final prediction goes to the largest posterior. This is known as the 'female' -Sampling from an estimated kernel density function -************************************************** +Sampling from kernel density estimation +*************************************** The :func:`kde()` function creates a continuous probability density function from discrete samples. Some applications need a way to make From a56b1080733a37f76d95b92edd373106c290fbf6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 26 Mar 2024 19:32:56 -0500 Subject: [PATCH 13/14] choice() and random() do not commute --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index d9c94cf523bfb8..994b9f0d66c281 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1191,7 +1191,7 @@ For example: >>> seed(8675309) >>> selections = [rand() for i in range(10)] >>> [round(x, 1) for x in selections] - [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] + [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7] .. testcode:: :hide: From 02ed395a157638b32a9bacd297b9c10e5bff20ed Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 27 Mar 2024 08:35:49 -0500 Subject: [PATCH 14/14] Fix typo --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 994b9f0d66c281..197c123f8356d8 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1156,7 +1156,7 @@ function from discrete samples. Some applications need a way to make random selections from that distribution. The technique is to pick a sample from a bandwidth scaled kernel -function and recenter the result our around a randomly chosen point from +function and recenter the result around a randomly chosen point from the input data. This can be done with any kernel that has a known or accurately approximated inverse cumulative distribution function.