From 707ce1048244d0e2b8bc1445e2af92cbaf809b74 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 15 Oct 2019 23:43:05 +0200 Subject: [PATCH 01/28] Add covariance and Pearson's correlation --- Lib/statistics.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 0d747b3d6c0531..4b1f371a1fb9f7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -815,6 +815,55 @@ def pstdev(data, mu=None): return var.sqrt() except AttributeError: return math.sqrt(var) + + +# === Measures of joint variability === + +# See https://en.wikipedia.org/wiki/Covariance +# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + + +def covariance(x, y): + """Covariance + + >>> x = list(range(9)) + >>> y = list(range(3)) * 3 + >>> covariance(x, z) + 0.75 + """ + n = len(x) + if len(y) != n: + raise StatisticsError('covariance requires that x and y have same number of data points') + if n < 1: + raise StatisticsError('covariance requires at least one data point') + xbar = mean(x) + ybar = mean(y) + total = fsum((x - xbar) * (y - ybar)) + return total / n + + +def pearsons_correlation(x, y): + """Pearson's correlation coefficient + + >>> x = list(range(9)) + >>> y = list(reverded(x)) + >>> pearsons_correlation(x, x) + 1 + >>> pearsons_correlation(x, y) + -1 + >>> z = list(range(3)) * 3 + >>> pearsons_correlation(x, z) + 0.31 + """ + n = len(x) + if len(y) != n: + raise StatisticsError('pearsons_correlation requires that x and y have same number of data points') + if n < 1: + raise StatisticsError('pearsons_correlation requires at least one data point') + cov = covariance(x, y) + stdx = stdev(x) + stdy = stdev(y) + return cov / (stdx * stdy) ## Normal Distribution ##################################################### From fab1daba4866a679fbe975ba8471bbbed08d2469 Mon Sep 17 00:00:00 2001 From: Tim Date: Tue, 15 Oct 2019 23:47:41 +0200 Subject: [PATCH 02/28] Fix doctest --- Lib/statistics.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 4b1f371a1fb9f7..786455e1ea1683 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -828,7 +828,7 @@ def covariance(x, y): >>> x = list(range(9)) >>> y = list(range(3)) * 3 - >>> covariance(x, z) + >>> covariance(x, y) 0.75 """ n = len(x) From c06839b79c57993ff931ba7300e76fab44fe756b Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 16 Oct 2019 00:09:41 +0200 Subject: [PATCH 03/28] Bugfixes --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 786455e1ea1683..be6860e485416d 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -838,8 +838,8 @@ def covariance(x, y): raise StatisticsError('covariance requires at least one data point') xbar = mean(x) ybar = mean(y) - total = fsum((x - xbar) * (y - ybar)) - return total / n + total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + return total / (n - 1) def pearsons_correlation(x, y): @@ -848,9 +848,9 @@ def pearsons_correlation(x, y): >>> x = list(range(9)) >>> y = list(reverded(x)) >>> pearsons_correlation(x, x) - 1 + 1.0 >>> pearsons_correlation(x, y) - -1 + -1.0 >>> z = list(range(3)) * 3 >>> pearsons_correlation(x, z) 0.31 From 485f55f3035915749b3802100e746150e1dbb7ff Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 16 Oct 2019 00:20:48 +0200 Subject: [PATCH 04/28] Bugfix in doctest --- Lib/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index be6860e485416d..4eab1f750cbe11 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -846,14 +846,14 @@ def pearsons_correlation(x, y): """Pearson's correlation coefficient >>> x = list(range(9)) - >>> y = list(reverded(x)) + >>> y = list(reversed(x)) >>> pearsons_correlation(x, x) 1.0 >>> pearsons_correlation(x, y) -1.0 >>> z = list(range(3)) * 3 - >>> pearsons_correlation(x, z) - 0.31 + >>> 
pearsons_correlation(x, z) #doctest: +ELLIPSIS + 0.31... """ n = len(x) if len(y) != n: From 3707299b00753de4955ebea28c1204abdf484a09 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 16 Oct 2019 00:33:30 +0200 Subject: [PATCH 05/28] Fix doctest --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 4eab1f750cbe11..af1fca6a21390a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -852,8 +852,8 @@ def pearsons_correlation(x, y): >>> pearsons_correlation(x, y) -1.0 >>> z = list(range(3)) * 3 - >>> pearsons_correlation(x, z) #doctest: +ELLIPSIS - 0.31... + >>> pearsons_correlation(x, z) + 0.31622776601683794 """ n = len(x) if len(y) != n: From 50975098b310afce3e5a1b9f3071a94662110258 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 09:27:54 +0200 Subject: [PATCH 06/28] Improve documentation --- Lib/statistics.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index af1fca6a21390a..d662aa9d10b234 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -73,6 +73,26 @@ 2.5 +Statistics for relations between two variables +---------------------------------------------- + +================== ==================================================== +Function Description +================== ==================================================== +covariance Sample covariance for two variables. +correlation Pearson's correlation coefficient for two variables. 
+================== ==================================================== + +Calculate covariance and Pearson's correlation for two variables: + +>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] +>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] +>>> covariance(x, y) +0.75 +>>> correlation(x, y) +0.31622776601683794 + + Exceptions ---------- @@ -815,21 +835,22 @@ def pstdev(data, mu=None): return var.sqrt() except AttributeError: return math.sqrt(var) - + # === Measures of joint variability === # See https://en.wikipedia.org/wiki/Covariance # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient - + def covariance(x, y): """Covariance - - >>> x = list(range(9)) - >>> y = list(range(3)) * 3 + + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] + >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] >>> covariance(x, y) 0.75 + """ n = len(x) if len(y) != n: @@ -842,18 +863,16 @@ def covariance(x, y): return total / (n - 1) -def pearsons_correlation(x, y): +def correlation(x, y): """Pearson's correlation coefficient - - >>> x = list(range(9)) + + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = list(reversed(x)) - >>> pearsons_correlation(x, x) + >>> correlation(x, x) 1.0 - >>> pearsons_correlation(x, y) + >>> correlation(x, y) -1.0 - >>> z = list(range(3)) * 3 - >>> pearsons_correlation(x, z) - 0.31622776601683794 + """ n = len(x) if len(y) != n: From 2dbabdb7b1bc55a8f2d5039f140386159e977e0f Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2019 08:08:15 +0000 Subject: [PATCH 07/28] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst diff --git a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst 
b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst new file mode 100644 index 00000000000000..ef5394bd340b95 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst @@ -0,0 +1 @@ +Covariance and Pearson's correlation functionality was added to statistics module \ No newline at end of file From 411f2c07a15c963ff9e096db6080a508cabab599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 10:15:17 +0200 Subject: [PATCH 08/28] Update docstring --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index d662aa9d10b234..3ec79fac491d6c 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -876,9 +876,9 @@ def correlation(x, y): """ n = len(x) if len(y) != n: - raise StatisticsError('pearsons_correlation requires that x and y have same number of data points') + raise StatisticsError('correlation requires that x and y have same number of data points') if n < 1: - raise StatisticsError('pearsons_correlation requires at least one data point') + raise StatisticsError('correlation requires at least one data point') cov = covariance(x, y) stdx = stdev(x) stdy = stdev(y) From 4ed02eb71876987e0555deafd78376fc21a3062f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 10:35:35 +0200 Subject: [PATCH 09/28] Improved exceptions handling --- Lib/statistics.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 3ec79fac491d6c..90f1b827cc3717 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -843,7 +843,7 @@ def pstdev(data, mu=None): # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient -def covariance(x, y): +def covariance(x, y, /): """Covariance >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -854,16 +854,16 @@ def covariance(x, y): """ n = len(x) if len(y) != n: - raise 
StatisticsError('covariance requires that x and y have same number of data points') - if n < 1: - raise StatisticsError('covariance requires at least one data point') + raise StatisticsError('covariance requires that both variables have same number of data points') + if n < 2: + raise StatisticsError('covariance requires at least two data points') xbar = mean(x) ybar = mean(y) total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) return total / (n - 1) -def correlation(x, y): +def correlation(x, y, /): """Pearson's correlation coefficient >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -876,13 +876,16 @@ def correlation(x, y): """ n = len(x) if len(y) != n: - raise StatisticsError('correlation requires that x and y have same number of data points') + raise StatisticsError('correlation requires that both variables have same number of data points') if n < 1: - raise StatisticsError('correlation requires at least one data point') + raise StatisticsError('correlation requires at least two data points') cov = covariance(x, y) stdx = stdev(x) stdy = stdev(y) - return cov / (stdx * stdy) + try: + return cov / (stdx * stdy) + except ZeroDivisionError: + raise StatisticsError('standard deviation of at least one of the variables is zero') ## Normal Distribution ##################################################### From 917f9afa42346e385c1a2d9f83f95df70f63a3fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 10:39:51 +0200 Subject: [PATCH 10/28] Updated Misc/ACKS --- Misc/ACKS | 1 + 1 file changed, 1 insertion(+) diff --git a/Misc/ACKS b/Misc/ACKS index d8e2630814a869..0e4d26cd52f829 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1835,6 +1835,7 @@ David Wolever Klaus-Juergen Wolf Dan Wolfe Richard Wolff +Tymoteusz Wołodźko Adam Woodbeck Steven Work Gordon Worley From 30c284266ed2448b3456dbee49674cfe962a09b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 10:43:06 +0200 Subject: 
[PATCH 11/28] Fix exception handling --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 90f1b827cc3717..66d9bed09b78da 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -877,7 +877,7 @@ def correlation(x, y, /): n = len(x) if len(y) != n: raise StatisticsError('correlation requires that both variables have same number of data points') - if n < 1: + if n < 2: raise StatisticsError('correlation requires at least two data points') cov = covariance(x, y) stdx = stdev(x) From 1b9c3856b1826113b9340416cf08e9e0ebe94fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 11:01:20 +0200 Subject: [PATCH 12/28] Update __all__ --- Lib/statistics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 66d9bed09b78da..12914eee760eaf 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -118,6 +118,8 @@ 'quantiles', 'stdev', 'variance', + 'correlation', + 'covariance', ] import math From b3afc6383e33ac5d0a23dc9714ab53b586a66ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 11:23:37 +0200 Subject: [PATCH 13/28] Add linear_regression --- Lib/statistics.py | 42 +++++++++++++++++-- .../2019-10-16-08-08-14.bpo-38490.QbDXEF.rst | 2 +- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 12914eee760eaf..9fb61956b0dfc8 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -81,6 +81,7 @@ ================== ==================================================== covariance Sample covariance for two variables. correlation Pearson's correlation coefficient for two variables. +linear_regression Intercept and slope fot simple linear regression. 
================== ==================================================== Calculate covariance and Pearson's correlation for two variables: @@ -89,8 +90,10 @@ >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] >>> covariance(x, y) 0.75 ->>> correlation(x, y) -0.31622776601683794 +>>> correlation(x, y) #doctest: +ELLIPSIS +0.31622776601... +>>> linear_regression(x, y) #doctest: +ELLIPSIS +(1.5, 0.099999999999...) Exceptions @@ -120,6 +123,7 @@ 'variance', 'correlation', 'covariance', + 'linear_regression', ] import math @@ -839,10 +843,11 @@ def pstdev(data, mu=None): return math.sqrt(var) -# === Measures of joint variability === +# === Statistics for relations between two variables === # See https://en.wikipedia.org/wiki/Covariance # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient +# https://en.wikipedia.org/wiki/Simple_linear_regression def covariance(x, y, /): @@ -890,6 +895,37 @@ def correlation(x, y, /): raise StatisticsError('standard deviation of at least one of the variables is zero') +def linear_regression(x, y, /): + """Calculate intercept and slope for simple linear regression + + Return the ``(intercept, slope)`` tuple of the simple linear regression parameters. Simple linear regression + describes relationship between *x* and *y* in terms of linear function:: + + y = intercept + slope * x + noise + + where ``intercept`` and ``slope`` are the regression parameters that are estimated, and + noise term is an unobserved random variable, the unexplained variability of the data. + + >>> x = [1, 2, 3, 4, 5] + >>> noise = NormalDist().samples(5, seed=42) + >>> y = [2 + 3 * x[i] + noise[i] for i in range(5)] + >>> linear_regression(x, y) #doctest: +ELLIPSIS + (1.75684970486..., 3.09078914170...) 
+ + """ + n = len(x) + if len(y) != n: + raise StatisticsError('linear regression requires that both variables have same number of data points') + if n < 2: + raise StatisticsError('linear regression requires at least two data points') + try: + slope = correlation(x, y) * ( stdev(y) / stdev(x) ) + except ZeroDivisionError: + raise StatisticsError('standard deviation of at least one of the variables is zero') + intercept = mean(y) - slope * mean(x) + return intercept, slope + + ## Normal Distribution ##################################################### diff --git a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst index ef5394bd340b95..d4ae43b20bca3a 100644 --- a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst +++ b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst @@ -1 +1 @@ -Covariance and Pearson's correlation functionality was added to statistics module \ No newline at end of file +Covariance, Pearson's correlation, and simple linear regression functionality was added to statistics module \ No newline at end of file From e29e9d130e605210399a6c29a07b9c55256455cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 16 Oct 2019 13:18:42 +0200 Subject: [PATCH 14/28] Improve docstrings --- Lib/statistics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9fb61956b0dfc8..69ce6db4da2e79 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -873,6 +873,11 @@ def covariance(x, y, /): def correlation(x, y, /): """Pearson's correlation coefficient + Return the Pearson's correlation coefficient for two variables. Pearson's correlation coefficient *r* + takes values between +1 and -1. 
It measures the strength and direction of the linear relationship between + two variables, where +1 means very strong, positive linear relationship, -1 very strong, negative linear + relationship, and 0 no linear relationship. + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = list(reversed(x)) >>> correlation(x, x) From b3c726e7f4c6321c311f3c428d6c70483ec05e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Thu, 17 Oct 2019 08:10:36 +0200 Subject: [PATCH 15/28] Better naming of arguments in linear_regression --- Lib/statistics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 69ce6db4da2e79..60154ab62a7aab 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -900,34 +900,35 @@ def correlation(x, y, /): raise StatisticsError('standard deviation of at least one of the variables is zero') -def linear_regression(x, y, /): +def linear_regression(regressor, dependent_variable): """Calculate intercept and slope for simple linear regression Return the ``(intercept, slope)`` tuple of the simple linear regression parameters. Simple linear regression - describes relationship between *x* and *y* in terms of linear function:: + describes relationship between *regressor* and *dependent variable* in terms of linear function:: - y = intercept + slope * x + noise + dependent_variable = intercept + slope * regressor + noise where ``intercept`` and ``slope`` are the regression parameters that are estimated, and - noise term is an unobserved random variable, the unexplained variability of the data. + noise term is an unobserved random variable, the unexplained variability of the data (the difference between + prediction and the actual values of dependent variable). 
- >>> x = [1, 2, 3, 4, 5] + >>> regressor = [1, 2, 3, 4, 5] >>> noise = NormalDist().samples(5, seed=42) - >>> y = [2 + 3 * x[i] + noise[i] for i in range(5)] - >>> linear_regression(x, y) #doctest: +ELLIPSIS + >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)] + >>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS (1.75684970486..., 3.09078914170...) """ - n = len(x) - if len(y) != n: + n = len(regressor) + if len(dependent_variable) != n: raise StatisticsError('linear regression requires that both variables have same number of data points') if n < 2: raise StatisticsError('linear regression requires at least two data points') try: - slope = correlation(x, y) * ( stdev(y) / stdev(x) ) + slope = correlation(regressor, dependent_variable) * ( stdev(dependent_variable) / stdev(regressor) ) except ZeroDivisionError: raise StatisticsError('standard deviation of at least one of the variables is zero') - intercept = mean(y) - slope * mean(x) + intercept = mean(dependent_variable) - slope * mean(regressor) return intercept, slope From 256de8d3a10ab94ef7b387140e3e1ccb63c38170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Thu, 17 Oct 2019 08:16:01 +0200 Subject: [PATCH 16/28] Fix docstring --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 60154ab62a7aab..81cd480261b0a7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -874,7 +874,7 @@ def correlation(x, y, /): """Pearson's correlation coefficient Return the Pearson's correlation coefficient for two variables. Pearson's correlation coefficient *r* - takes values between +1 and -1. It measures the strength and direction of the linear relationship between + takes values between -1 and +1. 
It measures the strength and direction of the linear relationship between two variables, where +1 means very strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. From f44c4688adaf97050c53faec80396d622175a3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Thu, 17 Oct 2019 15:45:18 +0200 Subject: [PATCH 17/28] Improve code formatting --- Lib/statistics.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 81cd480261b0a7..bf9e8cfebe68cb 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -84,7 +84,8 @@ linear_regression Intercept and slope fot simple linear regression. ================== ==================================================== -Calculate covariance and Pearson's correlation for two variables: +Calculate covariance, Pearson's correlation, and simple linear regression +for two variables: >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] @@ -853,10 +854,18 @@ def pstdev(data, mu=None): def covariance(x, y, /): """Covariance + Calculates covariance of two variables *x* and *y*. Covariance is + a measure of the joint variability of two variables. + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] >>> covariance(x, y) 0.75 + >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1] + >>> covariance(x, z) + -7.5 + >>> covariance(z, x) + -7.5 """ n = len(x) @@ -873,13 +882,14 @@ def covariance(x, y, /): def correlation(x, y, /): """Pearson's correlation coefficient - Return the Pearson's correlation coefficient for two variables. Pearson's correlation coefficient *r* - takes values between -1 and +1. It measures the strength and direction of the linear relationship between - two variables, where +1 means very strong, positive linear relationship, -1 very strong, negative linear + Return the Pearson's correlation coefficient for two variables. 
Pearson's + correlation coefficient *r* takes values between -1 and +1. It measures the + strength and direction of the linear relationship, where +1 means very + strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] - >>> y = list(reversed(x)) + >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] >>> correlation(x, x) 1.0 >>> correlation(x, y) @@ -903,14 +913,16 @@ def correlation(x, y, /): def linear_regression(regressor, dependent_variable): """Calculate intercept and slope for simple linear regression - Return the ``(intercept, slope)`` tuple of the simple linear regression parameters. Simple linear regression - describes relationship between *regressor* and *dependent variable* in terms of linear function:: + Return the ``(intercept, slope)`` tuple of the simple linear regression + parameters. Simple linear regression describes relationship between + *regressor* and *dependent variable* in terms of linear function:: dependent_variable = intercept + slope * regressor + noise - where ``intercept`` and ``slope`` are the regression parameters that are estimated, and - noise term is an unobserved random variable, the unexplained variability of the data (the difference between - prediction and the actual values of dependent variable). + where ``intercept`` and ``slope`` are the regression parameters that are + estimated, and noise term is an unobserved random variable, the unexplained + variability of the data (the difference between prediction and the actual + values of dependent variable). 
>>> regressor = [1, 2, 3, 4, 5] >>> noise = NormalDist().samples(5, seed=42) @@ -925,7 +937,10 @@ def linear_regression(regressor, dependent_variable): if n < 2: raise StatisticsError('linear regression requires at least two data points') try: - slope = correlation(regressor, dependent_variable) * ( stdev(dependent_variable) / stdev(regressor) ) + cor = correlation(regressor, dependent_variable) + stdx = stdev(regressor) + stdy = stdev(dependent_variable) + slope = cor * (stdy / stdx) except ZeroDivisionError: raise StatisticsError('standard deviation of at least one of the variables is zero') intercept = mean(dependent_variable) - slope * mean(regressor) From 117f56789ca18fc47bc2a4b3f516b8ecb48bb505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Thu, 17 Oct 2019 16:07:37 +0200 Subject: [PATCH 18/28] Documentation for the new functionalities --- Doc/library/statistics.rst | 98 ++++++++++++++++++++++++++++++++++++++ Lib/statistics.py | 9 ++-- 2 files changed, 103 insertions(+), 4 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index a702b2463c39b1..4b3d64db6d963b 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -68,6 +68,17 @@ tends to deviate from the typical or average values. :func:`variance` Sample variance of data. ======================= ============================================= +Statistics for relations between two variables +---------------------------------------------- + +These functions calculate statistics regarding relations between two random variables. + +========================= ===================================================== +:func:`covariance` Sample covariance for two variables. +:func:`correlation` Pearson's correlation coefficient for two variables. +:func:`linear_regression` Intercept and slope fot simple linear regression. 
+========================= ===================================================== + Function details ---------------- @@ -559,6 +570,93 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.8 +.. function:: covariance(x, y, /) + + Calculates covariance of two variables *x* and *y*. Covariance is + a measure of the joint variability of two variables. + + Raises :exc:`StatisticsError` if both variables have same number of data + points, or if any of the variables has less then two data points. + + Examples: + + .. doctest:: + + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] + >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] + >>> covariance(x, y) + 0.75 + >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1] + >>> covariance(x, z) + -7.5 + >>> covariance(z, x) + -7.5 + +.. function:: correlation(x, y, /) + + Return the `Pearson's correlation coefficient + `_ + for two variables. Pearson's correlation coefficient *r* takes values + between -1 and +1. It measures the strength and direction of the linear + relationship, where +1 means very strong, positive linear relationship, + -1 very strong, negative linear relationship, and 0 no linear relationship. + + Raises :exc:`StatisticsError` if both variables have same number of data + points, or if any of the variables has less then two data points, or if + :func:`stdev` of any of the two variables is equal to zero (it is constant). + + Examples: + + .. doctest:: + + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] + >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] + >>> correlation(x, x) + 1.0 + >>> correlation(x, y) + -1.0 + +.. function:: linear_regression(regressor, dependent_variable) + + Return the ``(intercept, slope)`` tuple of the `simple linear regression + `_ + parameters. 
Simple linear regression describes relationship between + *regressor* and *dependent variable* in terms of linear function: + + *dependent_variable = intercept + slope * regressor + noise* + + where ``intercept`` and ``slope`` are the regression parameters that are + estimated, and noise term is an unobserved random variable, for the + variability of the data that was not explained byt the linear regression + (it is equal to the difference between prediction and the actual values + of dependent variable). + + Raises :exc:`StatisticsError` if both variables have same number of data + points, or if any of the variables has less then two data points, or if + :func:`stdev` of any of the two variables is equal to zero (it is constant). + + For example, if we took the data on the data on `release dates of the Monty + Python films `_, and used + it to predict the cumulative number of Monty Python films produced, we could + predict what would be the number of films they could have made till year + 2019, assuming that they kept the pace. + + .. doctest:: + + >>> year = [1971, 1975, 1979, 1982, 1983] + >>> films_total = [1, 2, 3, 4, 5] + >>> intercept, slope = linear_regression(year, films_total) + >>> round(intercept + slope * 2019) + 16 + + We could also use it to predict how many Monty Python films existed when + Brian Cohen was born. + + .. doctest:: + + >>> round(intercept + slope * 1) + -610 + Exceptions ---------- diff --git a/Lib/statistics.py b/Lib/statistics.py index bf9e8cfebe68cb..ca2dea595916b7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -919,10 +919,11 @@ def linear_regression(regressor, dependent_variable): dependent_variable = intercept + slope * regressor + noise - where ``intercept`` and ``slope`` are the regression parameters that are - estimated, and noise term is an unobserved random variable, the unexplained - variability of the data (the difference between prediction and the actual - values of dependent variable). 
+ where ``intercept`` and ``slope`` are the regression parameters that are + estimated, and noise term is an unobserved random variable, for the + variability of the data that was not explained byt the linear regression + (it is equal to the difference between prediction and the actual values + of dependent variable). >>> regressor = [1, 2, 3, 4, 5] >>> noise = NormalDist().samples(5, seed=42) From 3ad902abcead0ed0bdba53be02256b934c211b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Wed, 13 Nov 2019 22:36:45 +0100 Subject: [PATCH 19/28] Update Doc/library/statistics.rst Co-Authored-By: Tal Einat --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 4b3d64db6d963b..75164759be35e2 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -76,7 +76,7 @@ These functions calculate statistics regarding relations between two random vari ========================= ===================================================== :func:`covariance` Sample covariance for two variables. :func:`correlation` Pearson's correlation coefficient for two variables. -:func:`linear_regression` Intercept and slope fot simple linear regression. +:func:`linear_regression` Intercept and slope for simple linear regression. 
========================= ===================================================== From 5ed2846a08255a4fcb96e65fd06be3a5a80f39ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Fri, 15 Nov 2019 14:10:18 +0100 Subject: [PATCH 20/28] Fixed documentation after code review --- Doc/library/statistics.rst | 6 +++--- Lib/statistics.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 75164759be35e2..57d864e2798bf2 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -575,7 +575,7 @@ However, for reading convenience, most of the examples show sorted sequences. Calculates covariance of two variables *x* and *y*. Covariance is a measure of the joint variability of two variables. - Raises :exc:`StatisticsError` if both variables have same number of data + Raises :exc:`StatisticsError` if both variables have different number of data points, or if any of the variables has less then two data points. Examples: @@ -601,7 +601,7 @@ However, for reading convenience, most of the examples show sorted sequences. relationship, where +1 means very strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. - Raises :exc:`StatisticsError` if both variables have same number of data + Raises :exc:`StatisticsError` if both variables have different number of data points, or if any of the variables has less then two data points, or if :func:`stdev` of any of the two variables is equal to zero (it is constant). @@ -631,7 +631,7 @@ However, for reading convenience, most of the examples show sorted sequences. (it is equal to the difference between prediction and the actual values of dependent variable). 
- Raises :exc:`StatisticsError` if both variables have same number of data + Raises :exc:`StatisticsError` if both variables have different number of data points, or if any of the variables has less then two data points, or if :func:`stdev` of any of the two variables is equal to zero (it is constant). diff --git a/Lib/statistics.py b/Lib/statistics.py index ca2dea595916b7..361a0a295f901f 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -81,7 +81,7 @@ ================== ==================================================== covariance Sample covariance for two variables. correlation Pearson's correlation coefficient for two variables. -linear_regression Intercept and slope fot simple linear regression. +linear_regression Intercept and slope for simple linear regression. ================== ==================================================== Calculate covariance, Pearson's correlation, and simple linear regression From 5979dbafbdf4ac3a8d6def900fcd1ea89db4d7a0 Mon Sep 17 00:00:00 2001 From: Tim Date: Sat, 16 Nov 2019 00:56:14 +0100 Subject: [PATCH 21/28] Initial fixes after code review --- Doc/library/statistics.rst | 22 +++---- Lib/statistics.py | 29 ++++---- Lib/test/test_statistics.py | 66 +++++++++++++++++++ .../2019-10-16-08-08-14.bpo-38490.QbDXEF.rst | 2 +- 4 files changed, 91 insertions(+), 28 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 57d864e2798bf2..ec6d367c5f89f3 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -572,11 +572,11 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: covariance(x, y, /) - Calculates covariance of two variables *x* and *y*. Covariance is - a measure of the joint variability of two variables. + Return the sample covariance of two inputs *x* and *y*. Covariance + is a measure of the joint variability of two inputs. 
- Raises :exc:`StatisticsError` if both variables have different number of data - points, or if any of the variables has less then two data points. + Both inputs must be of the same length (no less than two), otherwise + :exc:`StatisticsError` is raised. Examples: @@ -596,14 +596,14 @@ However, for reading convenience, most of the examples show sorted sequences. Return the `Pearson's correlation coefficient `_ - for two variables. Pearson's correlation coefficient *r* takes values + for two inputs. Pearson's correlation coefficient *r* takes values between -1 and +1. It measures the strength and direction of the linear relationship, where +1 means very strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. - Raises :exc:`StatisticsError` if both variables have different number of data - points, or if any of the variables has less then two data points, or if - :func:`stdev` of any of the two variables is equal to zero (it is constant). + Both inputs must be of the same length (no less than two), and :func:`stdev` + of both inputs needs to be greater then zero, otherwise :exc:`StatisticsError` + is raised. Examples: @@ -631,9 +631,9 @@ However, for reading convenience, most of the examples show sorted sequences. (it is equal to the difference between prediction and the actual values of dependent variable). - Raises :exc:`StatisticsError` if both variables have different number of data - points, or if any of the variables has less then two data points, or if - :func:`stdev` of any of the two variables is equal to zero (it is constant). + Both inputs must be of the same length (no less than two), and :func:`stdev` + of both inputs needs to be greater then zero, otherwise :exc:`StatisticsError` + is raised. 
For example, if we took the data on the data on `release dates of the Monty Python films `_, and used diff --git a/Lib/statistics.py b/Lib/statistics.py index 361a0a295f901f..0a2b8c92847404 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -93,8 +93,8 @@ 0.75 >>> correlation(x, y) #doctest: +ELLIPSIS 0.31622776601... ->>> linear_regression(x, y) #doctest: +ELLIPSIS -(1.5, 0.099999999999...) +>>> linear_regression(x, y) #doctest: +(1.5, 0.1) Exceptions @@ -844,7 +844,7 @@ def pstdev(data, mu=None): return math.sqrt(var) -# === Statistics for relations between two variables === +# === Statistics for relations between two inputs === # See https://en.wikipedia.org/wiki/Covariance # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient @@ -854,8 +854,8 @@ def pstdev(data, mu=None): def covariance(x, y, /): """Covariance - Calculates covariance of two variables *x* and *y*. Covariance is - a measure of the joint variability of two variables. + Return the covariance of two inputs *x* and *y*. Covariance is + a measure of the joint variability of two inputs. >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] @@ -870,7 +870,7 @@ def covariance(x, y, /): """ n = len(x) if len(y) != n: - raise StatisticsError('covariance requires that both variables have same number of data points') + raise StatisticsError('covariance requires that both inputs have same number of data points') if n < 2: raise StatisticsError('covariance requires at least two data points') xbar = mean(x) @@ -882,7 +882,7 @@ def covariance(x, y, /): def correlation(x, y, /): """Pearson's correlation coefficient - Return the Pearson's correlation coefficient for two variables. Pearson's + Return the Pearson's correlation coefficient for two inputs. Pearson's correlation coefficient *r* takes values between -1 and +1. 
It measures the strength and direction of the linear relationship, where +1 means very strong, positive linear relationship, -1 very strong, negative linear @@ -898,7 +898,7 @@ def correlation(x, y, /): """ n = len(x) if len(y) != n: - raise StatisticsError('correlation requires that both variables have same number of data points') + raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') cov = covariance(x, y) @@ -907,11 +907,11 @@ def correlation(x, y, /): try: return cov / (stdx * stdy) except ZeroDivisionError: - raise StatisticsError('standard deviation of at least one of the variables is zero') + raise StatisticsError('standard deviation of at least one of the inputs is zero') def linear_regression(regressor, dependent_variable): - """Calculate intercept and slope for simple linear regression + """Intercept and slope for simple linear regression Return the ``(intercept, slope)`` tuple of the simple linear regression parameters. 
Simple linear regression describes relationship between @@ -934,16 +934,13 @@ def linear_regression(regressor, dependent_variable): """ n = len(regressor) if len(dependent_variable) != n: - raise StatisticsError('linear regression requires that both variables have same number of data points') + raise StatisticsError('linear regression requires that both inputs have same number of data points') if n < 2: raise StatisticsError('linear regression requires at least two data points') try: - cor = correlation(regressor, dependent_variable) - stdx = stdev(regressor) - stdy = stdev(dependent_variable) - slope = cor * (stdy / stdx) + slope = covariance(regressor, dependent_variable) / variance(regressor) except ZeroDivisionError: - raise StatisticsError('standard deviation of at least one of the variables is zero') + raise StatisticsError('standard deviation of regressor is zero') intercept = mean(dependent_variable) - slope * mean(regressor) return intercept, slope diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index af26473e8fdfc3..f817a246082125 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2312,6 +2312,72 @@ def test_error_cases(self): quantiles([10, None, 30], n=4) # data is non-numeric +class TestMultivariateStatistics(unittest.TestCase): + + def test_unequal_size(self): + for x, y in [ + ([1, 2, 3], [1, 2]), + ([1, 2], [1, 2, 3]), + ]: + with self.assertRaises(self.module.StatisticsError): + statistics.covariance(x, y) + with self.assertRaises(self.module.StatisticsError): + statistics.covariance(x, y) + with self.assertRaises(self.module.StatisticsError): + statistics.linear_regression(x, y) + + def test_small_sample(self): + for x, y in [ + ([], []), + ([], [1, 2,]), + ([1, 2,], []), + ([1,], [1,]), + ([1,], [1, 2,]), + ([1, 2,], [1,]), + ]: + with self.assertRaises(self.module.StatisticsError): + statistics.covariance(x, y) + with self.assertRaises(self.module.StatisticsError): + statistics.covariance(x, y) 
+ with self.assertRaises(self.module.StatisticsError): + statistics.linear_regression(x, y) + + +class TestCorrelation(unittest.TestCase): + + def test_results(self): + for x, y, result in [ + ([1, 2, 3], [1, 2, 3], 1), + ([1, 2, 3], [3, 2, 1], -1), + ([1, 2, 3], [1, 2, 1], 0), + ([1, 2, 3], [1, 3, 2], 0.5), + ]: + self.assertAlmostEqual(statistics.correlation(x, y), result) + + +class TestLinearRegression(unittest.TestCase): + + def test_constant_input(self): + x = [1, 1, 1,] + y = [1, 2, 3,] + with self.assertRaises(self.module.StatisticsError): + statistics.linear_regression(x, y) + + def test_results(self): + for x, y, true_intercept, true_slope in [ + ([1, 2, 3], [0, 0, 0], 0, 0), + ([1, 2, 3], [1, 2, 3], 0, 1), + ([1, 2, 3], [100, 100, 100], 100, 0), + ([1, 2, 3], [12, 14, 16], 10, 2), + ([1, 2, 3], [-1, -2, -3], 0, -1), + ([1, 2, 3], [21, 22, 23], 20, 1), + ([1, 2, 3], [5.1, 5.2, 5.3], 5, 0.1), + ]: + intercept, slope = statistics.linear_regression(x, y) + self.assertAlmostEqual(intercept, true_intercept) + self.assertAlmostEqual(slope, true_slope) + + class TestNormalDist: # General note on precision: The pdf(), cdf(), and overlap() methods diff --git a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst index d4ae43b20bca3a..82b9e33be0e304 100644 --- a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst +++ b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst @@ -1 +1 @@ -Covariance, Pearson's correlation, and simple linear regression functionality was added to statistics module \ No newline at end of file +Covariance, Pearson's correlation, and simple linear regression functionality was added to statistics module. Patch by Tymoteusz Wołodźko. 
\ No newline at end of file From 4bd459dc5d5cdac4fed6fa37650be08396e63bb4 Mon Sep 17 00:00:00 2001 From: Tim Date: Sat, 16 Nov 2019 22:01:15 +0100 Subject: [PATCH 22/28] Fix documentation, typos, add new unit tests after review --- Doc/library/statistics.rst | 25 ++++++++-------- Lib/statistics.py | 29 ++++++++++--------- Lib/test/test_statistics.py | 58 ++++++++++++++++++++++--------------- 3 files changed, 62 insertions(+), 50 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index ec6d367c5f89f3..5cde3a7731fd49 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -68,10 +68,10 @@ tends to deviate from the typical or average values. :func:`variance` Sample variance of data. ======================= ============================================= -Statistics for relations between two variables ----------------------------------------------- +Statistics for relations between two inputs +------------------------------------------- -These functions calculate statistics regarding relations between two random variables. +These functions calculate statistics regarding relations between two inputs. ========================= ===================================================== :func:`covariance` Sample covariance for two variables. @@ -601,9 +601,8 @@ However, for reading convenience, most of the examples show sorted sequences. relationship, where +1 means very strong, positive linear relationship, -1 very strong, negative linear relationship, and 0 no linear relationship. - Both inputs must be of the same length (no less than two), and :func:`stdev` - of both inputs needs to be greater then zero, otherwise :exc:`StatisticsError` - is raised. + Both inputs must be of the same length (no less than two), and must + not be constant, otherwise :exc:`StatisticsError` is raised. Examples: @@ -620,20 +619,20 @@ However, for reading convenience, most of the examples show sorted sequences.
Return the ``(intercept, slope)`` tuple of the `simple linear regression `_ - parameters. Simple linear regression describes relationship between - *regressor* and *dependent variable* in terms of linear function: + parameters estimated using ordinary least squares. Simple linear + regression describes relationship between *regressor* and + *dependent variable* in terms of linear function: *dependent_variable = intercept + slope * regressor + noise* where ``intercept`` and ``slope`` are the regression parameters that are estimated, and noise term is an unobserved random variable, for the - variability of the data that was not explained byt the linear regression + variability of the data that was not explained by the linear regression (it is equal to the difference between prediction and the actual values of dependent variable). - Both inputs must be of the same length (no less than two), and :func:`stdev` - of both inputs needs to be greater then zero, otherwise :exc:`StatisticsError` - is raised. + Both inputs must be of the same length (no less than two), and regressor + must not be constant, otherwise :exc:`StatisticsError` is raised. For example, if we took the data on the data on `release dates of the Monty Python films `_, and used @@ -649,7 +648,7 @@ However, for reading convenience, most of the examples show sorted sequences. >>> round(intercept + slope * 2019) 16 - We could also use it to predict how many Monty Python films existed when + We could also use it to "predict" how many Monty Python films existed when Brian Cohen was born. ..
doctest:: diff --git a/Lib/statistics.py b/Lib/statistics.py index 0a2b8c92847404..79e37236518d23 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -73,8 +73,8 @@ 2.5 -Statistics for relations between two variables ----------------------------------------------- +Statistics for relations between two inputs +------------------------------------------- ================== ==================================================== Function Description @@ -85,7 +85,7 @@ ================== ==================================================== Calculate covariance, Pearson's correlation, and simple linear regression -for two variables: +for two inputs: >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] @@ -854,8 +854,8 @@ def pstdev(data, mu=None): def covariance(x, y, /): """Covariance - Return the covariance of two inputs *x* and *y*. Covariance is - a measure of the joint variability of two inputs. + Return the sample covariance of two inputs *x* and *y*. Covariance + is a measure of the joint variability of two inputs. >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] @@ -907,23 +907,24 @@ def correlation(x, y, /): try: return cov / (stdx * stdy) except ZeroDivisionError: - raise StatisticsError('standard deviation of at least one of the inputs is zero') + raise StatisticsError('at least one of the inputs is constant') def linear_regression(regressor, dependent_variable): """Intercept and slope for simple linear regression Return the ``(intercept, slope)`` tuple of the simple linear regression - parameters. Simple linear regression describes relationship between - *regressor* and *dependent variable* in terms of linear function:: + parameters estimated using ordinary least squares. 
Simple linear + regression describes relationship between *regressor* and + *dependent variable* in terms of linear function:: dependent_variable = intercept + slope * regressor + noise - where ``intercept`` and ``slope`` are the regression parameters that are - estimated, and noise term is an unobserved random variable, for the - variability of the data that was not explained byt the linear regression - (it is equal to the difference between prediction and the actual values - of dependent variable). + where ``intercept`` and ``slope`` are the regression parameters that are + estimated, and noise term is an unobserved random variable, for the + variability of the data that was not explained by the linear regression + (it is equal to the difference between prediction and the actual values + of dependent variable). >>> regressor = [1, 2, 3, 4, 5] >>> noise = NormalDist().samples(5, seed=42) @@ -940,7 +941,7 @@ def linear_regression(regressor, dependent_variable): try: slope = covariance(regressor, dependent_variable) / variance(regressor) except ZeroDivisionError: - raise StatisticsError('standard deviation of regressor is zero') + raise StatisticsError('regressor is constant') intercept = mean(dependent_variable) - slope * mean(regressor) return intercept, slope diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index f817a246082125..7cff797ab024cf 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2312,21 +2312,21 @@ def test_error_cases(self): quantiles([10, None, 30], n=4) # data is non-numeric -class TestMultivariateStatistics(unittest.TestCase): - - def test_unequal_size(self): +class TestBivariateStatistics(unittest.TestCase): + + def test_unequal_size_error(self): for x, y in [ ([1, 2, 3], [1, 2]), ([1, 2], [1, 2, 3]), ]: - with self.assertRaises(self.module.StatisticsError): - statistics.covariance(x, y) - with self.assertRaises(self.module.StatisticsError): + with self.assertRaises(statistics.StatisticsError): 
statistics.covariance(x, y) - with self.assertRaises(self.module.StatisticsError): + with self.assertRaises(statistics.StatisticsError): + statistics.correlation(x, y) + with self.assertRaises(statistics.StatisticsError): statistics.linear_regression(x, y) - - def test_small_sample(self): + + def test_small_sample_error(self): for x, y in [ ([], []), ([], [1, 2,]), @@ -2335,34 +2335,46 @@ def test_small_sample(self): ([1,], [1, 2,]), ([1, 2,], [1,]), ]: - with self.assertRaises(self.module.StatisticsError): - statistics.covariance(x, y) - with self.assertRaises(self.module.StatisticsError): + with self.assertRaises(statistics.StatisticsError): statistics.covariance(x, y) - with self.assertRaises(self.module.StatisticsError): + with self.assertRaises(statistics.StatisticsError): + statistics.correlation(x, y) + with self.assertRaises(statistics.StatisticsError): statistics.linear_regression(x, y) - - -class TestCorrelation(unittest.TestCase): - + + +class TestCorrelationAndCovariance(unittest.TestCase): + def test_results(self): for x, y, result in [ ([1, 2, 3], [1, 2, 3], 1), + ([1, 2, 3], [-1, -2, -3], -1), ([1, 2, 3], [3, 2, 1], -1), ([1, 2, 3], [1, 2, 1], 0), ([1, 2, 3], [1, 3, 2], 0.5), ]: self.assertAlmostEqual(statistics.correlation(x, y), result) + self.assertAlmostEqual(statistics.covariance(x, y), result) + + def test_different_scales(self): + x = [1, 2, 3] + y = [10, 30, 20] + self.assertAlmostEqual(statistics.correlation(x, y), 0.5) + self.assertAlmostEqual(statistics.covariance(x, y), 5) + + y = [.1, .2, .3] + self.assertAlmostEqual(statistics.correlation(x, y), 1) + self.assertAlmostEqual(statistics.covariance(x, y), 0.1) + - class TestLinearRegression(unittest.TestCase): - - def test_constant_input(self): + + def test_constant_input_error(self): x = [1, 1, 1,] y = [1, 2, 3,] - with self.assertRaises(self.module.StatisticsError): + with self.assertRaises(statistics.StatisticsError): statistics.linear_regression(x, y) - + def test_results(self): for x, 
y, true_intercept, true_slope in [ ([1, 2, 3], [0, 0, 0], 0, 0), @@ -2376,7 +2388,7 @@ def test_results(self): intercept, slope = statistics.linear_regression(x, y) self.assertAlmostEqual(intercept, true_intercept) self.assertAlmostEqual(slope, true_slope) - + class TestNormalDist: From be1cfb8b6ba7cf21c16f8c500088ab6d829b3d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Sat, 16 Nov 2019 22:17:57 +0100 Subject: [PATCH 23/28] patchcheck fix --- Lib/test/test_statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 7cff797ab024cf..3a57ac078c077b 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2314,7 +2314,7 @@ def test_error_cases(self): class TestBivariateStatistics(unittest.TestCase): - def test_unequal_size_error(self): + def test_unequal_size_error(self): for x, y in [ ([1, 2, 3], [1, 2]), ([1, 2], [1, 2, 3]), @@ -2355,13 +2355,13 @@ def test_results(self): ]: self.assertAlmostEqual(statistics.correlation(x, y), result) self.assertAlmostEqual(statistics.covariance(x, y), result) - + def test_different_scales(self): x = [1, 2, 3] y = [10, 30, 20] self.assertAlmostEqual(statistics.correlation(x, y), 0.5) self.assertAlmostEqual(statistics.covariance(x, y), 5) - + y = [.1, .2, .3] self.assertAlmostEqual(statistics.correlation(x, y), 1) self.assertAlmostEqual(statistics.covariance(x, y), 0.1) From eeeac98250af5579399e8f6ce1dcb9ffcaefa152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Tue, 6 Oct 2020 09:58:58 +0200 Subject: [PATCH 24/28] In linear_regression use positional arguments & return namedtuple --- Lib/statistics.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 79e37236518d23..a38b4d23081ba8 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -94,7 +94,7 @@ >>> correlation(x, y) #doctest: 
+ELLIPSIS 0.31622776601... >>> linear_regression(x, y) #doctest: -(1.5, 0.1) +LinearRegression(intercept=1.5, slope=0.1) Exceptions @@ -137,7 +137,7 @@ from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from operator import itemgetter -from collections import Counter +from collections import Counter, namedtuple # === Exceptions === @@ -910,7 +910,10 @@ def correlation(x, y, /): raise StatisticsError('at least one of the inputs is constant') -def linear_regression(regressor, dependent_variable): +LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope']) + + +def linear_regression(regressor, dependent_variable, /): """Intercept and slope for simple linear regression Return the ``(intercept, slope)`` tuple of the simple linear regression @@ -930,7 +933,7 @@ def linear_regression(regressor, dependent_variable): >>> noise = NormalDist().samples(5, seed=42) >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)] >>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS - (1.75684970486..., 3.09078914170...) + LinearRegression(intercept=1.75684970486..., slope=3.09078914170...) 
""" n = len(regressor) @@ -943,7 +946,7 @@ def linear_regression(regressor, dependent_variable): except ZeroDivisionError: raise StatisticsError('regressor is constant') intercept = mean(dependent_variable) - slope * mean(regressor) - return intercept, slope + return LinearRegression(intercept=intercept, slope=slope) ## Normal Distribution ##################################################### From a3e1ba0eeb518d3329d698fa6612b9df1f591522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Tue, 6 Oct 2020 14:01:50 +0200 Subject: [PATCH 25/28] Correct documentation --- Doc/library/statistics.rst | 2 +- Lib/statistics.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 5cde3a7731fd49..3593ef194193b0 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -617,7 +617,7 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: linear_regression(regressor, dependent_variable) - Return the ``(intercept, slope)`` tuple of the `simple linear regression + Return the intercept and slope of `simple linear regression `_ parameters estimated using ordinary least squares. Simple linear regression describes relationship between *regressor* and diff --git a/Lib/statistics.py b/Lib/statistics.py index a38b4d23081ba8..2e86a43bccf246 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -916,7 +916,7 @@ def correlation(x, y, /): def linear_regression(regressor, dependent_variable, /): """Intercept and slope for simple linear regression - Return the ``(intercept, slope)`` tuple of the simple linear regression + Return the intercept and slope of simple linear regression parameters estimated using ordinary least squares. 
Simple linear regression describes relationship between *regressor* and *dependent variable* in terms of linear function:: @@ -929,6 +929,8 @@ def linear_regression(regressor, dependent_variable, /): (it is equal to the difference between prediction and the actual values of dependent variable). + The parameters are returned as a named tuple. + >>> regressor = [1, 2, 3, 4, 5] >>> noise = NormalDist().samples(5, seed=42) >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)] From 9bf170d2048a904d9c8e25012a66557d995be91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Mon, 2 Nov 2020 13:13:08 +0100 Subject: [PATCH 26/28] Update version information --- Doc/library/statistics.rst | 6 ++++++ Doc/whatsnew/3.10.rst | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 88a634122201f0..c14c1fe2970780 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -597,6 +597,8 @@ However, for reading convenience, most of the examples show sorted sequences. >>> covariance(z, x) -7.5 + .. versionadded:: 3.10 + .. function:: correlation(x, y, /) Return the `Pearson's correlation coefficient @@ -620,6 +622,8 @@ However, for reading convenience, most of the examples show sorted sequences. >>> correlation(x, y) -1.0 + .. versionadded:: 3.10 + .. function:: linear_regression(regressor, dependent_variable) Return the intercept and slope of `simple linear regression @@ -661,6 +665,8 @@ However, for reading convenience, most of the examples show sorted sequences. >>> round(intercept + slope * 1) -610 + .. 
versionadded:: 3.10 + Exceptions ---------- diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index 60dee0c6bd1651..108fded1d4e900 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -217,6 +217,13 @@ The :mod:`shelve` module now uses :data:`pickle.DEFAULT_PROTOCOL` by default instead of :mod:`pickle` protocol ``3`` when creating shelves. (Contributed by Zackery Spytz in :issue:`34204`.) +statistics +---------- + +Added :func:`covariance`, Pearson's :func:`correlation`, and simple :func:`linear_regression` +functionalities to the :mod:`statistics` module. +(Contributed by Tymoteusz Wołodźko in :issue:`38490`.) + sys --- From 48a2bea183cf1ac1b8c70291545430dbc3aef213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Mon, 2 Nov 2020 20:17:33 +0100 Subject: [PATCH 27/28] Update Doc/whatsnew/3.10.rst Co-authored-by: Tal Einat --- Doc/whatsnew/3.10.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index 108fded1d4e900..3a4fb70fdde21e 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -220,8 +220,9 @@ instead of :mod:`pickle` protocol ``3`` when creating shelves. statistics ---------- -Added :func:`covariance`, Pearson's :func:`correlation`, and simple :func:`linear_regression` -functionalities to the :mod:`statistics` module. +Added :func:`~statistics.covariance`, Pearson's +:func:`~statistics.correlation`, and simple +:func:`~statistics.linear_regression` functions. (Contributed by Tymoteusz Wołodźko in :issue:`38490`.) 
sys From 37b742ea368933f35cec256894a15c1ceb28acdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?= Date: Tue, 3 Nov 2020 08:33:16 +0100 Subject: [PATCH 28/28] Fixed markup in linear regression description --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index c14c1fe2970780..1453fc1b06b5b0 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -632,7 +632,7 @@ However, for reading convenience, most of the examples show sorted sequences. regression describes relationship between *regressor* and *dependent variable* in terms of linear function: - *dependent_variable = intercept + slope * regressor + noise* + *dependent_variable = intercept + slope \* regressor + noise* where ``intercept`` and ``slope`` are the regression parameters that are estimated, and noise term is an unobserved random variable, for the