From 0a1d35050453aeda84c42c5acf1cfa326ed837d0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 00:58:38 -0500 Subject: [PATCH 01/12] Incorrect first draft --- Lib/statistics.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index a14c48e89814bd..c83022639e6d69 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -188,7 +188,7 @@ def _sum(data, start=0): partials = {d: n} partials_get = partials.get T = _coerce(int, type(start)) - for typ, values in groupby(data, type): + for typ, values in groupby(data, type): # XXX Are these sorted T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): count += 1 @@ -730,18 +730,25 @@ def _ss(data, c=None): if c is not None: T, total, count = _sum((d := x - c) * d for x in data) return (T, total) - # Compute the mean accurate to within 1/2 ulp - c = mean(data) - # Initial computation for the sum of square deviations - T, total, count = _sum((d := x - c) * d for x in data) - # Correct any remaining inaccuracy in the mean c. - # The following sum should mathematically equal zero, - # but due to the final rounding of the mean, it may not. - U, error, count2 = _sum((x - c) for x in data) - assert count == count2 - correction = error * error / len(data) - total -= correction - assert not total < 0, 'negative sum of square deviations: %f' % total + T, total, count = _sum(data) + c = total / count # Exact fraction + cn = c.numerator + cd = c.denominator + partials = {} + partials_get = partials.get + for n, d in map(_exact_ratio, data): + dev_numerator = n * cd - cn * d + dev_denominator = d * cd + dev_numerator *= dev_numerator + dev_denominator *= dev_denominator + partials[d] = partials_get(dev_denominator, 0) + dev_numerator + if None in partials: + # The sum will be a NAN or INF. We can ignore all the finite + # partials, and just look at this special one. + total = partials[None] + assert not _isfinite(total) + else: + total = sum(Fraction(n, d) for d, n in sorted(partials.items())) return (T, total) From d8714ccde4e13b382c6fef76881fb3460f127461 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 01:09:25 -0500 Subject: [PATCH 02/12] Exact ss --- Lib/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index c83022639e6d69..81fc6231107dc3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -739,16 +739,16 @@ def _ss(data, c=None): for n, d in map(_exact_ratio, data): dev_numerator = n * cd - cn * d dev_denominator = d * cd - dev_numerator *= dev_numerator + dev_numerator *= dev_numerator dev_denominator *= dev_denominator - partials[d] = partials_get(dev_denominator, 0) + dev_numerator + partials[dev_denominator] = partials_get(dev_denominator, 0) + dev_numerator if None in partials: # The sum will be a NAN or INF. We can ignore all the finite # partials, and just look at this special one. total = partials[None] assert not _isfinite(total) else: - total = sum(Fraction(n, d) for d, n in sorted(partials.items())) + total = sum(Fraction(n, d) for d, n in partials.items()) return (T, total) From 2108d4dfd76cb11112500c14f3c644630adc1de8 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 01:18:34 -0500 Subject: [PATCH 03/12] Add test --- Lib/test/test_statistics.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 64ebd0e8cb9f84..4e7d0b91035446 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2101,6 +2101,13 @@ def test_decimals(self): self.assertEqual(result, exact) self.assertIsInstance(result, Decimal) + def test_accuracy_bug_20499(self): + data = [0, 0, 1] + exact = 2 / 9 + result = self.func(data) + self.assertEqual(result, exact) + self.assertIsInstance(result, float) + class TestVariance(VarianceStdevMixin, NumericTestCase, UnivariateTypeMixin): # Tests for sample variance. From da6c6cb8712697f0510c6c7066f5253af76b448d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 01:20:09 -0500 Subject: [PATCH 04/12] Add blurb --- .../NEWS.d/next/Library/2021-09-08-01-19-31.bpo-20499.tSxx8Y.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2021-09-08-01-19-31.bpo-20499.tSxx8Y.rst diff --git a/Misc/NEWS.d/next/Library/2021-09-08-01-19-31.bpo-20499.tSxx8Y.rst b/Misc/NEWS.d/next/Library/2021-09-08-01-19-31.bpo-20499.tSxx8Y.rst new file mode 100644 index 00000000000000..cbbe61ac4a2697 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-09-08-01-19-31.bpo-20499.tSxx8Y.rst @@ -0,0 +1 @@ +Improve the speed and accuracy of statistics.pvariance(). From 69387c6d6fa205c38fe44f9a58781cbc3f6425c0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 01:27:54 -0500 Subject: [PATCH 05/12] Remove superfluous comment --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 81fc6231107dc3..0a27149be8f33f 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -188,7 +188,7 @@ def _sum(data, start=0): partials = {d: n} partials_get = partials.get T = _coerce(int, type(start)) - for typ, values in groupby(data, type): # XXX Are these sorted + for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): count += 1 From 39973fd60a04c0feebeeaed59b882168213ab87b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 09:14:00 -0500 Subject: [PATCH 06/12] Add another test --- Lib/statistics.py | 14 +++++--------- Lib/test/test_statistics.py | 7 +++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 0a27149be8f33f..f769175744cd95 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -732,23 +732,19 @@ def _ss(data, c=None): return (T, total) T, total, count = _sum(data) c = total / count # Exact fraction - cn = c.numerator - cd = c.denominator - partials = {} - partials_get = partials.get + cn, cd = c.as_integer_ratio() + partials = Counter() for n, d in map(_exact_ratio, data): dev_numerator = n * cd - cn * d - dev_denominator = d * cd - dev_numerator *= dev_numerator - dev_denominator *= dev_denominator - partials[dev_denominator] = partials_get(dev_denominator, 0) + dev_numerator + dev_denominator = d * cd # Defer squaring until summation + partials[dev_denominator] += dev_numerator * dev_numerator if None in partials: # The sum will be a NAN or INF. We can ignore all the finite # partials, and just look at this special one. total = partials[None] assert not _isfinite(total) else: - total = sum(Fraction(n, d) for d, n in partials.items()) + total = sum(Fraction(n, d*d) for d, n in partials.items()) return (T, total) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 4e7d0b91035446..c85fc39a26ad6d 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2148,6 +2148,13 @@ def test_center_not_at_mean(self): self.assertEqual(self.func(data), 0.5) self.assertEqual(self.func(data, xbar=2.0), 1.0) + def test_accuracy_bug_20499(self): + data = [0, 0, 2] + exact = 4 / 3 + result = self.func(data) + self.assertEqual(result, exact) + self.assertIsInstance(result, float) + class TestPStdev(VarianceStdevMixin, NumericTestCase): # Tests for population standard deviation. def setUp(self): From 829d5ecd8318243d905e79751d71a5d428de6329 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 09:24:50 -0500 Subject: [PATCH 07/12] Improve clairity --- Lib/statistics.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index f769175744cd95..ba931529a22464 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -731,20 +731,19 @@ def _ss(data, c=None): T, total, count = _sum((d := x - c) * d for x in data) return (T, total) T, total, count = _sum(data) - c = total / count # Exact fraction - cn, cd = c.as_integer_ratio() + mean_n, mean_d = (total / count).as_integer_ratio() partials = Counter() for n, d in map(_exact_ratio, data): - dev_numerator = n * cd - cn * d - dev_denominator = d * cd # Defer squaring until summation - partials[dev_denominator] += dev_numerator * dev_numerator + diff_n = n * mean_d - mean_n * d + diff_d = d * mean_d + partials[diff_d * diff_d] += diff_n * diff_n if None in partials: # The sum will be a NAN or INF. We can ignore all the finite # partials, and just look at this special one. total = partials[None] assert not _isfinite(total) else: - total = sum(Fraction(n, d*d) for d, n in partials.items()) + total = sum(Fraction(n, d) for d, n in partials.items()) return (T, total) From cc78aaf86d6818e5aafaee4353babea614bb5c60 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 10:17:40 -0500 Subject: [PATCH 08/12] Beautify the order of terms --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index ba931529a22464..857a143421bae4 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -734,7 +734,7 @@ def _ss(data, c=None): mean_n, mean_d = (total / count).as_integer_ratio() partials = Counter() for n, d in map(_exact_ratio, data): - diff_n = n * mean_d - mean_n * d + diff_n = n * mean_d - d * mean_n diff_d = d * mean_d partials[diff_d * diff_d] += diff_n * diff_n if None in partials: From c0b156043308baef9f3115e6031444defcce1895 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 14:17:33 -0500 Subject: [PATCH 09/12] Removed unused "start" argument from internal function --- Lib/statistics.py | 17 ++++++----------- Lib/test/test_statistics.py | 14 -------------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 857a143421bae4..34651ed5b04a7a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -147,21 +147,17 @@ class StatisticsError(ValueError): # === Private utilities === -def _sum(data, start=0): - """_sum(data [, start]) -> (type, sum, count) +def _sum(data): + """_sum(data) -> (type, sum, count) Return a high-precision sum of the given numeric data as a fraction, together with the type to be converted to and the count of items. - If optional argument ``start`` is given, it is added to the total. - If ``data`` is empty, ``start`` (defaulting to 0) is returned. - - Examples -------- - >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) - (, Fraction(11, 1), 5) + >>> _sum([3, 2.25, 4.5, -0.5, 0.25]) + (, Fraction(19, 2), 5) Some sources of round-off error will be avoided: @@ -184,10 +180,9 @@ def _sum(data, start=0): allowed. """ count = 0 - n, d = _exact_ratio(start) - partials = {d: n} + partials = {} partials_get = partials.get - T = _coerce(int, type(start)) + T = int for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index c85fc39a26ad6d..5cb055c981c448 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1250,20 +1250,14 @@ def test_empty_data(self): # Override test for empty data. for data in ([], (), iter([])): self.assertEqual(self.func(data), (int, Fraction(0), 0)) - self.assertEqual(self.func(data, 23), (int, Fraction(23), 0)) - self.assertEqual(self.func(data, 2.3), (float, Fraction(2.3), 0)) def test_ints(self): self.assertEqual(self.func([1, 5, 3, -4, -8, 20, 42, 1]), (int, Fraction(60), 8)) - self.assertEqual(self.func([4, 2, 3, -8, 7], 1000), - (int, Fraction(1008), 5)) def test_floats(self): self.assertEqual(self.func([0.25]*20), (float, Fraction(5.0), 20)) - self.assertEqual(self.func([0.125, 0.25, 0.5, 0.75], 1.5), - (float, Fraction(3.125), 4)) def test_fractions(self): self.assertEqual(self.func([Fraction(1, 1000)]*500), @@ -1284,14 +1278,6 @@ def test_compare_with_math_fsum(self): data = [random.uniform(-100, 1000) for _ in range(1000)] self.assertApproxEqual(float(self.func(data)[1]), math.fsum(data), rel=2e-16) - def test_start_argument(self): - # Test that the optional start argument works correctly. - data = [random.uniform(1, 1000) for _ in range(100)] - t = self.func(data)[1] - self.assertEqual(t+42, self.func(data, 42)[1]) - self.assertEqual(t-23, self.func(data, -23)[1]) - self.assertEqual(t+Fraction(1e20), self.func(data, 1e20)[1]) - def test_strings_fail(self): # Sum of strings should fail. self.assertRaises(TypeError, self.func, [1, 2, 3], '999') From c74648287fa56bbe5755b23df600b0cb4f246951 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 14:30:43 -0500 Subject: [PATCH 10/12] Modernize and speed-up _exact_ratio() internal function --- Lib/statistics.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 34651ed5b04a7a..fc0002523d3e73 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -247,27 +247,19 @@ def _exact_ratio(x): x is expected to be an int, Fraction, Decimal or float. """ try: - # Optimise the common case of floats. We expect that the most often - # used numeric type will be builtin floats, so try to make this as - # fast as possible. - if type(x) is float or type(x) is Decimal: - return x.as_integer_ratio() - try: - # x may be an int, Fraction, or Integral ABC. - return (x.numerator, x.denominator) - except AttributeError: - try: - # x may be a float or Decimal subclass. - return x.as_integer_ratio() - except AttributeError: - # Just give up? - pass + return x.as_integer_ratio() + except AttributeError: + pass except (OverflowError, ValueError): # float NAN or INF. assert not _isfinite(x) return (x, None) - msg = "can't convert type '{}' to numerator/denominator" - raise TypeError(msg.format(type(x).__name__)) + try: + # x may be an Integral ABC. + return (x.numerator, x.denominator) + except AttributeError: + msg = f"can't convert type '{type(x).__name__}' to numerator/denominator" + raise TypeError(msg) def _convert(value, T): From e7c46fa5b93e117dbedc6e184ad797835c34ffd0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 15:03:57 -0500 Subject: [PATCH 11/12] Remove unnecessary sort --- Lib/statistics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index fc0002523d3e73..93ef64a534c4a6 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -195,8 +195,7 @@ def _sum(data): assert not _isfinite(total) else: # Sum all the partial sums using builtin sum. - # FIXME is this faster if we sum them in order of the denominator? - total = sum(Fraction(n, d) for d, n in sorted(partials.items())) + total = sum(Fraction(n, d) for d, n in partials.items()) return (T, total, count) From 117ad6b0d7cb52b2808d2a9ada87afee7a0e8924 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 8 Sep 2021 16:09:51 -0500 Subject: [PATCH 12/12] Add accuracy comments to stdev() and pstdev(). --- Lib/statistics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 93ef64a534c4a6..13e5fe7fc0e861 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -833,6 +833,9 @@ def stdev(data, xbar=None): 1.0810874155219827 """ + # Fixme: Despite the exact sum of squared deviations, some inaccuracy + # remain because there are two rounding steps. The first occurs in + # the _convert() step for variance(), the second occurs in math.sqrt(). var = variance(data, xbar) try: return var.sqrt() @@ -849,6 +852,9 @@ def pstdev(data, mu=None): 0.986893273527251 """ + # Fixme: Despite the exact sum of squared deviations, some inaccuracy + # remain because there are two rounding steps. The first occurs in + # the _convert() step for pvariance(), the second occurs in math.sqrt(). var = pvariance(data, mu) try: return var.sqrt()