Skip to content

Commit e917f2e

Browse files
authored
bpo-36546: Add more tests and expand docs (#13406)
1 parent 73934b9 commit e917f2e

2 files changed

Lines changed: 49 additions & 22 deletions

File tree

Doc/library/statistics.rst

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -511,22 +511,33 @@ However, for reading convenience, most of the examples show sorted sequences.
511511
is not at least 1.
512512

513513
The *dist* can be any iterable containing sample data or it can be an
514-
instance of a class that defines an :meth:`~inv_cdf` method.
514+
instance of a class that defines an :meth:`~inv_cdf` method. For meaningful
515+
results, the number of data points in *dist* should be larger than *n*.
515516
Raises :exc:`StatisticsError` if there are not at least two data points.
516517

517518
For sample data, the cut points are linearly interpolated from the
518519
two nearest data points. For example, if a cut point falls one-third
519520
of the distance between two sample values, ``100`` and ``112``, the
520-
cut-point will evaluate to ``104``. Other selection methods may be
521-
offered in the future (for example choose ``100`` as the nearest
522-
value or compute ``106`` as the midpoint). This might matter if
523-
there are too few samples for a given number of cut points.
524-
525-
If *method* is set to *inclusive*, *dist* is treated as population data.
526-
The minimum value is treated as the 0th percentile and the maximum
527-
value is treated as the 100th percentile. If *dist* is an instance of
528-
a class that defines an :meth:`~inv_cdf` method, setting *method*
529-
has no effect.
521+
cut-point will evaluate to ``104``.
522+
523+
The *method* for computing quantiles can be varied depending on
524+
whether the data in *dist* includes or excludes the lowest and
525+
highest possible values from the population.
526+
527+
The default *method* is "exclusive" and is used for data sampled from
528+
a population that can have more extreme values than found in the
529+
samples. The portion of the population falling below the *i-th* of
530+
*m* data points is computed as ``i / (m + 1)``.
531+
532+
Set the *method* to "inclusive" when describing population
533+
data or working with samples that include the extreme points. The minimum
534+
value in *dist* is treated as the 0th percentile and the maximum
535+
value is treated as the 100th percentile. The portion of the
536+
population falling below the *i-th* of *m* data points is computed as
537+
``(i - 1) / (m - 1)``.
538+
539+
If *dist* is an instance of a class that defines an
540+
:meth:`~inv_cdf` method, setting *method* has no effect.
530541

531542
.. doctest::
532543

Lib/test/test_statistics.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2161,17 +2161,18 @@ def test_specific_cases(self):
21612161
# Quantiles should be idempotent
21622162
if len(expected) >= 2:
21632163
self.assertEqual(quantiles(expected, n=n), expected)
2164-
# Cross-check against other methods
2165-
if len(data) >= n:
2166-
# After end caps are added, method='inclusive' should
2167-
# give the same result as method='exclusive' whenever
2168-
# there are more data points than desired cut points.
2169-
padded_data = [min(data) - 1000] + data + [max(data) + 1000]
2170-
self.assertEqual(
2171-
quantiles(data, n=n),
2172-
quantiles(padded_data, n=n, method='inclusive'),
2173-
(n, data),
2174-
)
2164+
# Cross-check against method='inclusive' which should give
2165+
# the same result after adding in minimum and maximum values
2166+
# extrapolated from the two lowest and two highest points.
2167+
sdata = sorted(data)
2168+
lo = 2 * sdata[0] - sdata[1]
2169+
hi = 2 * sdata[-1] - sdata[-2]
2170+
padded_data = data + [lo, hi]
2171+
self.assertEqual(
2172+
quantiles(data, n=n),
2173+
quantiles(padded_data, n=n, method='inclusive'),
2174+
(n, data),
2175+
)
21752176
# Invariant under translation and scaling
21762177
def f(x):
21772178
return 3.5 * x - 1234.675
@@ -2188,6 +2189,11 @@ def f(x):
21882189
actual = quantiles(statistics.NormalDist(), n=n)
21892190
self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
21902191
for e, a in zip(expected, actual)))
2192+
# Q2 agrees with median()
2193+
for k in range(2, 60):
2194+
data = random.choices(range(100), k=k)
2195+
q1, q2, q3 = quantiles(data)
2196+
self.assertEqual(q2, statistics.median(data))
21912197

21922198
def test_specific_cases_inclusive(self):
21932199
# Match results computed by hand and cross-checked
@@ -2233,6 +2239,11 @@ def f(x):
22332239
actual = quantiles(statistics.NormalDist(), n=n, method="inclusive")
22342240
self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
22352241
for e, a in zip(expected, actual)))
2242+
# Natural deciles
2243+
self.assertEqual(quantiles([0, 100], n=10, method='inclusive'),
2244+
[10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0])
2245+
self.assertEqual(quantiles(range(0, 101), n=10, method='inclusive'),
2246+
[10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0])
22362247
# Whenever n is smaller than the number of data points, running
22372248
# method='inclusive' should give the same result as method='exclusive'
22382249
# after the two included extreme points are removed.
@@ -2242,6 +2253,11 @@ def f(x):
22422253
data.remove(max(data))
22432254
expected = quantiles(data, n=32)
22442255
self.assertEqual(expected, actual)
2256+
# Q2 agrees with median()
2257+
for k in range(2, 60):
2258+
data = random.choices(range(100), k=k)
2259+
q1, q2, q3 = quantiles(data, method='inclusive')
2260+
self.assertEqual(q2, statistics.median(data))
22452261

22462262
def test_equal_inputs(self):
22472263
quantiles = statistics.quantiles

0 commit comments

Comments
 (0)