Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions Doc/library/statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,28 @@ of applications in statistics, including simulations and hypothesis testing.
compute the probability that a random variable *X* will be less than or
equal to *x*. Mathematically, it is written ``P(X <= x)``.

.. method:: NormalDist.overlap(other)

Compute the `overlapping coefficient (OVL)
<http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_
between two normal distributions.

Measures the agreement between two normal probability distributions.
Returns a value between 0.0 and 1.0 giving the overlapping area for
two probability density functions.

In this `example from John M. Linacre
<https://www.rasch.org/rmt/rmt101r.htm>`_ about 80% of each
distribution overlaps the other:

.. doctest::

>>> N1 = NormalDist(2.4, 1.6)
>>> N2 = NormalDist(3.2, 2.0)
>>> ovl = N1.overlap(N2)
>>> f'{ovl * 100.0 :.1f}%'
'80.4%'

Instances of :class:`NormalDist` support addition, subtraction,
multiplication and division by a constant. These operations
are used for translation and scaling. For example:
Expand Down Expand Up @@ -595,6 +617,16 @@ determine the percentage of students with scores between 1100 and 1200:
>>> f'{fraction * 100 :.1f}% score between 1100 and 1200'
'18.2% score between 1100 and 1200'

What percentage of men and women will have the same height in `two normally
distributed populations with known means and standard deviations
<http://www.usablestats.com/lessons/normal>`_?

>>> men = NormalDist(70, 4)
>>> women = NormalDist(65, 3.5)
>>> ovl = men.overlap(women)
>>> round(ovl * 100.0, 1)
50.3

To estimate the distribution for a model than isn't easy to solve
analytically, :class:`NormalDist` can generate input samples for a `Monte
Carlo simulation <https://en.wikipedia.org/wiki/Monte_Carlo_method>`_ of the
Expand Down
37 changes: 36 additions & 1 deletion Lib/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum



Expand Down Expand Up @@ -740,6 +740,41 @@ def cdf(self, x):
raise StatisticsError('cdf() not defined when sigma is zero')
return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))

def overlap(self, other):
'''Compute the overlapping coefficient (OVL) between two normal distributions.

Measures the agreement between two normal probability distributions.
Returns a value between 0.0 and 1.0 giving the overlapping area in
the two underlying probability density functions.

>>> N1 = NormalDist(2.4, 1.6)
>>> N2 = NormalDist(3.2, 2.0)
>>> N1.overlap(N2)
0.8035050657330205

'''
# See: "The overlapping coefficient as a measure of agreement between
# probability distributions and point estimation of the overlap of two
# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
# http://dx.doi.org/10.1080/03610928908830127
if not isinstance(other, NormalDist):
raise TypeError('Expected another NormalDist instance')
X, Y = self, other
if (Y.sigma, Y.mu) < (X.sigma, X.mu): # sort to assure commutativity
X, Y = Y, X
X_var, Y_var = X.variance, Y.variance
if not X_var or not Y_var:
raise StatisticsError('overlap() not defined when sigma is zero')
dv = Y_var - X_var
dm = fabs(Y.mu - X.mu)
if not dv:
return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0)
a = X.mu * Y_var - Y.mu * X_var
b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
x1 = (a + b) / dv
x2 = (a - b) / dv
return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))

@property
def mean(self):
'Arithmetic mean of the normal distribution'
Expand Down
62 changes: 62 additions & 0 deletions Lib/test/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2136,6 +2136,68 @@ def test_cdf(self):
self.assertEqual(X.cdf(float('Inf')), 1.0)
self.assertTrue(math.isnan(X.cdf(float('NaN'))))

def test_overlap(self):
NormalDist = statistics.NormalDist

# Match examples from Imman and Bradley
for X1, X2, published_result in [
(NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258),
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993),
]:
self.assertAlmostEqual(X1.overlap(X2), published_result, places=4)
self.assertAlmostEqual(X2.overlap(X1), published_result, places=4)

# Check against integration of the PDF
def overlap_numeric(X, Y, *, steps=8_192, z=5):
'Numerical integration cross-check for overlap() '
fsum = math.fsum
center = (X.mu + Y.mu) / 2.0
width = z * max(X.sigma, Y.sigma)
start = center - width
dx = 2.0 * width / steps
x_arr = [start + i*dx for i in range(steps)]
xp = list(map(X.pdf, x_arr))
yp = list(map(Y.pdf, x_arr))
total = max(fsum(xp), fsum(yp))
return fsum(map(min, xp, yp)) / total

for X1, X2 in [
# Examples from Imman and Bradley
(NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)),
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
# Example from https://www.rasch.org/rmt/rmt101r.htm
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
# Gender heights from http://www.usablestats.com/lessons/normal
(NormalDist(70, 4), NormalDist(65, 3.5)),
# Misc cases with equal standard deviations
(NormalDist(100, 15), NormalDist(110, 15)),
(NormalDist(-100, 15), NormalDist(110, 15)),
(NormalDist(-100, 15), NormalDist(-110, 15)),
# Misc cases with unequal standard deviations
(NormalDist(100, 12), NormalDist(110, 15)),
(NormalDist(100, 12), NormalDist(150, 15)),
(NormalDist(100, 12), NormalDist(150, 35)),
# Misc cases with small values
(NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)),
(NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)),
(NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)),
]:
self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5)
self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5)

# Error cases
X = NormalDist()
with self.assertRaises(TypeError):
X.overlap() # too few arguments
with self.assertRaises(TypeError):
X.overlap(X, X) # too may arguments
with self.assertRaises(TypeError):
X.overlap(None) # right operand not a NormalDist
with self.assertRaises(statistics.StatisticsError):
X.overlap(NormalDist(1, 0)) # right operand sigma is zero
with self.assertRaises(statistics.StatisticsError):
NormalDist(1, 0).overlap(X) # left operand sigma is zero

def test_properties(self):
X = statistics.NormalDist(100, 15)
self.assertEqual(X.mean, 100)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add overlap() method to statistics.NormalDist. Computes the overlapping
coefficient for two normal distributions.