Skip to content

Commit b1afe65

Browse files
committed
DOC: more updates
1 parent aa52a1d commit b1afe65

File tree

1 file changed

+61
-33
lines changed

1 file changed

+61
-33
lines changed

galleries/examples/statistics/histogram_normalization.py

Lines changed: 61 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@
8686

8787
fig, ax = plt.subplots()
8888
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability (per dx)')
90-
ax.set_xlabel('x bins (dx=0.5)')
89+
ax.set_ylabel('Probability density [$V^{-1}$]')
90+
ax.set_xlabel('x bins (dx=0.5 $V$)')
9191

9292
# %%
9393
# This normalization can be a little hard to interpret when just exploring the
@@ -115,32 +115,58 @@
115115
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
116116

117117
# %%
118-
# to make the point very obvious, consider bins that do not have the same
119-
# spacing. By normalizing by density, we preserve the shape of the
120-
# distribution, whereas if we do not, then the wider bins have much higher
121-
# values than the thin bins:
118+
# If we don't use ``density=True``, we need to scale the expected probability
119+
# distribution function by both the length of the data and the width of the
120+
# bins:
121+
122+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
123+
dx = 0.1
124+
xbins = np.arange(-4, 4, dx)
125+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
126+
127+
# scale and plot the expected pdf:
128+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
129+
ax['False'].set_ylabel('Count per bin')
130+
ax['False'].set_xlabel('x bins [V]')
131+
ax['False'].legend()
132+
133+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
134+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
135+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
136+
ax['True'].set_xlabel('x bins [$V$]')
137+
ax['True'].legend()
138+
139+
# %%
140+
# One advantage of using the density is therefore that the shape and amplitude
141+
# of the histogram does not depend on the size of the bins. Consider an
142+
# extreme case where the bins do not have the same width. In this example, the
143+
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
144+
# normalizing by density, we preserve the shape of the distribution, whereas if
145+
# we do not, then the wider bins have much higher counts than the thinner bins:
122146

123147
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
124148
dx = 0.1
125149
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
126-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
150+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
151+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
127152
ax['False'].set_ylabel('Count per bin')
128-
ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
153+
ax['False'].set_xlabel('x bins [V]')
154+
ax['False'].legend()
129155

130-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
131-
ax['True'].plot(xpdf, pdf)
132-
ax['True'].set_ylabel('Probability (per dx)')
133-
ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
156+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
157+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
158+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
159+
ax['True'].set_xlabel('x bins [$V$]')
160+
ax['True'].legend()
134161

135162
# %%
136-
# Using *density* also makes it easier to compare histograms with different bin
137-
# widths. Note that in order to get the theoretical distribution, we must
138-
# multiply the distribution by the number of data points and the bin width
163+
# Similarly, if we want to compare histograms with different bin widths, we may
164+
# want to use ``density=True``:
139165

140166
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
141167

142168
# expected PDF
143-
ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
169+
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
144170

145171
for nn, dx in enumerate([0.1, 0.4, 1.2]):
146172
xbins = np.arange(-4, 4, dx)
@@ -151,33 +177,35 @@
151177
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
152178

153179
# Labels:
154-
ax['False'].set_xlabel('x bins')
180+
ax['False'].set_xlabel('x bins [$V$]')
155181
ax['False'].set_ylabel('Count per bin')
156-
ax['True'].set_ylabel('Probability (per dx)')
157-
ax['True'].set_xlabel('x bins')
158-
ax['True'].legend(fontsize='small')
182+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
183+
ax['True'].set_xlabel('x bins [$V$]')
184+
ax['True'].legend(fontsize='small', title='bin width:')
159185

160186
# %%
161-
162187
# Sometimes people want to normalize so that the sum of counts is one. This is
163-
# not done with the *density* kwarg, but rather we can get this effects if we
164-
# set the *weights* to 1/N. Note, however, that the amplitude of the histogram
165-
# still depends on width of the bins:
188+
# analogous to a `probability mass function
189+
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
190+
# variable where the sum of probabilities for all the values equals one. Using
191+
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
192+
# Note that the amplitude of this normalized histogram still depends on
193+
# the width and/or the number of the bins:
166194

167195
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
168196

169197
for nn, dx in enumerate([0.1, 0.4, 1.2]):
170198
xbins = np.arange(-4, 4, dx)
171199
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
172200
histtype='step', label=f'{dx}')
173-
ax.set_xlabel('x bins')
201+
ax.set_xlabel('x bins [$V$]')
174202
ax.set_ylabel('Bin count / N')
175-
ax.legend(fontsize='small')
203+
ax.legend(fontsize='small', title='bin width:')
176204

177205
# %%
178-
# The true value of normalizing is if you do want to compare two distributions
179-
# that have different sized populations. Here we compare the distribution of
180-
# ``xdata`` with a population of 1000, and ``xdata2`` with 100 members.
206+
# The value of normalizing histograms is comparing two distributions that have
207+
# different sized populations. Here we compare the distribution of ``xdata``
208+
# with a population of 1000, and ``xdata2`` with 100 members.
181209

182210
xdata2 = rng.normal(size=100)
183211

@@ -189,22 +217,22 @@
189217
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
190218
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
191219
ax['no_norm'].set_ylabel('Counts')
192-
ax['no_norm'].set_xlabel('x bins')
220+
ax['no_norm'].set_xlabel('x bins [$V$]')
193221
ax['no_norm'].set_title('No normalization')
194222

195223
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
196224
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
197-
ax['density'].set_ylabel('Probability (per dx)')
225+
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
198226
ax['density'].set_title('Density=True')
199-
ax['density'].set_xlabel('x bins')
227+
ax['density'].set_xlabel('x bins [$V$]')
200228

201229
ax['weight'].hist(xdata, bins=xbins, histtype='step',
202230
weights=1 / len(xdata) * np.ones(len(xdata)),
203231
label='N=1000')
204232
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
205233
weights=1 / len(xdata2) * np.ones(len(xdata2)),
206234
label='N=100')
207-
ax['weight'].set_xlabel('x bins')
235+
ax['weight'].set_xlabel('x bins [$V$]')
208236
ax['weight'].set_ylabel('Counts / N')
209237
ax['weight'].legend(fontsize='small')
210238
ax['weight'].set_title('Weight = 1/N')

0 commit comments

Comments
 (0)