Skip to content

Commit b1afe65

Browse files
committed
DOC: more updates
1 parent aa52a1d commit b1afe65

File tree

1 file changed

+61
-33
lines changed

1 file changed

+61
-33
lines changed

galleries/examples/statistics/histogram_normalization.py

Lines changed: 61 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@
8686

8787
fig, ax = plt.subplots()
8888
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability (per dx)')
90-
ax.set_xlabel('x bins (dx=0.5)')
89+
ax.set_ylabel('Probability density [$V^{-1}$]')
90+
ax.set_xlabel('x bins (dx=0.5 $V$)')
9191

9292
# %%
9393
# This normalization can be a little hard to interpret when just exploring the
@@ -115,32 +115,58 @@
115115
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
116116

117117
# %%
118-
# to make the point very obvious, consider bins that do not have the same
119-
# spacing. By normalizing by density, we preserve the shape of the
120-
# distribution, whereas if we do not, then the wider bins have much higher
121-
# values than the thin bins:
118+
# If we don't use ``density=True``, we need to scale the expected probability
119+
# distribution function by both the length of the data and the width of the
120+
# bins:
121+
122+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
123+
dx = 0.1
124+
xbins = np.arange(-4, 4, dx)
125+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
126+
127+
# scale and plot the expected pdf:
128+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
129+
ax['False'].set_ylabel('Count per bin')
130+
ax['False'].set_xlabel('x bins [V]')
131+
ax['False'].legend()
132+
133+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
134+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
135+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
136+
ax['True'].set_xlabel('x bins [$V$]')
137+
ax['True'].legend()
138+
139+
# %%
140+
# One advantage of using the density is therefore that the shape and amplitude
141+
# of the histogram does not depend on the size of the bins. Consider an
142+
# extreme case where the bins do not have the same width. In this example, the
143+
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
144+
# normalizing by density, we preserve the shape of the distribution, whereas if
145+
# we do not, then the wider bins have much higher counts than the thinner bins:
122146

123147
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
124148
dx = 0.1
125149
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
126-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
150+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
151+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
127152
ax['False'].set_ylabel('Count per bin')
128-
ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
153+
ax['False'].set_xlabel('x bins [V]')
154+
ax['False'].legend()
129155

130-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
131-
ax['True'].plot(xpdf, pdf)
132-
ax['True'].set_ylabel('Probability (per dx)')
133-
ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
156+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
157+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
158+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
159+
ax['True'].set_xlabel('x bins [$V$]')
160+
ax['True'].legend()
134161

135162
# %%
136-
# Using *density* also makes it easier to compare histograms with different bin
137-
# widths. Note that in order to get the theoretical distribution, we must
138-
# multiply the distribution by the number of data points and the bin width
163+
# Similarly, if we want to compare histograms with different bin widths, we may
164+
# want to use ``density=True``:
139165

140166
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
141167

142168
# expected PDF
143-
ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
169+
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
144170

145171
for nn, dx in enumerate([0.1, 0.4, 1.2]):
146172
xbins = np.arange(-4, 4, dx)
@@ -151,33 +177,35 @@
151177
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
152178

153179
# Labels:
154-
ax['False'].set_xlabel('x bins')
180+
ax['False'].set_xlabel('x bins [$V$]')
155181
ax['False'].set_ylabel('Count per bin')
156-
ax['True'].set_ylabel('Probability (per dx)')
157-
ax['True'].set_xlabel('x bins')
158-
ax['True'].legend(fontsize='small')
182+
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
183+
ax['True'].set_xlabel('x bins [$V$]')
184+
ax['True'].legend(fontsize='small', title='bin width:')
159185

160186
# %%
161-
162187
# Sometimes people want to normalize so that the sum of counts is one. This is
163-
# not done with the *density* kwarg, but rather we can get this effects if we
164-
# set the *weights* to 1/N. Note, however, that the amplitude of the histogram
165-
# still depends on width of the bins:
188+
# analogous to a `probability mass function
189+
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
190+
# variable where the sum of probabilities for all the values equals one. Using
191+
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
192+
# Note that the amplitude of this normalized histogram still depends on
193+
# the width and/or the number of the bins:
166194

167195
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
168196

169197
for nn, dx in enumerate([0.1, 0.4, 1.2]):
170198
xbins = np.arange(-4, 4, dx)
171199
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
172200
histtype='step', label=f'{dx}')
173-
ax.set_xlabel('x bins')
201+
ax.set_xlabel('x bins [$V$]')
174202
ax.set_ylabel('Bin count / N')
175-
ax.legend(fontsize='small')
203+
ax.legend(fontsize='small', title='bin width:')
176204

177205
# %%
178-
# The true value of normalizing is if you do want to compare two distributions
179-
# that have different sized populations. Here we compare the distribution of
180-
# ``xdata`` with a population of 1000, and ``xdata2`` with 100 members.
206+
# The value of normalizing histograms is comparing two distributions that have
207+
# different sized populations. Here we compare the distribution of ``xdata``
208+
# with a population of 1000, and ``xdata2`` with 100 members.
181209

182210
xdata2 = rng.normal(size=100)
183211

@@ -189,22 +217,22 @@
189217
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
190218
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
191219
ax['no_norm'].set_ylabel('Counts')
192-
ax['no_norm'].set_xlabel('x bins')
220+
ax['no_norm'].set_xlabel('x bins [$V$]')
193221
ax['no_norm'].set_title('No normalization')
194222

195223
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
196224
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
197-
ax['density'].set_ylabel('Probability (per dx)')
225+
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
198226
ax['density'].set_title('Density=True')
199-
ax['density'].set_xlabel('x bins')
227+
ax['density'].set_xlabel('x bins [$V$]')
200228

201229
ax['weight'].hist(xdata, bins=xbins, histtype='step',
202230
weights=1 / len(xdata) * np.ones(len(xdata)),
203231
label='N=1000')
204232
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
205233
weights=1 / len(xdata2) * np.ones(len(xdata2)),
206234
label='N=100')
207-
ax['weight'].set_xlabel('x bins')
235+
ax['weight'].set_xlabel('x bins [$V$]')
208236
ax['weight'].set_ylabel('Counts / N')
209237
ax['weight'].legend(fontsize='small')
210238
ax['weight'].set_title('Weight = 1/N')

0 commit comments

Comments
 (0)