|
86 | 86 |
|
87 | 87 | fig, ax = plt.subplots() |
88 | 88 | ax.hist(xdata, bins=xbins, density=True, **style) |
89 | | -ax.set_ylabel('Probability (per dx)') |
90 | | -ax.set_xlabel('x bins (dx=0.5)') |
| 89 | +ax.set_ylabel('Probability density [$V^{-1}$]')
| 90 | +ax.set_xlabel('x bins (dx=0.5 $V$)') |
91 | 91 |
|
92 | 92 | # %% |
93 | 93 | # This normalization can be a little hard to interpret when just exploring the |
|
115 | 115 | pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2) |
116 | 116 |
|
117 | 117 | # %% |
118 | | -# to make the point very obvious, consider bins that do not have the same |
119 | | -# spacing. By normalizing by density, we preserve the shape of the |
120 | | -# distribution, whereas if we do not, then the wider bins have much higher |
121 | | -# values than the thin bins: |
| 118 | +# If we don't use ``density=True``, we need to scale the expected probability
| 119 | +# density function by both the length of the data and the width of the
| 120 | +# bins:
| 121 | + |
| 122 | +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
| 123 | +dx = 0.1 |
| 124 | +xbins = np.arange(-4, 4, dx) |
| 125 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
| 126 | + |
| 127 | +# scale and plot the expected pdf: |
| 128 | +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$') |
| 129 | +ax['False'].set_ylabel('Count per bin') |
| 130 | +ax['False'].set_xlabel('x bins [$V$]')
| 131 | +ax['False'].legend() |
| 132 | + |
| 133 | +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') |
| 134 | +ax['True'].plot(xpdf, pdf, label='$f_X(x)$') |
| 135 | +ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
| 136 | +ax['True'].set_xlabel('x bins [$V$]') |
| 137 | +ax['True'].legend() |
| 138 | + |
| 139 | +# %% |
| 140 | +# One advantage of using the density is therefore that the shape and amplitude |
| 141 | +# of the histogram does not depend on the size of the bins. Consider an |
| 142 | +# extreme case where the bins do not have the same width. In this example, the |
| 143 | +# bins below ``x=-1.25`` are six times wider than the rest of the bins. By |
| 144 | +# normalizing by density, we preserve the shape of the distribution, whereas if |
| 145 | +# we do not, then the wider bins have much higher counts than the thinner bins: |
122 | 146 |
|
123 | 147 | fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
124 | 148 | dx = 0.1 |
125 | 149 | xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)]) |
126 | | -ax['False'].hist(xdata, bins=xbins, density=False, histtype='step') |
| 150 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
| 151 | +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$') |
127 | 152 | ax['False'].set_ylabel('Count per bin') |
128 | | -ax['False'].set_xlabel('x bins (below -1.25 bins are wider)') |
| 153 | +ax['False'].set_xlabel('x bins [$V$]')
| 154 | +ax['False'].legend() |
129 | 155 |
|
130 | | -ax['True'].hist(xdata, bins=xbins, density=True, histtype='step') |
131 | | -ax['True'].plot(xpdf, pdf) |
132 | | -ax['True'].set_ylabel('Probability (per dx)') |
133 | | -ax['True'].set_xlabel('x bins (below -1.25 bins are wider)') |
| 156 | +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') |
| 157 | +ax['True'].plot(xpdf, pdf, label='$f_X(x)$') |
| 158 | +ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
| 159 | +ax['True'].set_xlabel('x bins [$V$]') |
| 160 | +ax['True'].legend() |
134 | 161 |
|
135 | 162 | # %% |
136 | | -# Using *density* also makes it easier to compare histograms with different bin |
137 | | -# widths. Note that in order to get the theoretical distribution, we must |
138 | | -# multiply the distribution by the number of data points and the bin width |
| 163 | +# Similarly, if we want to compare histograms with different bin widths, we may |
| 164 | +# want to use ``density=True``: |
139 | 165 |
|
140 | 166 | fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') |
141 | 167 |
|
142 | 168 | # expected PDF |
143 | | -ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k') |
| 169 | +ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k') |
144 | 170 |
|
145 | 171 | for nn, dx in enumerate([0.1, 0.4, 1.2]): |
146 | 172 | xbins = np.arange(-4, 4, dx) |
|
151 | 177 | ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx) |
152 | 178 |
|
153 | 179 | # Labels: |
154 | | -ax['False'].set_xlabel('x bins') |
| 180 | +ax['False'].set_xlabel('x bins [$V$]') |
155 | 181 | ax['False'].set_ylabel('Count per bin') |
156 | | -ax['True'].set_ylabel('Probability (per dx)') |
157 | | -ax['True'].set_xlabel('x bins') |
158 | | -ax['True'].legend(fontsize='small') |
| 182 | +ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
| 183 | +ax['True'].set_xlabel('x bins [$V$]') |
| 184 | +ax['True'].legend(fontsize='small', title='bin width:') |
159 | 185 |
|
160 | 186 | # %% |
161 | | - |
162 | 187 | # Sometimes people want to normalize so that the sum of counts is one. This is |
163 | | -# not done with the *density* kwarg, but rather we can get this effects if we |
164 | | -# set the *weights* to 1/N. Note, however, that the amplitude of the histogram |
165 | | -# still depends on width of the bins: |
| 188 | +# analogous to a `probability mass function |
| 189 | +# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete |
| 190 | +# variable where the sum of probabilities for all the values equals one. Using |
| 191 | +# ``hist``, we can get this normalization if we set the *weights* to 1/N. |
| 192 | +# Note that the amplitude of this normalized histogram still depends on the
| 193 | +# width and/or number of the bins:
166 | 194 |
|
167 | 195 | fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3)) |
168 | 196 |
|
169 | 197 | for nn, dx in enumerate([0.1, 0.4, 1.2]): |
170 | 198 | xbins = np.arange(-4, 4, dx) |
171 | 199 | ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)), |
172 | 200 | histtype='step', label=f'{dx}') |
173 | | -ax.set_xlabel('x bins') |
| 201 | +ax.set_xlabel('x bins [$V$]') |
174 | 202 | ax.set_ylabel('Bin count / N') |
175 | | -ax.legend(fontsize='small') |
| 203 | +ax.legend(fontsize='small', title='bin width:') |
176 | 204 |
|
177 | 205 | # %% |
178 | | -# The true value of normalizing is if you do want to compare two distributions |
179 | | -# that have different sized populations. Here we compare the distribution of |
180 | | -# ``xdata`` with a population of 1000, and ``xdata2`` with 100 members. |
| 206 | +# The value of normalizing histograms is comparing two distributions that have |
| 207 | +# different sized populations. Here we compare the distribution of ``xdata`` |
| 208 | +# with a population of 1000, and ``xdata2`` with 100 members. |
181 | 209 |
|
182 | 210 | xdata2 = rng.normal(size=100) |
183 | 211 |
|
|
189 | 217 | ax['no_norm'].hist(xdata, bins=xbins, histtype='step') |
190 | 218 | ax['no_norm'].hist(xdata2, bins=xbins, histtype='step') |
191 | 219 | ax['no_norm'].set_ylabel('Counts') |
192 | | -ax['no_norm'].set_xlabel('x bins') |
| 220 | +ax['no_norm'].set_xlabel('x bins [$V$]') |
193 | 221 | ax['no_norm'].set_title('No normalization') |
194 | 222 |
|
195 | 223 | ax['density'].hist(xdata, bins=xbins, histtype='step', density=True) |
196 | 224 | ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True) |
197 | | -ax['density'].set_ylabel('Probability (per dx)') |
| 225 | +ax['density'].set_ylabel('Probability density [$V^{-1}$]') |
198 | 226 | ax['density'].set_title('Density=True') |
199 | | -ax['density'].set_xlabel('x bins') |
| 227 | +ax['density'].set_xlabel('x bins [$V$]') |
200 | 228 |
|
201 | 229 | ax['weight'].hist(xdata, bins=xbins, histtype='step', |
202 | 230 | weights=1 / len(xdata) * np.ones(len(xdata)), |
203 | 231 | label='N=1000') |
204 | 232 | ax['weight'].hist(xdata2, bins=xbins, histtype='step', |
205 | 233 | weights=1 / len(xdata2) * np.ones(len(xdata2)), |
206 | 234 | label='N=100') |
207 | | -ax['weight'].set_xlabel('x bins') |
| 235 | +ax['weight'].set_xlabel('x bins [$V$]') |
208 | 236 | ax['weight'].set_ylabel('Counts / N') |
209 | 237 | ax['weight'].legend(fontsize='small') |
210 | 238 | ax['weight'].set_title('Weight = 1/N') |
|
0 commit comments