histogram.cpp source code [quantlib/ql/math/statistics/histogram.cpp]

1	/* -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
3	/*
4	Copyright (C) 2007 Gang Liang
5
6	This file is part of QuantLib, a free-software/open-source library
7	for financial quantitative analysts and developers - http://quantlib.org/
8
9	QuantLib is free software: you can redistribute it and/or modify it
10	under the terms of the QuantLib license. You should have received a
11	copy of the license along with this program; if not, please email
12	<quantlib-dev@lists.sf.net>. The license is also available online at
13	<http://quantlib.org/license.shtml>.
14
15	This program is distributed in the hope that it will be useful, but WITHOUT
16	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17	FOR A PARTICULAR PURPOSE. See the license for more details.
18	*/
19
20	#include <ql/math/statistics/histogram.hpp>
21	#include <ql/math/statistics/incrementalstatistics.hpp>
22	#include <ql/math/comparison.hpp>
23	#include <algorithm>
24
25	namespace QuantLib {
26
27	namespace {
28
29	/ The discontinuous quantiles use the method (type 8) as*
30	recommended by Hyndman and Fan (1996). The resulting
31	quantile estimates are approximately median-unbiased
32	regardless of the distribution of 'samples'.
33
34	If quantile function is called multiple times for the same
35	dataset, it is recommended to pre-sort the sample vector.
36	*/
37	Real quantile(const std::vector<Real>& samples, Real prob) {
38	Size nsample = samples.size();
39	QL_REQUIRE(prob >= `0.0` && prob <= `1.0`,
40	"Probability has to be in [0,1].");
41	QL_REQUIRE(nsample > `0`, "The sample size has to be positive." );
42
43	if (nsample == `1`)
44	return samples [`0`];
45
46	// two special cases: close to boundaries
47	const Real a = `1.` / `3`, b = `2`*a / (nsample+a);
48	if (prob < b)
49	return *std::min_element(first: samples.begin(), last: samples.end());
50	else if (prob > `1`-b)
51	return *std::max_element(first: samples.begin(), last: samples.end());
52
53	// general situation: middle region and nsample >= 2
54	Size index = static_cast<Size>(std::floor(x: (nsample+a)*prob+a));
55	std::vector<Real> sorted(index+`1`);
56	std::partial_sort_copy(first: samples.begin(), last: samples.end(),
57	result_first: sorted.begin(), result_last: sorted.end());
58
59	// use "index & index+1"th elements to interpolate the quantile
60	Real weight = nsample*prob + a - index;
61	return (`1`-weight) * sorted [index-`1`] + weight * sorted [index];
62	}
63
64	}
65
66
67	Size Histogram::bins() const {
68	return bins_;
69	}
70
71	const std::vector<Real>& Histogram::breaks() const {
72	return breaks_;
73	}
74
75	Histogram::Algorithm Histogram::algorithm() const {
76	return algorithm_;
77	}
78
79	bool Histogram::empty() const {
80	return bins_ == `0`;
81	}
82
83	Size Histogram::counts(Size i) const {
84	#if defined(QL_EXTRA_SAFETY_CHECKS)
85	return counts_.at(i);
86	#else
87	return counts_[i];
88	#endif
89	}
90
91	Real Histogram::frequency(Size i) const {
92	#if defined(QL_EXTRA_SAFETY_CHECKS)
93	return frequency_.at(i);
94	#else
95	return frequency_[i];
96	#endif
97	}
98
99	void Histogram::calculate() {
100	QL_REQUIRE(!data_.empty(), "no data given");
101
102	Real min = *std::min_element(first: data_.begin(), last: data_.end());
103	Real max = *std::max_element(first: data_.begin(), last: data_.end());
104
105	// calculate number of bins if necessary
106	if (bins_ == Null<Size>()) {
107	switch (algorithm_) {
108	case Sturges: {
109	bins_ = static_cast<Size>(
110	std::ceil(x: std::log(x: static_cast<Real>(data_.size()))
111	/std::log(x: `2.0`) + `1`));
112	break;
113	}
114	case FD: {
115	Real r1 = quantile(samples: data_, prob: `0.25`);
116	Real r2 = quantile(samples: data_, prob: `0.75`);
117	Real h = `2.0` * (r2-r1) * std::pow(x: static_cast<Real>(data_.size()), y: -`1.0`/`3.0`);
118	bins_ = static_cast<Size>(std::ceil(x: (max-min)/h));
119	break;
120	}
121	case Scott: {
122	IncrementalStatistics summary;
123	summary.addSequence(begin: data_.begin(), end: data_.end());
124	Real variance = summary.variance();
125	Real h = `3.5` * std::sqrt(x: variance)
126	* std::pow(x: static_cast<Real>(data_.size()), y: -`1.0`/`3.0`);
127	bins_ = static_cast<Size>(std::ceil(x: (max-min)/h));
128	break;
129	}
130	case None:
131	QL_FAIL("a bin-partition algorithm is required");
132	default:
133	QL_FAIL("unknown bin-partition algorithm");
134	};
135	bins_ = std::max<Size>(a: bins_,b: `1`);
136	}
137
138	if (breaks_.empty()) {
139	// set breaks if not provided
140	breaks_.resize(new_size: bins_-`1`);
141
142	// ensure breaks_ evenly span over the range of data_
143	// TODO: borrow the idea of pretty in R.
144	Real h = (max-min)/bins_;
145	for (Size i=`0`; i<breaks_.size(); ++i) {
146	breaks_[i] = min + (i+`1`)*h;
147	}
148	} else {
149	// or ensure they're sorted if given
150	std::sort(first: breaks_.begin(), last: breaks_.end());
151	auto end = std::unique(first: breaks_.begin(), last: breaks_.end(),
152	binary_pred: static_cast<bool (*)(Real, Real)>(close_enough));
153	breaks_.resize(new_size: end - breaks_.begin());
154	}
155
156	// finally, calculate counts and frequencies
157	counts_.resize(new_size: bins_);
158	std::fill(first: counts_.begin(), last: counts_.end(), value: `0`);
159
160	for (Real p : data_) {
161	bool processed = false;
162	for (Size i=`0`; i<breaks_.size(); ++i) {
163	if (p < breaks_[i]) {
164	++counts_[i];
165	processed = true;
166	break;
167	}
168	}
169	if (!processed)
170	++counts_[bins_-`1`];
171	}
172
173	frequency_.resize(new_size: bins_);
174
175	Size totalCounts = data_.size();
176	for (Size i=`0`; i<bins_; ++i)
177	frequency_[i] = static_cast<Real>(counts_[i])/totalCounts;
178	}
179
180	}
181

source code of quantlib/ql/math/statistics/histogram.cpp