1/* -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
3/*
4 Copyright (C) 2007 Gang Liang
5
6 This file is part of QuantLib, a free-software/open-source library
7 for financial quantitative analysts and developers - http://quantlib.org/
8
9 QuantLib is free software: you can redistribute it and/or modify it
10 under the terms of the QuantLib license. You should have received a
11 copy of the license along with this program; if not, please email
12 <quantlib-dev@lists.sf.net>. The license is also available online at
13 <http://quantlib.org/license.shtml>.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the license for more details.
18*/
19
20#include <ql/math/statistics/histogram.hpp>
21#include <ql/math/statistics/incrementalstatistics.hpp>
22#include <ql/math/comparison.hpp>
23#include <algorithm>
24
25namespace QuantLib {
26
27 namespace {
28
29 /* The discontinuous quantiles use the method (type 8) as
30 recommended by Hyndman and Fan (1996). The resulting
31 quantile estimates are approximately median-unbiased
32 regardless of the distribution of 'samples'.
33
34 If quantile function is called multiple times for the same
35 dataset, it is recommended to pre-sort the sample vector.
36 */
37 Real quantile(const std::vector<Real>& samples, Real prob) {
38 Size nsample = samples.size();
39 QL_REQUIRE(prob >= 0.0 && prob <= 1.0,
40 "Probability has to be in [0,1].");
41 QL_REQUIRE(nsample > 0, "The sample size has to be positive." );
42
43 if (nsample == 1)
44 return samples[0];
45
46 // two special cases: close to boundaries
47 const Real a = 1. / 3, b = 2*a / (nsample+a);
48 if (prob < b)
49 return *std::min_element(first: samples.begin(), last: samples.end());
50 else if (prob > 1-b)
51 return *std::max_element(first: samples.begin(), last: samples.end());
52
53 // general situation: middle region and nsample >= 2
54 Size index = static_cast<Size>(std::floor(x: (nsample+a)*prob+a));
55 std::vector<Real> sorted(index+1);
56 std::partial_sort_copy(first: samples.begin(), last: samples.end(),
57 result_first: sorted.begin(), result_last: sorted.end());
58
59 // use "index & index+1"th elements to interpolate the quantile
60 Real weight = nsample*prob + a - index;
61 return (1-weight) * sorted[index-1] + weight * sorted[index];
62 }
63
64 }
65
66
67 Size Histogram::bins() const {
68 return bins_;
69 }
70
71 const std::vector<Real>& Histogram::breaks() const {
72 return breaks_;
73 }
74
75 Histogram::Algorithm Histogram::algorithm() const {
76 return algorithm_;
77 }
78
79 bool Histogram::empty() const {
80 return bins_ == 0;
81 }
82
83 Size Histogram::counts(Size i) const {
84 #if defined(QL_EXTRA_SAFETY_CHECKS)
85 return counts_.at(i);
86 #else
87 return counts_[i];
88 #endif
89 }
90
91 Real Histogram::frequency(Size i) const {
92 #if defined(QL_EXTRA_SAFETY_CHECKS)
93 return frequency_.at(i);
94 #else
95 return frequency_[i];
96 #endif
97 }
98
99 void Histogram::calculate() {
100 QL_REQUIRE(!data_.empty(), "no data given");
101
102 Real min = *std::min_element(first: data_.begin(), last: data_.end());
103 Real max = *std::max_element(first: data_.begin(), last: data_.end());
104
105 // calculate number of bins if necessary
106 if (bins_ == Null<Size>()) {
107 switch (algorithm_) {
108 case Sturges: {
109 bins_ = static_cast<Size>(
110 std::ceil(x: std::log(x: static_cast<Real>(data_.size()))
111 /std::log(x: 2.0) + 1));
112 break;
113 }
114 case FD: {
115 Real r1 = quantile(samples: data_, prob: 0.25);
116 Real r2 = quantile(samples: data_, prob: 0.75);
117 Real h = 2.0 * (r2-r1) * std::pow(x: static_cast<Real>(data_.size()), y: -1.0/3.0);
118 bins_ = static_cast<Size>(std::ceil(x: (max-min)/h));
119 break;
120 }
121 case Scott: {
122 IncrementalStatistics summary;
123 summary.addSequence(begin: data_.begin(), end: data_.end());
124 Real variance = summary.variance();
125 Real h = 3.5 * std::sqrt(x: variance)
126 * std::pow(x: static_cast<Real>(data_.size()), y: -1.0/3.0);
127 bins_ = static_cast<Size>(std::ceil(x: (max-min)/h));
128 break;
129 }
130 case None:
131 QL_FAIL("a bin-partition algorithm is required");
132 default:
133 QL_FAIL("unknown bin-partition algorithm");
134 };
135 bins_ = std::max<Size>(a: bins_,b: 1);
136 }
137
138 if (breaks_.empty()) {
139 // set breaks if not provided
140 breaks_.resize(new_size: bins_-1);
141
142 // ensure breaks_ evenly span over the range of data_
143 // TODO: borrow the idea of pretty in R.
144 Real h = (max-min)/bins_;
145 for (Size i=0; i<breaks_.size(); ++i) {
146 breaks_[i] = min + (i+1)*h;
147 }
148 } else {
149 // or ensure they're sorted if given
150 std::sort(first: breaks_.begin(), last: breaks_.end());
151 auto end = std::unique(first: breaks_.begin(), last: breaks_.end(),
152 binary_pred: static_cast<bool (*)(Real, Real)>(close_enough));
153 breaks_.resize(new_size: end - breaks_.begin());
154 }
155
156 // finally, calculate counts and frequencies
157 counts_.resize(new_size: bins_);
158 std::fill(first: counts_.begin(), last: counts_.end(), value: 0);
159
160 for (Real p : data_) {
161 bool processed = false;
162 for (Size i=0; i<breaks_.size(); ++i) {
163 if (p < breaks_[i]) {
164 ++counts_[i];
165 processed = true;
166 break;
167 }
168 }
169 if (!processed)
170 ++counts_[bins_-1];
171 }
172
173 frequency_.resize(new_size: bins_);
174
175 Size totalCounts = data_.size();
176 for (Size i=0; i<bins_; ++i)
177 frequency_[i] = static_cast<Real>(counts_[i])/totalCounts;
178 }
179
180}
181

source code of quantlib/ql/math/statistics/histogram.cpp