| 1 | /* -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
| 2 | |
| 3 | /* |
| 4 | Copyright (C) 2007 Gang Liang |
| 5 | |
| 6 | This file is part of QuantLib, a free-software/open-source library |
| 7 | for financial quantitative analysts and developers - http://quantlib.org/ |
| 8 | |
| 9 | QuantLib is free software: you can redistribute it and/or modify it |
| 10 | under the terms of the QuantLib license. You should have received a |
| 11 | copy of the license along with this program; if not, please email |
| 12 | <quantlib-dev@lists.sf.net>. The license is also available online at |
| 13 | <http://quantlib.org/license.shtml>. |
| 14 | |
| 15 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 16 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 17 | FOR A PARTICULAR PURPOSE. See the license for more details. |
| 18 | */ |
| 19 | |
| 20 | #include <ql/math/statistics/histogram.hpp> |
| 21 | #include <ql/math/statistics/incrementalstatistics.hpp> |
| 22 | #include <ql/math/comparison.hpp> |
| 23 | #include <algorithm> |
| 24 | |
| 25 | namespace QuantLib { |
| 26 | |
| 27 | namespace { |
| 28 | |
| 29 | /* The discontinuous quantiles use the method (type 8) as |
| 30 | recommended by Hyndman and Fan (1996). The resulting |
| 31 | quantile estimates are approximately median-unbiased |
| 32 | regardless of the distribution of 'samples'. |
| 33 | |
| 34 | If quantile function is called multiple times for the same |
| 35 | dataset, it is recommended to pre-sort the sample vector. |
| 36 | */ |
| 37 | Real quantile(const std::vector<Real>& samples, Real prob) { |
| 38 | Size nsample = samples.size(); |
| 39 | QL_REQUIRE(prob >= 0.0 && prob <= 1.0, |
| 40 | "Probability has to be in [0,1]." ); |
| 41 | QL_REQUIRE(nsample > 0, "The sample size has to be positive." ); |
| 42 | |
| 43 | if (nsample == 1) |
| 44 | return samples[0]; |
| 45 | |
| 46 | // two special cases: close to boundaries |
| 47 | const Real a = 1. / 3, b = 2*a / (nsample+a); |
| 48 | if (prob < b) |
| 49 | return *std::min_element(first: samples.begin(), last: samples.end()); |
| 50 | else if (prob > 1-b) |
| 51 | return *std::max_element(first: samples.begin(), last: samples.end()); |
| 52 | |
| 53 | // general situation: middle region and nsample >= 2 |
| 54 | Size index = static_cast<Size>(std::floor(x: (nsample+a)*prob+a)); |
| 55 | std::vector<Real> sorted(index+1); |
| 56 | std::partial_sort_copy(first: samples.begin(), last: samples.end(), |
| 57 | result_first: sorted.begin(), result_last: sorted.end()); |
| 58 | |
| 59 | // use "index & index+1"th elements to interpolate the quantile |
| 60 | Real weight = nsample*prob + a - index; |
| 61 | return (1-weight) * sorted[index-1] + weight * sorted[index]; |
| 62 | } |
| 63 | |
| 64 | } |
| 65 | |
| 66 | |
| 67 | Size Histogram::bins() const { |
| 68 | return bins_; |
| 69 | } |
| 70 | |
| 71 | const std::vector<Real>& Histogram::breaks() const { |
| 72 | return breaks_; |
| 73 | } |
| 74 | |
| 75 | Histogram::Algorithm Histogram::algorithm() const { |
| 76 | return algorithm_; |
| 77 | } |
| 78 | |
| 79 | bool Histogram::empty() const { |
| 80 | return bins_ == 0; |
| 81 | } |
| 82 | |
| 83 | Size Histogram::counts(Size i) const { |
| 84 | #if defined(QL_EXTRA_SAFETY_CHECKS) |
| 85 | return counts_.at(i); |
| 86 | #else |
| 87 | return counts_[i]; |
| 88 | #endif |
| 89 | } |
| 90 | |
| 91 | Real Histogram::frequency(Size i) const { |
| 92 | #if defined(QL_EXTRA_SAFETY_CHECKS) |
| 93 | return frequency_.at(i); |
| 94 | #else |
| 95 | return frequency_[i]; |
| 96 | #endif |
| 97 | } |
| 98 | |
| 99 | void Histogram::calculate() { |
| 100 | QL_REQUIRE(!data_.empty(), "no data given" ); |
| 101 | |
| 102 | Real min = *std::min_element(first: data_.begin(), last: data_.end()); |
| 103 | Real max = *std::max_element(first: data_.begin(), last: data_.end()); |
| 104 | |
| 105 | // calculate number of bins if necessary |
| 106 | if (bins_ == Null<Size>()) { |
| 107 | switch (algorithm_) { |
| 108 | case Sturges: { |
| 109 | bins_ = static_cast<Size>( |
| 110 | std::ceil(x: std::log(x: static_cast<Real>(data_.size())) |
| 111 | /std::log(x: 2.0) + 1)); |
| 112 | break; |
| 113 | } |
| 114 | case FD: { |
| 115 | Real r1 = quantile(samples: data_, prob: 0.25); |
| 116 | Real r2 = quantile(samples: data_, prob: 0.75); |
| 117 | Real h = 2.0 * (r2-r1) * std::pow(x: static_cast<Real>(data_.size()), y: -1.0/3.0); |
| 118 | bins_ = static_cast<Size>(std::ceil(x: (max-min)/h)); |
| 119 | break; |
| 120 | } |
| 121 | case Scott: { |
| 122 | IncrementalStatistics summary; |
| 123 | summary.addSequence(begin: data_.begin(), end: data_.end()); |
| 124 | Real variance = summary.variance(); |
| 125 | Real h = 3.5 * std::sqrt(x: variance) |
| 126 | * std::pow(x: static_cast<Real>(data_.size()), y: -1.0/3.0); |
| 127 | bins_ = static_cast<Size>(std::ceil(x: (max-min)/h)); |
| 128 | break; |
| 129 | } |
| 130 | case None: |
| 131 | QL_FAIL("a bin-partition algorithm is required" ); |
| 132 | default: |
| 133 | QL_FAIL("unknown bin-partition algorithm" ); |
| 134 | }; |
| 135 | bins_ = std::max<Size>(a: bins_,b: 1); |
| 136 | } |
| 137 | |
| 138 | if (breaks_.empty()) { |
| 139 | // set breaks if not provided |
| 140 | breaks_.resize(new_size: bins_-1); |
| 141 | |
| 142 | // ensure breaks_ evenly span over the range of data_ |
| 143 | // TODO: borrow the idea of pretty in R. |
| 144 | Real h = (max-min)/bins_; |
| 145 | for (Size i=0; i<breaks_.size(); ++i) { |
| 146 | breaks_[i] = min + (i+1)*h; |
| 147 | } |
| 148 | } else { |
| 149 | // or ensure they're sorted if given |
| 150 | std::sort(first: breaks_.begin(), last: breaks_.end()); |
| 151 | auto end = std::unique(first: breaks_.begin(), last: breaks_.end(), |
| 152 | binary_pred: static_cast<bool (*)(Real, Real)>(close_enough)); |
| 153 | breaks_.resize(new_size: end - breaks_.begin()); |
| 154 | } |
| 155 | |
| 156 | // finally, calculate counts and frequencies |
| 157 | counts_.resize(new_size: bins_); |
| 158 | std::fill(first: counts_.begin(), last: counts_.end(), value: 0); |
| 159 | |
| 160 | for (Real p : data_) { |
| 161 | bool processed = false; |
| 162 | for (Size i=0; i<breaks_.size(); ++i) { |
| 163 | if (p < breaks_[i]) { |
| 164 | ++counts_[i]; |
| 165 | processed = true; |
| 166 | break; |
| 167 | } |
| 168 | } |
| 169 | if (!processed) |
| 170 | ++counts_[bins_-1]; |
| 171 | } |
| 172 | |
| 173 | frequency_.resize(new_size: bins_); |
| 174 | |
| 175 | Size totalCounts = data_.size(); |
| 176 | for (Size i=0; i<bins_; ++i) |
| 177 | frequency_[i] = static_cast<Real>(counts_[i])/totalCounts; |
| 178 | } |
| 179 | |
| 180 | } |
| 181 | |