xgboost
random.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_RANDOM_H_
8 #define XGBOOST_COMMON_RANDOM_H_
9 
10 #include <rabit/rabit.h>
11 #include <xgboost/logging.h>
12 #include <algorithm>
13 #include <functional>
14 #include <vector>
15 #include <limits>
16 #include <map>
17 #include <memory>
18 #include <numeric>
19 #include <random>
20 #include <utility>
21 
23 #include "common.h"
24 
25 namespace xgboost {
26 namespace common {
30 using RandomEngine = std::mt19937;
31 
32 #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
33 
39 class CustomGlobalRandomEngine {
40  public:
42  using result_type = uint32_t;
44  inline static constexpr result_type min() {
45  return 0;
46  }
48  inline static constexpr result_type max() {
49  return std::numeric_limits<result_type>::max();
50  }
55  void seed(result_type val);
59  result_type operator()();
60 };
61 
65 typedef CustomGlobalRandomEngine GlobalRandomEngine;
66 
67 #else
68 
72 #endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
73 
79 GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
80 
81 /*
82  * Original paper:
83  * Weighted Random Sampling (2005; Efraimidis, Spirakis)
84  *
85  * Blog:
86  * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
87 */
88 template <typename T>
90  std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
91  // ES sampling.
92  CHECK_EQ(array.size(), weights.size());
93  std::vector<float> keys(weights.size());
94  std::uniform_real_distribution<float> dist;
95  auto& rng = GlobalRandom();
96  for (size_t i = 0; i < array.size(); ++i) {
97  auto w = std::max(weights.at(i), kRtEps);
98  auto u = dist(rng);
99  auto k = std::log(u) / w;
100  keys[i] = k;
101  }
102  auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
103  ind.resize(n);
104 
105  std::vector<T> results(ind.size());
106  for (size_t k = 0; k < ind.size(); ++k) {
107  auto idx = ind[k];
108  results[k] = array[idx];
109  }
110  return results;
111 }
112 
121  std::shared_ptr<HostDeviceVector<bst_feature_t>> feature_set_tree_;
122  std::map<int, std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_set_level_;
123  std::vector<float> feature_weights_;
124  float colsample_bylevel_{1.0f};
125  float colsample_bytree_{1.0f};
126  float colsample_bynode_{1.0f};
127  GlobalRandomEngine rng_;
128 
129  public:
130  std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
131  std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample);
136  explicit ColumnSampler(uint32_t seed) {
137  rng_.seed(seed);
138  }
139 
145  uint32_t seed = common::GlobalRandom()();
146  rabit::Broadcast(&seed, sizeof(seed), 0);
147  rng_.seed(seed);
148  }
149 
159  void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
160  float colsample_bylevel, float colsample_bytree) {
161  feature_weights_ = std::move(feature_weights);
162  colsample_bylevel_ = colsample_bylevel;
163  colsample_bytree_ = colsample_bytree;
164  colsample_bynode_ = colsample_bynode;
165 
166  if (feature_set_tree_ == nullptr) {
167  feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
168  }
169  Reset();
170 
171  feature_set_tree_->Resize(num_col);
172  std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0);
173 
174  feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
175  }
176 
180  void Reset() {
181  feature_set_tree_->Resize(0);
182  feature_set_level_.clear();
183  }
184 
196  std::shared_ptr<HostDeviceVector<bst_feature_t>> GetFeatureSet(int depth) {
197  if (colsample_bylevel_ == 1.0f && colsample_bynode_ == 1.0f) {
198  return feature_set_tree_;
199  }
200 
201  if (feature_set_level_.count(depth) == 0) {
202  // Level sampling, level does not yet exist so generate it
203  feature_set_level_[depth] = ColSample(feature_set_tree_, colsample_bylevel_);
204  }
205  if (colsample_bynode_ == 1.0f) {
206  // Level sampling
207  return feature_set_level_[depth];
208  }
209  // Need to sample for the node individually
210  return ColSample(feature_set_level_[depth], colsample_bynode_);
211  }
212 };
213 
214 } // namespace common
215 } // namespace xgboost
216 #endif // XGBOOST_COMMON_RANDOM_H_
xgboost::common::ColumnSampler
Handles selection of columns due to colsample_bytree, colsample_bylevel and colsample_bynode paramete...
Definition: random.h:120
xgboost::common::RandomEngine
std::mt19937 RandomEngine
Define mt19937 as default type Random Engine.
Definition: random.h:30
xgboost::common::ColumnSampler::ColumnSampler
ColumnSampler()
Column sampler constructor.
Definition: random.h:144
xgboost::HostDeviceVector
Definition: host_device_vector.h:86
host_device_vector.h
A device-and-host vector abstraction layer.
xgboost::common::ColumnSampler::ColumnSampler
ColumnSampler(uint32_t seed)
Column sampler constructor.
Definition: random.h:136
xgboost::common::ColumnSampler::Init
void Init(int64_t num_col, std::vector< float > feature_weights, float colsample_bynode, float colsample_bylevel, float colsample_bytree)
Initialise this object before use.
Definition: random.h:159
xgboost::common::ColumnSampler::ColSample
std::shared_ptr< HostDeviceVector< bst_feature_t > > ColSample(std::shared_ptr< HostDeviceVector< bst_feature_t >> p_features, float colsample)
xgboost::common::WeightedSamplingWithoutReplacement
std::vector< T > WeightedSamplingWithoutReplacement(std::vector< T > const &array, std::vector< float > const &weights, size_t n)
Definition: random.h:89
common.h
Common utilities.
xgboost::kRtEps
constexpr bst_float kRtEps
small eps gap for minimum split decision.
Definition: base.h:268
xgboost::common::Span< float >
xgboost::common::ColumnSampler::GetFeatureSet
std::shared_ptr< HostDeviceVector< bst_feature_t > > GetFeatureSet(int depth)
Samples a feature set.
Definition: random.h:196
xgboost::common::ColumnSampler::Reset
void Reset()
Resets this object.
Definition: random.h:180
xgboost::common::GlobalRandomEngine
RandomEngine GlobalRandomEngine
global random engine
Definition: random.h:71
xgboost
namespace of xgboost
Definition: base.h:110
xgboost::common::GlobalRandom
GlobalRandomEngine & GlobalRandom()
global singleton of a random engine. This random engine is thread-local and only visible to current t...