xgboost
hist_util.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
8 #define XGBOOST_COMMON_HIST_UTIL_H_
9 
10 #include <xgboost/data.h>
11 #include <limits>
12 #include <vector>
13 #include "row_set.h"
14 #include "../tree/param.h"
15 #include "./quantile.h"
16 #include "./timer.h"
17 #include "../include/rabit/rabit.h"
18 
19 namespace xgboost {
20 namespace common {
21 
22 /*
23  * \brief A thin wrapper around dynamically allocated C-style array.
24  * Make sure to call resize() before use.
25  */
26 template<typename T>
27 struct SimpleArray {
29  free(ptr_);
30  ptr_ = nullptr;
31  }
32 
33  void resize(size_t n) {
34  T* ptr = static_cast<T*>(malloc(n*sizeof(T)));
35  memcpy(ptr, ptr_, n_ * sizeof(T));
36  free(ptr_);
37  ptr_ = ptr;
38  n_ = n;
39  }
40 
41  T& operator[](size_t idx) {
42  return ptr_[idx];
43  }
44 
45  T& operator[](size_t idx) const {
46  return ptr_[idx];
47  }
48 
49  size_t size() const {
50  return n_;
51  }
52 
53  T back() const {
54  return ptr_[n_-1];
55  }
56 
57  T* data() {
58  return ptr_;
59  }
60 
61  const T* data() const {
62  return ptr_;
63  }
64 
65 
66  T* begin() {
67  return ptr_;
68  }
69 
70  const T* begin() const {
71  return ptr_;
72  }
73 
74  T* end() {
75  return ptr_ + n_;
76  }
77 
78  const T* end() const {
79  return ptr_ + n_;
80  }
81 
82  private:
83  T* ptr_ = nullptr;
84  size_t n_ = 0;
85 };
86 
87 
88 
89 
91 struct HistCutMatrix {
93  std::vector<uint32_t> row_ptr;
95  std::vector<bst_float> min_val;
97  std::vector<bst_float> cut;
98  uint32_t GetBinIdx(const Entry &e);
99 
101 
102  // create histogram cut matrix given statistics from data
103  // using approximate quantile sketch approach
104  void Init(DMatrix* p_fmat, uint32_t max_num_bins);
105 
106  void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
107 
108  HistCutMatrix();
109  size_t NumBins() const { return row_ptr.back(); }
110 
111  protected:
112  virtual size_t SearchGroupIndFromBaseRow(
113  std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;
114 
116 };
117 
119 void DeviceSketch
120  (const SparsePage& batch, const MetaInfo& info,
121  const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows);
122 
128 
136  std::vector<size_t> row_ptr;
138  std::vector<uint32_t> index;
140  std::vector<size_t> hit_count;
143  // Create a global histogram matrix, given cut
144  void Init(DMatrix* p_fmat, int max_num_bins);
145  // get i-th row
146  inline GHistIndexRow operator[](size_t i) const {
147  return {&index[0] + row_ptr[i],
148  static_cast<GHistIndexRow::index_type>(
149  row_ptr[i + 1] - row_ptr[i])};
150  }
151  inline void GetFeatureCounts(size_t* counts) const {
152  auto nfeature = cut.row_ptr.size() - 1;
153  for (unsigned fid = 0; fid < nfeature; ++fid) {
154  auto ibegin = cut.row_ptr[fid];
155  auto iend = cut.row_ptr[fid + 1];
156  for (auto i = ibegin; i < iend; ++i) {
157  counts[fid] += hit_count[i];
158  }
159  }
160  }
161 
162  private:
163  std::vector<size_t> hit_count_tloc_;
164 };
165 
167  const size_t* row_ptr;
168  const uint32_t* index;
169 
170  inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index)
171  : row_ptr(row_ptr), index(index) {}
172 };
173 
174 class ColumnMatrix;
175 
177  public:
178  void Init(const GHistIndexMatrix& gmat,
179  const ColumnMatrix& colmat,
180  const tree::TrainParam& param);
181 
182  inline GHistIndexBlock operator[](size_t i) const {
183  return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
184  }
185 
186  inline size_t GetNumBlock() const {
187  return blocks_.size();
188  }
189 
190  private:
191  std::vector<size_t> row_ptr_;
192  std::vector<uint32_t> index_;
193  const HistCutMatrix* cut_;
194  struct Block {
195  const size_t* row_ptr_begin;
196  const size_t* row_ptr_end;
197  const uint32_t* index_begin;
198  const uint32_t* index_end;
199  };
200  std::vector<Block> blocks_;
201 };
202 
210 
215  public:
216  // access histogram for i-th node
218  constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
219  CHECK_NE(row_ptr_[nid], kMax);
220  tree::GradStats* ptr =
221  const_cast<tree::GradStats*>(dmlc::BeginPtr(data_) + row_ptr_[nid]);
222  return {ptr, nbins_};
223  }
224 
225  // have we computed a histogram for i-th node?
226  bool RowExists(bst_uint nid) const {
227  const uint32_t k_max = std::numeric_limits<uint32_t>::max();
228  return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
229  }
230 
231  // initialize histogram collection
232  void Init(uint32_t nbins) {
233  nbins_ = nbins;
234  row_ptr_.clear();
235  data_.clear();
236  }
237 
238  // create an empty histogram for i-th node
239  void AddHistRow(bst_uint nid) {
240  constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
241  if (nid >= row_ptr_.size()) {
242  row_ptr_.resize(nid + 1, kMax);
243  }
244  CHECK_EQ(row_ptr_[nid], kMax);
245 
246  row_ptr_[nid] = data_.size();
247  data_.resize(data_.size() + nbins_);
248  }
249 
250  private:
252  uint32_t nbins_;
253 
254  std::vector<tree::GradStats> data_;
255 
257  std::vector<size_t> row_ptr_;
258 };
259 
264  public:
265  // initialize builder
266  inline void Init(size_t nthread, uint32_t nbins) {
267  nthread_ = nthread;
268  nbins_ = nbins;
269  thread_init_.resize(nthread_);
270  }
271 
272  // construct a histogram via histogram aggregation
273  void BuildHist(const std::vector<GradientPair>& gpair,
274  const RowSetCollection::Elem row_indices,
275  const GHistIndexMatrix& gmat,
276  GHistRow hist);
277  // same, with feature grouping
278  void BuildBlockHist(const std::vector<GradientPair>& gpair,
279  const RowSetCollection::Elem row_indices,
280  const GHistIndexBlockMatrix& gmatb,
281  GHistRow hist);
282  // construct a histogram via subtraction trick
283  void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
284 
285  uint32_t GetNumBins() {
286  return nbins_;
287  }
288 
289  private:
291  size_t nthread_;
293  uint32_t nbins_;
294  std::vector<size_t> thread_init_;
295  std::vector<tree::GradStats> data_;
296 };
297 
298 
299 } // namespace common
300 } // namespace xgboost
301 #endif // XGBOOST_COMMON_HIST_UTIL_H_
void Init(uint32_t nbins)
Definition: hist_util.h:232
std::vector< bst_float > cut
the cut field
Definition: hist_util.h:97
size_t NumBins() const
Definition: hist_util.h:109
T * end()
Definition: hist_util.h:74
T back() const
Definition: hist_util.h:53
size_t GetNumBlock() const
Definition: hist_util.h:186
Definition: hist_util.h:166
void AddHistRow(bst_uint nid)
Definition: hist_util.h:239
Meta information about dataset, always sit in memory.
Definition: data.h:40
detail::ptrdiff_t index_type
Definition: span.h:387
util to compute quantiles
The input data structure of xgboost.
Definition: hist_util.h:27
Internal data structured used by XGBoost during training. There are two ways to create a customized D...
Definition: data.h:406
std::vector< uint32_t > index
The index data.
Definition: hist_util.h:138
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:157
const T * begin() const
Definition: hist_util.h:70
void DeviceSketch(const SparsePage &batch, const MetaInfo &info, const tree::TrainParam &param, HistCutMatrix *hmat, int gpu_batch_nrows)
Builds the cut matrix on the GPU.
std::vector< uint32_t > row_ptr
Unit pointer to rows by element position.
Definition: hist_util.h:93
Cut configuration for all the features.
Definition: hist_util.h:91
std::vector< size_t > hit_count
hit count of each index
Definition: hist_util.h:140
~SimpleArray()
Definition: hist_util.h:28
Quantile sketch use WXQSummary.
Definition: quantile.h:839
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:109
T * data()
Definition: hist_util.h:57
builder for histograms of gradient statistics
Definition: hist_util.h:263
GHistIndexRow operator[](size_t i) const
Definition: hist_util.h:146
GHistIndexBlock operator[](size_t i) const
Definition: hist_util.h:182
Quick Utility to compute subset of rows.
size_t size() const
Definition: hist_util.h:49
const size_t * row_ptr
Definition: hist_util.h:167
void resize(size_t n)
Definition: hist_util.h:33
const T * end() const
Definition: hist_util.h:78
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:214
Definition: hist_util.h:176
HistCutMatrix cut
The corresponding cuts.
Definition: hist_util.h:142
GHistRow operator[](bst_uint nid) const
Definition: hist_util.h:217
T * begin()
Definition: hist_util.h:66
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:64
const T * data() const
Definition: hist_util.h:61
namespace of xgboost
Definition: base.h:79
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:23
Monitor monitor_
Definition: hist_util.h:115
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:49
void Init(size_t nthread, uint32_t nbins)
Definition: hist_util.h:266
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:136
Element from a sparse vector.
Definition: data.h:132
T & operator[](size_t idx)
Definition: hist_util.h:41
uint32_t bst_uint
unsigned integer type used in boost, used for feature index and row index.
Definition: base.h:84
std::vector< bst_float > min_val
minimum value of each feature
Definition: hist_util.h:95
uint32_t GetNumBins()
Definition: hist_util.h:285
preprocessed global index matrix, in CSR format Transform floating values to integer index in histogr...
Definition: hist_util.h:134
const uint32_t * index
Definition: hist_util.h:168
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:151
bool RowExists(bst_uint nid) const
Definition: hist_util.h:226
GHistIndexBlock(const size_t *row_ptr, const uint32_t *index)
Definition: hist_util.h:170
T & operator[](size_t idx) const
Definition: hist_util.h:45