xgboost
hist_util.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
8 #define XGBOOST_COMMON_HIST_UTIL_H_
9 
10 #include <xgboost/data.h>
12 #include <limits>
13 #include <vector>
14 #include <algorithm>
15 #include <memory>
16 #include <utility>
17 #include <map>
18 
19 #include "row_set.h"
20 #include "threading_utils.h"
21 #include "../tree/param.h"
22 #include "./quantile.h"
23 #include "./timer.h"
24 #include "../include/rabit/rabit.h"
25 
26 namespace xgboost {
27 namespace common {
33 
34 // A CSC matrix representing histogram cuts, used in CPU quantile hist.
35 // The cut values represent upper bounds of bins containing approximately equal numbers of elements
37  // Using friends to avoid creating a virtual class, since HistogramCuts is used as value
38  // object in many places.
39  friend class SparseCuts;
40  friend class DenseCuts;
41  friend class CutsBuilder;
42 
43  protected:
44  using BinIdx = uint32_t;
46 
47  public:
50  // storing minimum value in a sketch set.
52 
53  HistogramCuts();
55  cut_values_.Resize(that.cut_values_.Size());
56  cut_ptrs_.Resize(that.cut_ptrs_.Size());
57  min_vals_.Resize(that.min_vals_.Size());
58  cut_values_.Copy(that.cut_values_);
59  cut_ptrs_.Copy(that.cut_ptrs_);
60  min_vals_.Copy(that.min_vals_);
61  }
62 
63  HistogramCuts(HistogramCuts&& that) noexcept(true) {
64  *this = std::forward<HistogramCuts&&>(that);
65  }
66 
68  cut_values_.Resize(that.cut_values_.Size());
69  cut_ptrs_.Resize(that.cut_ptrs_.Size());
70  min_vals_.Resize(that.min_vals_.Size());
71  cut_values_.Copy(that.cut_values_);
72  cut_ptrs_.Copy(that.cut_ptrs_);
73  min_vals_.Copy(that.min_vals_);
74  return *this;
75  }
76 
77  HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
78  monitor_ = std::move(that.monitor_);
79  cut_ptrs_ = std::move(that.cut_ptrs_);
80  cut_values_ = std::move(that.cut_values_);
81  min_vals_ = std::move(that.min_vals_);
82  return *this;
83  }
84 
85  /* \brief Build histogram cuts. */
86  void Build(DMatrix* dmat, uint32_t const max_num_bins);
87  /* \brief How many bins a feature has. */
88  uint32_t FeatureBins(uint32_t feature) const {
89  return cut_ptrs_.ConstHostVector().at(feature + 1) -
90  cut_ptrs_.ConstHostVector()[feature];
91  }
92 
  // Getters. Cuts should be of no use after building histogram indices, but currently
  // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist. So we preserve
  // these for now.
  // CSC column pointers: [Ptrs()[f], Ptrs()[f+1]) delimits feature f's cuts.
  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
  // Bin upper bounds for all features, concatenated feature-by-feature.
  std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
  // Per-feature minimum values recorded during sketching.
  std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }

  // Total number of bins over all features (last CSC pointer entry).
  // NOTE(review): assumes cut_ptrs_ is non-empty, i.e. Build() has run.
  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
101 
102  // Return the index of a cut point that is strictly greater than the input
103  // value, or the last available index if none exists
104  BinIdx SearchBin(float value, uint32_t column_id) const {
105  auto beg = cut_ptrs_.ConstHostVector().at(column_id);
106  auto end = cut_ptrs_.ConstHostVector().at(column_id + 1);
107  const auto &values = cut_values_.ConstHostVector();
108  auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
109  BinIdx idx = it - values.cbegin();
110  if (idx == end) {
111  idx -= 1;
112  }
113  return idx;
114  }
115 
  // Convenience overload: bin-search a sparse entry, using its feature
  // index as the column and its feature value as the lookup key.
  BinIdx SearchBin(Entry const& e) const {
    return SearchBin(e.fvalue, e.index);
  }
119 };
120 
121 /* \brief An interface for building quantile cuts.
122  *
123  * `DenseCuts' always assumes there are `max_bins` for each feature, which makes it not
124  * suitable for sparse dataset. On the other hand `SparseCuts' uses `GetColumnBatches',
125  * which doubles the memory usage, hence can not be applied to dense dataset.
126  */
127 class CutsBuilder {
128  public:
130  /* \brief return whether group for ranking is used. */
131  static bool UseGroup(DMatrix* dmat);
132  static bool UseGroup(MetaInfo const& info);
133 
134  protected:
136 
137  public:
138  explicit CutsBuilder(HistogramCuts* p_cuts) : p_cuts_{p_cuts} {}
139  virtual ~CutsBuilder() = default;
140 
141  static uint32_t SearchGroupIndFromRow(std::vector<bst_uint> const &group_ptr,
142  size_t const base_rowid) {
143  CHECK_LT(base_rowid, group_ptr.back())
144  << "Row: " << base_rowid << " is not found in any group.";
145  auto it =
146  std::upper_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
147  bst_group_t group_ind = it - group_ptr.cbegin() - 1;
148  return group_ind;
149  }
150 
  // Append cut points from one feature's quantile summary to the shared
  // cut-value buffer, keeping only strictly increasing values.
  void AddCutPoint(WQSketch::SummaryContainer const& summary, int max_bin) {
    // Cap the number of candidate cuts at max_bin.  Index 0 is skipped —
    // presumably the sketch minimum, which min_vals_ stores separately
    // (TODO confirm against the sketch implementation).
    size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
    for (size_t i = 1; i < required_cuts; ++i) {
      bst_float cpt = summary.data[i].value;
      // The first candidate (i == 1) is pushed unconditionally: cut_values_
      // may end with the previous feature's cuts, which must not suppress
      // this feature's first cut.  Later candidates are de-duplicated by
      // requiring a strict increase over the last stored value.
      if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) {
        p_cuts_->cut_values_.HostVector().push_back(cpt);
      }
    }
  }
160 
161  /* \brief Build histogram indices. */
162  virtual void Build(DMatrix* dmat, uint32_t const max_num_bins) = 0;
163 };
164 
// Cut builder for sparse datasets: builds per-column cuts in parallel via
// column batches, then concatenates the per-thread results.
class SparseCuts : public CutsBuilder {
  /* \brief Distribute columns to each thread according to number of entries. */
  static std::vector<size_t> LoadBalance(SparsePage const& page, size_t const nthreads);
  // NOTE(review): a `Monitor monitor_;` member declaration appears to have
  // been lost in extraction (monitor_ is used below) — restore from upstream.

 public:
  explicit SparseCuts(HistogramCuts* container) :
      CutsBuilder(container) {
    monitor_.Init(__FUNCTION__);
  }

  /* \brief Concatenate the built cuts in each thread. */
  void Concat(std::vector<std::unique_ptr<SparseCuts>> const& cuts, uint32_t n_cols);
  /* \brief Build histogram indices in single thread. */
  void SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
                         uint32_t max_num_bins,
                         bool const use_group_ind,
                         uint32_t beg, uint32_t end, uint32_t thread_id);
  void Build(DMatrix* dmat, uint32_t const max_num_bins) override;
};
186 
// Cut builder for dense datasets: assumes up to `max_num_bins` cuts per
// feature and builds them from per-feature quantile sketches.
class DenseCuts : public CutsBuilder {
 protected:
  // NOTE(review): a `Monitor monitor_;` member declaration appears to have
  // been lost in extraction (monitor_ is used below) — restore from upstream.

 public:
  explicit DenseCuts(HistogramCuts* container) :
      CutsBuilder(container) {
    monitor_.Init(__FUNCTION__);
  }
  void Init(std::vector<WQSketch>* sketchs, uint32_t max_num_bins, size_t max_rows);
  void Build(DMatrix* p_fmat, uint32_t max_num_bins) override;
};
200 
201 
206 };
207 
208 struct Index {
209  Index() {
210  SetBinTypeSize(binTypeSize_);
211  }
212  Index(const Index& i) = delete;
213  Index& operator=(Index i) = delete;
214  Index(Index&& i) = delete;
215  Index& operator=(Index&& i) = delete;
216  uint32_t operator[](size_t i) const {
217  if (offset_ptr_ != nullptr) {
218  return func_(data_ptr_, i) + offset_ptr_[i%p_];
219  } else {
220  return func_(data_ptr_, i);
221  }
222  }
223  void SetBinTypeSize(BinTypeSize binTypeSize) {
224  binTypeSize_ = binTypeSize;
225  switch (binTypeSize) {
226  case kUint8BinsTypeSize:
227  func_ = &GetValueFromUint8;
228  break;
229  case kUint16BinsTypeSize:
230  func_ = &GetValueFromUint16;
231  break;
232  case kUint32BinsTypeSize:
233  func_ = &GetValueFromUint32;
234  break;
235  default:
236  CHECK(binTypeSize == kUint8BinsTypeSize ||
237  binTypeSize == kUint16BinsTypeSize ||
238  binTypeSize == kUint32BinsTypeSize);
239  }
240  }
242  return binTypeSize_;
243  }
244  template<typename T>
245  T* data() const { // NOLINT
246  return static_cast<T*>(data_ptr_);
247  }
248  uint32_t* Offset() const {
249  return offset_ptr_;
250  }
251  size_t OffsetSize() const {
252  return offset_.size();
253  }
254  size_t Size() const {
255  return data_.size() / (binTypeSize_);
256  }
257  void Resize(const size_t nBytesData) {
258  data_.resize(nBytesData);
259  data_ptr_ = reinterpret_cast<void*>(data_.data());
260  }
261  void ResizeOffset(const size_t nDisps) {
262  offset_.resize(nDisps);
263  offset_ptr_ = offset_.data();
264  p_ = nDisps;
265  }
266  std::vector<uint8_t>::const_iterator begin() const { // NOLINT
267  return data_.begin();
268  }
269  std::vector<uint8_t>::const_iterator end() const { // NOLINT
270  return data_.end();
271  }
272 
273  private:
274  static uint32_t GetValueFromUint8(void *t, size_t i) {
275  return reinterpret_cast<uint8_t*>(t)[i];
276  }
277  static uint32_t GetValueFromUint16(void* t, size_t i) {
278  return reinterpret_cast<uint16_t*>(t)[i];
279  }
280  static uint32_t GetValueFromUint32(void* t, size_t i) {
281  return reinterpret_cast<uint32_t*>(t)[i];
282  }
283 
284  using Func = uint32_t (*)(void*, size_t);
285 
286  std::vector<uint8_t> data_;
287  std::vector<uint32_t> offset_; // size of this field is equal to number of features
288  void* data_ptr_;
289  BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
290  size_t p_ {1};
291  uint32_t* offset_ptr_ {nullptr};
292  Func func_;
293 };
294 
295 
304  std::vector<size_t> row_ptr;
308  std::vector<size_t> hit_count;
312  size_t max_num_bins;
313  // Create a global histogram matrix, given cut
314  void Init(DMatrix* p_fmat, int max_num_bins);
315 
316  template<typename BinIdxType>
317  void SetIndexDataForDense(common::Span<BinIdxType> index_data_span,
318  size_t batch_threads, const SparsePage& batch,
319  size_t rbegin, common::Span<const uint32_t> offsets_span,
320  size_t nbins);
321 
322  // specific method for sparse data as no posibility to reduce allocated memory
323  void SetIndexDataForSparse(common::Span<uint32_t> index_data_span,
324  size_t batch_threads, const SparsePage& batch,
325  size_t rbegin, size_t nbins);
326 
327  void ResizeIndex(const size_t rbegin, const SparsePage& batch,
328  const size_t n_offsets, const size_t n_index,
329  const bool isDense);
330 
331  inline void GetFeatureCounts(size_t* counts) const {
332  auto nfeature = cut.Ptrs().size() - 1;
333  for (unsigned fid = 0; fid < nfeature; ++fid) {
334  auto ibegin = cut.Ptrs()[fid];
335  auto iend = cut.Ptrs()[fid + 1];
336  for (auto i = ibegin; i < iend; ++i) {
337  counts[fid] += hit_count[i];
338  }
339  }
340  }
  // Whether the source data is dense; isDense_ is presumably set by Init()
  // in the .cc implementation — not visible here.
  inline bool IsDense() const {
    return isDense_;
  }
344 
345  private:
346  std::vector<size_t> hit_count_tloc_;
347  bool isDense_;
348 };
349 
  // CSR row-pointer array (borrowed, not owned).
  const size_t* row_ptr;
  // Bin-index array (borrowed, not owned).
  const uint32_t* index;

  inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index)
      : row_ptr(row_ptr), index(index) {}

  // get i-th row
  // Returns a view: {pointer to the row's first bin id, row length}.
  inline GHistIndexRow operator[](size_t i) const {
    return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
  }
};
362 
363 class ColumnMatrix;
364 
 public:
  // Build the block decomposition from a global index matrix plus its
  // column-major view (implemented in the .cc file).
  void Init(const GHistIndexMatrix& gmat,
            const ColumnMatrix& colmat,
            const tree::TrainParam& param);

  // Non-owning view of the i-th feature block.
  inline GHistIndexBlock operator[](size_t i) const {
    return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
  }

  inline size_t GetNumBlock() const {
    return blocks_.size();
  }

 private:
  std::vector<size_t> row_ptr_;
  std::vector<uint32_t> index_;
  const HistogramCuts* cut_;
  // Half-open [begin, end) pointer pairs into row_ptr_ / index_ above.
  struct Block {
    const size_t* row_ptr_begin;
    const size_t* row_ptr_end;
    const uint32_t* index_begin;
    const uint32_t* index_end;
  };
  std::vector<Block> blocks_;
};
391 
392 template<typename GradientSumT>
394 
398 template<typename GradientSumT>
399 void InitilizeHistByZeroes(GHistRow<GradientSumT> hist, size_t begin, size_t end);
400 
404 template<typename GradientSumT>
406  size_t begin, size_t end);
407 
411 template<typename GradientSumT>
413  size_t begin, size_t end);
414 
418 template<typename GradientSumT>
420  const GHistRow<GradientSumT> src2,
421  size_t begin, size_t end);
422 
426 template<typename GradientSumT>
428  public:
431 
432  // access histogram for i-th node
434  constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
435  CHECK_NE(row_ptr_[nid], kMax);
436  GradientPairT* ptr =
437  const_cast<GradientPairT*>(dmlc::BeginPtr(data_) + row_ptr_[nid]);
438  return {ptr, nbins_};
439  }
440 
441  // have we computed a histogram for i-th node?
442  bool RowExists(bst_uint nid) const {
443  const uint32_t k_max = std::numeric_limits<uint32_t>::max();
444  return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
445  }
446 
  // initialize histogram collection
  // Resets the collection for a new bin count; histogram storage is only
  // dropped when nbins actually changes, so repeated Init calls with the
  // same nbins reuse the allocation.
  void Init(uint32_t nbins) {
    if (nbins_ != nbins) {
      nbins_ = nbins;
      // quite expensive operation, so let's do this only once
      data_.clear();
    }
    row_ptr_.clear();
    n_nodes_added_ = 0;
  }
457 
  // create an empty histogram for i-th node
  void AddHistRow(bst_uint nid) {
    // kMax marks a row_ptr_ slot that has no histogram yet.
    constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
    if (nid >= row_ptr_.size()) {
      row_ptr_.resize(nid + 1, kMax);
    }
    // Each node may be registered at most once.
    CHECK_EQ(row_ptr_[nid], kMax);

    // Grow storage to cover at least nid + 1 rows (grow-only).
    if (data_.size() < nbins_ * (nid + 1)) {
      data_.resize(nbins_ * (nid + 1));
    }

    // Rows are laid out in insertion order, not node-id order, so the
    // offset is derived from the running insertion count.
    row_ptr_[nid] = nbins_ * n_nodes_added_;
    n_nodes_added_++;
  }
473 
474  private:
476  uint32_t nbins_ = 0;
478  uint32_t n_nodes_added_ = 0;
479 
480  std::vector<GradientPairT> data_;
481 
483  std::vector<size_t> row_ptr_;
484 };
485 
491 template<typename GradientSumT>
493  public:
495 
  // Lazily (re)initialize the internal histogram buffer; allocation is
  // only redone when the bin count actually changes.
  void Init(size_t nbins) {
    if (nbins != nbins_) {
      hist_buffer_.Init(nbins);
      nbins_ = nbins;
    }
  }
502 
  // Add new elements if needed, mark all hists as unused
  // targeted_hists - already allocated hists which should contain final results after Reduce() call
  void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d& space,
             const std::vector<GHistRowT>& targeted_hists) {
    // Clear all bookkeeping from a previous round before rebuilding it.
    hist_buffer_.Init(nbins_);
    tid_nid_to_hist_.clear();
    hist_memory_.clear();
    threads_to_nids_map_.clear();

    targeted_hists_ = targeted_hists;

    // One externally-allocated result histogram per node is required.
    CHECK_EQ(nodes, targeted_hists.size());

    nodes_ = nodes;
    nthreads_ = nthreads;

    // Order matters: thread->node mapping feeds the allocation step, which
    // feeds the (tid, nid) -> histogram index map.
    MatchThreadsToNodes(space);
    AllocateAdditionalHistograms();
    MatchNodeNidPairToHist();

    // Mark every (thread, node) histogram as not-yet-touched; they are
    // zero-filled lazily in GetInitializedHist().
    hist_was_used_.resize(nthreads * nodes_);
    std::fill(hist_was_used_.begin(), hist_was_used_.end(), static_cast<int>(false));
  }
526 
  // Get specified hist, initialize hist by zeros if it wasn't used before
  GHistRowT GetInitializedHist(size_t tid, size_t nid) {
    CHECK_LT(nid, nodes_);
    CHECK_LT(tid, nthreads_);

    // Look up the histogram assigned to this (thread, node) pair in Reset().
    size_t idx = tid_nid_to_hist_.at({tid, nid});
    GHistRowT hist = hist_memory_[idx];

    // Zero-fill lazily on first use, so untouched histograms cost nothing.
    if (!hist_was_used_[tid * nodes_ + nid]) {
      InitilizeHistByZeroes(hist, 0, hist.size());
      hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
    }

    return hist;
  }
542 
  // Reduce bins [begin, end) for nid-node in dst across threads.
  // NOTE(review): the range is half-open — IncrementHist and
  // InitilizeHistByZeroes are documented as operating on [begin, end);
  // the original "(begin, end]" comment was incorrect.
  void ReduceHist(size_t nid, size_t begin, size_t end) {
    CHECK_GT(end, begin);
    CHECK_LT(nid, nodes_);

    GHistRowT dst = targeted_hists_[nid];

    // Accumulate every thread-local partial histogram for this node.
    bool is_updated = false;
    for (size_t tid = 0; tid < nthreads_; ++tid) {
      if (hist_was_used_[tid * nodes_ + nid]) {
        is_updated = true;
        const size_t idx = tid_nid_to_hist_.at({tid, nid});
        GHistRowT src = hist_memory_[idx];

        // The first thread's histogram may be dst itself (see
        // MatchNodeNidPairToHist); skip self-accumulation in that case.
        if (dst.data() != src.data()) {
          IncrementHist(dst, src, begin, end);
        }
      }
    }
    if (!is_updated) {
      // In distributed mode - some tree nodes can be empty on local machines,
      // So we need just set local hist by zeros in this case
      InitilizeHistByZeroes(dst, begin, end);
    }
  }
568 
569  protected:
570  void MatchThreadsToNodes(const BlockedSpace2d& space) {
571  const size_t space_size = space.Size();
572  const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
573 
574  threads_to_nids_map_.resize(nthreads_ * nodes_, false);
575 
576  for (size_t tid = 0; tid < nthreads_; ++tid) {
577  size_t begin = chunck_size * tid;
578  size_t end = std::min(begin + chunck_size, space_size);
579 
580  if (begin < space_size) {
581  size_t nid_begin = space.GetFirstDimension(begin);
582  size_t nid_end = space.GetFirstDimension(end-1);
583 
584  for (size_t nid = nid_begin; nid <= nid_end; ++nid) {
585  // true - means thread 'tid' will work to compute partial hist for node 'nid'
586  threads_to_nids_map_[tid * nodes_ + nid] = true;
587  }
588  }
589  }
590  }
591 
593  size_t hist_allocated_additionally = 0;
594 
595  for (size_t nid = 0; nid < nodes_; ++nid) {
596  int nthreads_for_nid = 0;
597 
598  for (size_t tid = 0; tid < nthreads_; ++tid) {
599  if (threads_to_nids_map_[tid * nodes_ + nid]) {
600  nthreads_for_nid++;
601  }
602  }
603 
604  // In distributed mode - some tree nodes can be empty on local machines,
605  // set nthreads_for_nid to 0 in this case.
606  // In another case - allocate additional (nthreads_for_nid - 1) histograms,
607  // because one is already allocated externally (will store final result for the node).
608  hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
609  }
610 
611  for (size_t i = 0; i < hist_allocated_additionally; ++i) {
612  hist_buffer_.AddHistRow(i);
613  }
614  }
615 
617  size_t hist_total = 0;
618  size_t hist_allocated_additionally = 0;
619 
620  for (size_t nid = 0; nid < nodes_; ++nid) {
621  bool first_hist = true;
622  for (size_t tid = 0; tid < nthreads_; ++tid) {
623  if (threads_to_nids_map_[tid * nodes_ + nid]) {
624  if (first_hist) {
625  hist_memory_.push_back(targeted_hists_[nid]);
626  first_hist = false;
627  } else {
628  hist_memory_.push_back(hist_buffer_[hist_allocated_additionally]);
629  hist_allocated_additionally++;
630  }
631  // map pair {tid, nid} to index of allocated histogram from hist_memory_
632  tid_nid_to_hist_[{tid, nid}] = hist_total++;
633  CHECK_EQ(hist_total, hist_memory_.size());
634  }
635  }
636  }
637  }
638 
640  size_t nbins_ = 0;
642  size_t nthreads_ = 0;
644  size_t nodes_ = 0;
652  std::vector<int> hist_was_used_;
653 
655  std::vector<bool> threads_to_nids_map_;
657  std::vector<GHistRowT> targeted_hists_;
659  std::vector<GHistRowT> hist_memory_;
661  std::map<std::pair<size_t, size_t>, size_t> tid_nid_to_hist_;
662 };
663 
667 template<typename GradientSumT>
669  public:
671 
  GHistBuilder() = default;
  GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}

  // construct a histogram via histogram aggregation
  void BuildHist(const std::vector<GradientPair>& gpair,
                 const RowSetCollection::Elem row_indices,
                 const GHistIndexMatrix& gmat,
                 GHistRowT hist,
                 bool isDense);
  // same, with feature grouping
  void BuildBlockHist(const std::vector<GradientPair>& gpair,
                      const RowSetCollection::Elem row_indices,
                      const GHistIndexBlockMatrix& gmatb,
                      GHistRowT hist);
  // construct a histogram via subtraction trick
  void SubtractionTrick(GHistRowT self,
                        GHistRowT sibling,
                        GHistRowT parent);

  // Total number of bins over all features, as passed at construction.
  uint32_t GetNumBins() const {
    return nbins_;
  }

 private:
  // number of threads used for histogram construction
  size_t nthread_ { 0 };
  // number of bins over all features
  uint32_t nbins_ { 0 };
};
701 
702 
703 } // namespace common
704 } // namespace xgboost
705 #endif // XGBOOST_COMMON_HIST_UTIL_H_
DenseCuts(HistogramCuts *container)
Definition: hist_util.h:193
Definition: hist_util.h:208
void SetBinTypeSize(BinTypeSize binTypeSize)
Definition: hist_util.h:223
Index index
The index data.
Definition: hist_util.h:306
float bst_float
float type, used for storing statistics
Definition: base.h:111
BinIdx SearchBin(float value, uint32_t column_id) const
Definition: hist_util.h:104
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
void Copy(const HostDeviceVector< T > &other)
HistCollection< GradientSumT > hist_buffer_
Buffer for additional histograms for Parallel processing.
Definition: hist_util.h:646
size_t GetNumBlock() const
Definition: hist_util.h:375
uint32_t FeatureBins(uint32_t feature) const
Definition: hist_util.h:88
Definition: hist_util.h:350
BinIdx SearchBin(Entry const &e) const
Definition: hist_util.h:116
static uint32_t SearchGroupIndFromRow(std::vector< bst_uint > const &group_ptr, size_t const base_rowid)
Definition: hist_util.h:141
void MatchNodeNidPairToHist()
Definition: hist_util.h:616
Meta information about dataset, always sit in memory.
Definition: data.h:45
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
void SubtractionHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src1, const GHistRow< GradientSumT > src2, size_t begin, size_t end)
Compute Subtraction: dst = src1 - src2 in range [begin, end)
util to compute quantiles
void CopyHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src, size_t begin, size_t end)
Copy hist from src to dst in range [begin, end)
Definition: hist_util.h:203
The input data structure of xgboost.
T * data() const
Definition: hist_util.h:245
HistogramCuts & operator=(HistogramCuts const &that)
Definition: hist_util.h:67
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:310
void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d &space, const std::vector< GHistRowT > &targeted_hists)
Definition: hist_util.h:505
Internal data structured used by XGBoost during training.
Definition: data.h:464
size_t GetFirstDimension(size_t i) const
Definition: threading_utils.h:89
Cut configuration for dense dataset.
Definition: hist_util.h:188
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:245
bool IsDense() const
Definition: hist_util.h:341
HistogramCuts & operator=(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:77
Definition: hist_util.h:127
void Init(uint32_t nbins)
Definition: hist_util.h:448
std::vector< size_t > hit_count
hit count of each index
Definition: hist_util.h:308
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:126
Definition: hist_util.h:204
builder for histograms of gradient statistics
Definition: hist_util.h:668
size_t OffsetSize() const
Definition: hist_util.h:251
void AllocateAdditionalHistograms()
Definition: hist_util.h:592
Implementation of gradient statistics pair. Template specialisation may be used to overload different...
Definition: base.h:132
std::vector< float > const & MinValues() const
Definition: hist_util.h:98
HostDeviceVector< bst_float > cut_values_
Definition: hist_util.h:48
GHistIndexBlock operator[](size_t i) const
Definition: hist_util.h:371
Quantile sketch use WQSummary.
Definition: quantile.h:672
XGBOOST_DEVICE constexpr pointer data() const __span_noexcept
Definition: span.h:526
Quick Utility to compute subset of rows.
HistogramCuts(HistogramCuts const &that)
Definition: hist_util.h:54
void Init(std::string label)
Definition: timer.h:82
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:241
size_t TotalBins() const
Definition: hist_util.h:100
HistogramCuts(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:63
std::vector< uint8_t >::const_iterator begin() const
Definition: hist_util.h:266
std::vector< uint8_t >::const_iterator end() const
Definition: hist_util.h:269
Cut configuration for sparse dataset.
Definition: hist_util.h:166
const size_t * row_ptr
Definition: hist_util.h:351
size_t Size() const
Definition: threading_utils.h:84
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:427
uint32_t bst_group_t
Type for ranking group index.
Definition: base.h:125
std::vector< bool > threads_to_nids_map_
Buffer for additional histograms for Parallel processing.
Definition: hist_util.h:655
Definition: hist_util.h:365
HistogramCuts * p_cuts_
Definition: hist_util.h:135
void AddHistRow(bst_uint nid)
Definition: hist_util.h:459
common::Monitor monitor_
Definition: hist_util.h:45
std::vector< T > & HostVector()
void MatchThreadsToNodes(const BlockedSpace2d &space)
Definition: hist_util.h:570
void Build(DMatrix *dmat, uint32_t const max_num_bins)
void Resize(const size_t nBytesData)
Definition: hist_util.h:257
GHistRowT operator[](bst_uint nid) const
Definition: hist_util.h:433
uint32_t operator[](size_t i) const
Definition: hist_util.h:216
Index()
Definition: hist_util.h:209
void IncrementHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > add, size_t begin, size_t end)
Increment hist as dst += add in range [begin, end)
GHistIndexRow operator[](size_t i) const
Definition: hist_util.h:358
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:101
HostDeviceVector< float > min_vals_
Definition: hist_util.h:51
BinTypeSize
Definition: hist_util.h:202
SparseCuts(HistogramCuts *container)
Definition: hist_util.h:172
GHistBuilder(size_t nthread, uint32_t nbins)
Definition: hist_util.h:673
void Init(size_t nbins)
Definition: hist_util.h:496
uint32_t BinIdx
Definition: hist_util.h:44
namespace of xgboost
Definition: base.h:102
size_t Size() const
Definition: hist_util.h:254
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:24
uint32_t * Offset() const
Definition: hist_util.h:248
const std::vector< T > & ConstHostVector() const
bool RowExists(bst_uint nid) const
Definition: hist_util.h:442
Definition: threading_utils.h:54
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:47
size_t max_num_bins
Definition: hist_util.h:312
uint32_t GetNumBins() const
Definition: hist_util.h:691
CutsBuilder(HistogramCuts *p_cuts)
Definition: hist_util.h:138
void AddCutPoint(WQSketch::SummaryContainer const &summary, int max_bin)
Definition: hist_util.h:151
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:304
Stores temporary histograms to compute them in parallel Supports processing multiple tree-nodes for n...
Definition: hist_util.h:492
Element from a sparse vector.
Definition: data.h:201
std::vector< GHistRowT > hist_memory_
Allocated memory for histograms used for construction.
Definition: hist_util.h:659
std::map< std::pair< size_t, size_t >, size_t > tid_nid_to_hist_
map pair {tid, nid} to index of allocated histogram from hist_memory_
Definition: hist_util.h:661
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Monitor monitor_
Definition: hist_util.h:190
HostDeviceVector< uint32_t > cut_ptrs_
Definition: hist_util.h:49
preprocessed global index matrix, in CSR format
Definition: hist_util.h:302
std::vector< float > const & Values() const
Definition: hist_util.h:97
bst_feature_t index
feature index
Definition: data.h:203
void InitilizeHistByZeroes(GHistRow< GradientSumT > hist, size_t begin, size_t end)
fill a histogram by zeros
std::vector< int > hist_was_used_
Marks which hists were used, it means that they should be merged. Contains only {true or false} value...
Definition: hist_util.h:652
bst_float fvalue
feature value
Definition: data.h:205
const uint32_t * index
Definition: hist_util.h:352
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:331
GHistRowT GetInitializedHist(size_t tid, size_t nid)
Definition: hist_util.h:528
void Resize(size_t new_size, T v=T())
std::vector< GHistRowT > targeted_hists_
Contains histograms for final results.
Definition: hist_util.h:657
void ReduceHist(size_t nid, size_t begin, size_t end)
Definition: hist_util.h:544
void ResizeOffset(const size_t nDisps)
Definition: hist_util.h:261
Definition: hist_util.h:205
GHistIndexBlock(const size_t *row_ptr, const uint32_t *index)
Definition: hist_util.h:354
Definition: hist_util.h:36
DMatrix * p_fmat
Definition: hist_util.h:311