xgboost
hist_util.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
8 #define XGBOOST_COMMON_HIST_UTIL_H_
9 
10 #include <xgboost/data.h>
12 #include <limits>
13 #include <vector>
14 #include <algorithm>
15 #include <memory>
16 #include <utility>
17 #include <map>
18 
19 #include "row_set.h"
20 #include "threading_utils.h"
21 #include "../tree/param.h"
22 #include "./quantile.h"
23 #include "./timer.h"
24 #include "../include/rabit/rabit.h"
25 
26 namespace xgboost {
27 namespace common {
33 
34 // A CSC matrix representing histogram cuts, used in CPU quantile hist.
35 // The cut values represent upper bounds of bins containing approximately equal numbers of elements
37  // Using friends to avoid creating a virtual class, since HistogramCuts is used as value
38  // object in many places.
39  friend class SparseCuts;
40  friend class DenseCuts;
41  friend class CutsBuilder;
42 
43  protected:
44  using BinIdx = uint32_t;
46 
47  public:
50  // storing minimum value in a sketch set.
52 
53  HistogramCuts();
55  cut_values_.Resize(that.cut_values_.Size());
56  cut_ptrs_.Resize(that.cut_ptrs_.Size());
57  min_vals_.Resize(that.min_vals_.Size());
58  cut_values_.Copy(that.cut_values_);
59  cut_ptrs_.Copy(that.cut_ptrs_);
60  min_vals_.Copy(that.min_vals_);
61  }
62 
63  HistogramCuts(HistogramCuts&& that) noexcept(true) {
64  *this = std::forward<HistogramCuts&&>(that);
65  }
66 
68  cut_values_.Resize(that.cut_values_.Size());
69  cut_ptrs_.Resize(that.cut_ptrs_.Size());
70  min_vals_.Resize(that.min_vals_.Size());
71  cut_values_.Copy(that.cut_values_);
72  cut_ptrs_.Copy(that.cut_ptrs_);
73  min_vals_.Copy(that.min_vals_);
74  return *this;
75  }
76 
77  HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
78  monitor_ = std::move(that.monitor_);
79  cut_ptrs_ = std::move(that.cut_ptrs_);
80  cut_values_ = std::move(that.cut_values_);
81  min_vals_ = std::move(that.min_vals_);
82  return *this;
83  }
84 
85  /* \brief Build histogram cuts. */
86  void Build(DMatrix* dmat, uint32_t const max_num_bins);
87  /* \brief How many bins a feature has. */
88  uint32_t FeatureBins(uint32_t feature) const {
89  return cut_ptrs_.ConstHostVector().at(feature + 1) -
90  cut_ptrs_.ConstHostVector()[feature];
91  }
92 
93  // Getters. Cuts should be of no use after building histogram indices, but currently
94  // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist. So we preserve
95  // these for now.
96  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
97  std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
98  std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
99 
100  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
101 
102  // Return the index of a cut point that is strictly greater than the input
103  // value, or the last available index if none exists
104  BinIdx SearchBin(float value, uint32_t column_id) const {
105  auto beg = cut_ptrs_.ConstHostVector().at(column_id);
106  auto end = cut_ptrs_.ConstHostVector().at(column_id + 1);
107  const auto &values = cut_values_.ConstHostVector();
108  auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
109  BinIdx idx = it - values.cbegin();
110  if (idx == end) {
111  idx -= 1;
112  }
113  return idx;
114  }
115 
116  BinIdx SearchBin(Entry const& e) const {
117  return SearchBin(e.fvalue, e.index);
118  }
119 };
120 
121 /* \brief An interface for building quantile cuts.
122  *
123  * `DenseCuts' always assumes there are `max_bins` for each feature, which makes it not
124  * suitable for sparse dataset. On the other hand `SparseCuts' uses `GetColumnBatches',
125  * which doubles the memory usage, hence can not be applied to dense dataset.
126  */
127 class CutsBuilder {
128  public:
130  /* \brief return whether group for ranking is used. */
131  static bool UseGroup(DMatrix* dmat);
132 
133  protected:
135 
136  public:
137  explicit CutsBuilder(HistogramCuts* p_cuts) : p_cuts_{p_cuts} {}
138  virtual ~CutsBuilder() = default;
139 
140  static uint32_t SearchGroupIndFromRow(
141  std::vector<bst_uint> const& group_ptr, size_t const base_rowid) {
142  using KIt = std::vector<bst_uint>::const_iterator;
143  KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
144  // Cannot use CHECK_NE because it will try to print the iterator.
145  bool const found = res != group_ptr.cend() - 1;
146  if (!found) {
147  LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!";
148  }
149  uint32_t group_ind = std::distance(group_ptr.cbegin(), res);
150  return group_ind;
151  }
152 
153  void AddCutPoint(WQSketch::SummaryContainer const& summary, int max_bin) {
154  size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
155  for (size_t i = 1; i < required_cuts; ++i) {
156  bst_float cpt = summary.data[i].value;
157  if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) {
158  p_cuts_->cut_values_.HostVector().push_back(cpt);
159  }
160  }
161  }
162 
163  /* \brief Build histogram indices. */
164  virtual void Build(DMatrix* dmat, uint32_t const max_num_bins) = 0;
165 };
166 
168 class SparseCuts : public CutsBuilder {
169  /* \brief Distrbute columns to each thread according to number of entries. */
170  static std::vector<size_t> LoadBalance(SparsePage const& page, size_t const nthreads);
172 
173  public:
174  explicit SparseCuts(HistogramCuts* container) :
175  CutsBuilder(container) {
176  monitor_.Init(__FUNCTION__);
177  }
178 
179  /* \brief Concatonate the built cuts in each thread. */
180  void Concat(std::vector<std::unique_ptr<SparseCuts>> const& cuts, uint32_t n_cols);
181  /* \brief Build histogram indices in single thread. */
182  void SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
183  uint32_t max_num_bins,
184  bool const use_group_ind,
185  uint32_t beg, uint32_t end, uint32_t thread_id);
186  void Build(DMatrix* dmat, uint32_t const max_num_bins) override;
187 };
188 
190 class DenseCuts : public CutsBuilder {
191  protected:
193 
194  public:
195  explicit DenseCuts(HistogramCuts* container) :
196  CutsBuilder(container) {
197  monitor_.Init(__FUNCTION__);
198  }
199  void Init(std::vector<WQSketch>* sketchs, uint32_t max_num_bins, size_t max_rows);
200  void Build(DMatrix* p_fmat, uint32_t max_num_bins) override;
201 };
202 
203 // sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
204 HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
205  size_t sketch_batch_num_elements = 0);
206 
207 // sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
208 template <typename AdapterT>
209 HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
210  float missing,
211  size_t sketch_batch_num_elements = 0);
212 
213 
218 };
219 
220 struct Index {
221  Index() {
222  SetBinTypeSize(binTypeSize_);
223  }
224  Index(const Index& i) = delete;
225  Index& operator=(Index i) = delete;
226  Index(Index&& i) = delete;
227  Index& operator=(Index&& i) = delete;
228  uint32_t operator[](size_t i) const {
229  if (offset_ptr_ != nullptr) {
230  return func_(data_ptr_, i) + offset_ptr_[i%p_];
231  } else {
232  return func_(data_ptr_, i);
233  }
234  }
235  void SetBinTypeSize(BinTypeSize binTypeSize) {
236  binTypeSize_ = binTypeSize;
237  switch (binTypeSize) {
238  case kUint8BinsTypeSize:
239  func_ = &GetValueFromUint8;
240  break;
241  case kUint16BinsTypeSize:
242  func_ = &GetValueFromUint16;
243  break;
244  case kUint32BinsTypeSize:
245  func_ = &GetValueFromUint32;
246  break;
247  default:
248  CHECK(binTypeSize == kUint8BinsTypeSize ||
249  binTypeSize == kUint16BinsTypeSize ||
250  binTypeSize == kUint32BinsTypeSize);
251  }
252  }
254  return binTypeSize_;
255  }
256  template<typename T>
257  T* data() const { // NOLINT
258  return static_cast<T*>(data_ptr_);
259  }
260  uint32_t* Offset() const {
261  return offset_ptr_;
262  }
263  size_t OffsetSize() const {
264  return offset_.size();
265  }
266  size_t Size() const {
267  return data_.size() / (binTypeSize_);
268  }
269  void Resize(const size_t nBytesData) {
270  data_.resize(nBytesData);
271  data_ptr_ = reinterpret_cast<void*>(data_.data());
272  }
273  void ResizeOffset(const size_t nDisps) {
274  offset_.resize(nDisps);
275  offset_ptr_ = offset_.data();
276  p_ = nDisps;
277  }
278  std::vector<uint8_t>::const_iterator begin() const { // NOLINT
279  return data_.begin();
280  }
281  std::vector<uint8_t>::const_iterator end() const { // NOLINT
282  return data_.end();
283  }
284 
285  private:
286  static uint32_t GetValueFromUint8(void *t, size_t i) {
287  return reinterpret_cast<uint8_t*>(t)[i];
288  }
289  static uint32_t GetValueFromUint16(void* t, size_t i) {
290  return reinterpret_cast<uint16_t*>(t)[i];
291  }
292  static uint32_t GetValueFromUint32(void* t, size_t i) {
293  return reinterpret_cast<uint32_t*>(t)[i];
294  }
295 
296  using Func = uint32_t (*)(void*, size_t);
297 
298  std::vector<uint8_t> data_;
299  std::vector<uint32_t> offset_; // size of this field is equal to number of features
300  void* data_ptr_;
301  BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
302  size_t p_ {1};
303  uint32_t* offset_ptr_ {nullptr};
304  Func func_;
305 };
306 
307 
316  std::vector<size_t> row_ptr;
320  std::vector<size_t> hit_count;
324  size_t max_num_bins;
325  // Create a global histogram matrix, given cut
326  void Init(DMatrix* p_fmat, int max_num_bins);
327 
328  template<typename BinIdxType>
329  void SetIndexDataForDense(common::Span<BinIdxType> index_data_span,
330  size_t batch_threads, const SparsePage& batch,
331  size_t rbegin, common::Span<const uint32_t> offsets_span,
332  size_t nbins);
333 
334  // specific method for sparse data as no posibility to reduce allocated memory
335  void SetIndexDataForSparse(common::Span<uint32_t> index_data_span,
336  size_t batch_threads, const SparsePage& batch,
337  size_t rbegin, size_t nbins);
338 
339  void ResizeIndex(const size_t rbegin, const SparsePage& batch,
340  const size_t n_offsets, const size_t n_index,
341  const bool isDense);
342 
343  inline void GetFeatureCounts(size_t* counts) const {
344  auto nfeature = cut.Ptrs().size() - 1;
345  for (unsigned fid = 0; fid < nfeature; ++fid) {
346  auto ibegin = cut.Ptrs()[fid];
347  auto iend = cut.Ptrs()[fid + 1];
348  for (auto i = ibegin; i < iend; ++i) {
349  counts[fid] += hit_count[i];
350  }
351  }
352  }
353  inline bool IsDense() const {
354  return isDense_;
355  }
356 
357  private:
358  std::vector<size_t> hit_count_tloc_;
359  bool isDense_;
360 };
361 
363  const size_t* row_ptr;
364  const uint32_t* index;
365 
366  inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index)
367  : row_ptr(row_ptr), index(index) {}
368 
369  // get i-th row
370  inline GHistIndexRow operator[](size_t i) const {
371  return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
372  }
373 };
374 
375 class ColumnMatrix;
376 
378  public:
379  void Init(const GHistIndexMatrix& gmat,
380  const ColumnMatrix& colmat,
381  const tree::TrainParam& param);
382 
383  inline GHistIndexBlock operator[](size_t i) const {
384  return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
385  }
386 
387  inline size_t GetNumBlock() const {
388  return blocks_.size();
389  }
390 
391  private:
392  std::vector<size_t> row_ptr_;
393  std::vector<uint32_t> index_;
394  const HistogramCuts* cut_;
395  struct Block {
396  const size_t* row_ptr_begin;
397  const size_t* row_ptr_end;
398  const uint32_t* index_begin;
399  const uint32_t* index_end;
400  };
401  std::vector<Block> blocks_;
402 };
403 
411 
415 void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
416 
420 void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end);
421 
425 void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end);
426 
430 void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
431  size_t begin, size_t end);
432 
437  public:
438  // access histogram for i-th node
440  constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
441  CHECK_NE(row_ptr_[nid], kMax);
442  tree::GradStats* ptr =
443  const_cast<tree::GradStats*>(dmlc::BeginPtr(data_) + row_ptr_[nid]);
444  return {ptr, nbins_};
445  }
446 
447  // have we computed a histogram for i-th node?
448  bool RowExists(bst_uint nid) const {
449  const uint32_t k_max = std::numeric_limits<uint32_t>::max();
450  return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
451  }
452 
453  // initialize histogram collection
454  void Init(uint32_t nbins) {
455  if (nbins_ != nbins) {
456  nbins_ = nbins;
457  // quite expensive operation, so let's do this only once
458  data_.clear();
459  }
460  row_ptr_.clear();
461  n_nodes_added_ = 0;
462  }
463 
464  // create an empty histogram for i-th node
465  void AddHistRow(bst_uint nid) {
466  constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
467  if (nid >= row_ptr_.size()) {
468  row_ptr_.resize(nid + 1, kMax);
469  }
470  CHECK_EQ(row_ptr_[nid], kMax);
471 
472  if (data_.size() < nbins_ * (nid + 1)) {
473  data_.resize(nbins_ * (nid + 1));
474  }
475 
476  row_ptr_[nid] = nbins_ * n_nodes_added_;
477  n_nodes_added_++;
478  }
479 
480  private:
482  uint32_t nbins_ = 0;
484  uint32_t n_nodes_added_ = 0;
485 
486  std::vector<tree::GradStats> data_;
487 
489  std::vector<size_t> row_ptr_;
490 };
491 
498  public:
499  void Init(size_t nbins) {
500  if (nbins != nbins_) {
501  hist_buffer_.Init(nbins);
502  nbins_ = nbins;
503  }
504  }
505 
506  // Add new elements if needed, mark all hists as unused
507  // targeted_hists - already allocated hists which should contain final results after Reduce() call
508  void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d& space,
509  const std::vector<GHistRow>& targeted_hists) {
510  hist_buffer_.Init(nbins_);
511  tid_nid_to_hist_.clear();
512  hist_memory_.clear();
513  threads_to_nids_map_.clear();
514 
515  targeted_hists_ = targeted_hists;
516 
517  CHECK_EQ(nodes, targeted_hists.size());
518 
519  nodes_ = nodes;
520  nthreads_ = nthreads;
521 
522  MatchThreadsToNodes(space);
523  AllocateAdditionalHistograms();
524  MatchNodeNidPairToHist();
525 
526  hist_was_used_.resize(nthreads * nodes_);
527  std::fill(hist_was_used_.begin(), hist_was_used_.end(), static_cast<int>(false));
528  }
529 
530  // Get specified hist, initialize hist by zeros if it wasn't used before
531  GHistRow GetInitializedHist(size_t tid, size_t nid) {
532  CHECK_LT(nid, nodes_);
533  CHECK_LT(tid, nthreads_);
534 
535  size_t idx = tid_nid_to_hist_.at({tid, nid});
536  GHistRow hist = hist_memory_[idx];
537 
538  if (!hist_was_used_[tid * nodes_ + nid]) {
539  InitilizeHistByZeroes(hist, 0, hist.size());
540  hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
541  }
542 
543  return hist;
544  }
545 
546  // Reduce following bins (begin, end] for nid-node in dst across threads
547  void ReduceHist(size_t nid, size_t begin, size_t end) {
548  CHECK_GT(end, begin);
549  CHECK_LT(nid, nodes_);
550 
551  GHistRow dst = targeted_hists_[nid];
552 
553  bool is_updated = false;
554  for (size_t tid = 0; tid < nthreads_; ++tid) {
555  if (hist_was_used_[tid * nodes_ + nid]) {
556  is_updated = true;
557  const size_t idx = tid_nid_to_hist_.at({tid, nid});
558  GHistRow src = hist_memory_[idx];
559 
560  if (dst.data() != src.data()) {
561  IncrementHist(dst, src, begin, end);
562  }
563  }
564  }
565  if (!is_updated) {
566  // In distributed mode - some tree nodes can be empty on local machines,
567  // So we need just set local hist by zeros in this case
568  InitilizeHistByZeroes(dst, begin, end);
569  }
570  }
571 
572  protected:
573  void MatchThreadsToNodes(const BlockedSpace2d& space) {
574  const size_t space_size = space.Size();
575  const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
576 
577  threads_to_nids_map_.resize(nthreads_ * nodes_, false);
578 
579  for (size_t tid = 0; tid < nthreads_; ++tid) {
580  size_t begin = chunck_size * tid;
581  size_t end = std::min(begin + chunck_size, space_size);
582 
583  if (begin < space_size) {
584  size_t nid_begin = space.GetFirstDimension(begin);
585  size_t nid_end = space.GetFirstDimension(end-1);
586 
587  for (size_t nid = nid_begin; nid <= nid_end; ++nid) {
588  // true - means thread 'tid' will work to compute partial hist for node 'nid'
589  threads_to_nids_map_[tid * nodes_ + nid] = true;
590  }
591  }
592  }
593  }
594 
596  size_t hist_allocated_additionally = 0;
597 
598  for (size_t nid = 0; nid < nodes_; ++nid) {
599  int nthreads_for_nid = 0;
600 
601  for (size_t tid = 0; tid < nthreads_; ++tid) {
602  if (threads_to_nids_map_[tid * nodes_ + nid]) {
603  nthreads_for_nid++;
604  }
605  }
606 
607  // In distributed mode - some tree nodes can be empty on local machines,
608  // set nthreads_for_nid to 0 in this case.
609  // In another case - allocate additional (nthreads_for_nid - 1) histograms,
610  // because one is already allocated externally (will store final result for the node).
611  hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
612  }
613 
614  for (size_t i = 0; i < hist_allocated_additionally; ++i) {
615  hist_buffer_.AddHistRow(i);
616  }
617  }
618 
620  size_t hist_total = 0;
621  size_t hist_allocated_additionally = 0;
622 
623  for (size_t nid = 0; nid < nodes_; ++nid) {
624  bool first_hist = true;
625  for (size_t tid = 0; tid < nthreads_; ++tid) {
626  if (threads_to_nids_map_[tid * nodes_ + nid]) {
627  if (first_hist) {
628  hist_memory_.push_back(targeted_hists_[nid]);
629  first_hist = false;
630  } else {
631  hist_memory_.push_back(hist_buffer_[hist_allocated_additionally]);
632  hist_allocated_additionally++;
633  }
634  // map pair {tid, nid} to index of allocated histogram from hist_memory_
635  tid_nid_to_hist_[{tid, nid}] = hist_total++;
636  CHECK_EQ(hist_total, hist_memory_.size());
637  }
638  }
639  }
640  }
641 
643  size_t nbins_ = 0;
645  size_t nthreads_ = 0;
647  size_t nodes_ = 0;
655  std::vector<int> hist_was_used_;
656 
658  std::vector<bool> threads_to_nids_map_;
660  std::vector<GHistRow> targeted_hists_;
662  std::vector<GHistRow> hist_memory_;
664  std::map<std::pair<size_t, size_t>, size_t> tid_nid_to_hist_;
665 };
666 
671  public:
672  GHistBuilder() = default;
673  GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
674 
675  // construct a histogram via histogram aggregation
676  void BuildHist(const std::vector<GradientPair>& gpair,
677  const RowSetCollection::Elem row_indices,
678  const GHistIndexMatrix& gmat,
679  GHistRow hist,
680  bool isDense);
681  // same, with feature grouping
682  void BuildBlockHist(const std::vector<GradientPair>& gpair,
683  const RowSetCollection::Elem row_indices,
684  const GHistIndexBlockMatrix& gmatb,
685  GHistRow hist);
686  // construct a histogram via subtraction trick
687  void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
688 
689  uint32_t GetNumBins() const {
690  return nbins_;
691  }
692 
693  private:
695  size_t nthread_ { 0 };
697  uint32_t nbins_ { 0 };
698 };
699 
700 
701 } // namespace common
702 } // namespace xgboost
703 #endif // XGBOOST_COMMON_HIST_UTIL_H_
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end)
fill a histogram by zeros
void Init(uint32_t nbins)
Definition: hist_util.h:454
void ReduceHist(size_t nid, size_t begin, size_t end)
Definition: hist_util.h:547
DenseCuts(HistogramCuts *container)
Definition: hist_util.h:195
Definition: hist_util.h:220
void SetBinTypeSize(BinTypeSize binTypeSize)
Definition: hist_util.h:235
Index index
The index data.
Definition: hist_util.h:318
float bst_float
float type, used for storing statistics
Definition: base.h:111
BinIdx SearchBin(float value, uint32_t column_id) const
Definition: hist_util.h:104
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
void Copy(const HostDeviceVector< T > &other)
HistCollection hist_buffer_
Buffer for additional histograms for Parallel processing.
Definition: hist_util.h:649
size_t GetNumBlock() const
Definition: hist_util.h:387
void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end)
Increment hist as dst += add in range [begin, end)
uint32_t FeatureBins(uint32_t feature) const
Definition: hist_util.h:88
Definition: hist_util.h:362
BinIdx SearchBin(Entry const &e) const
Definition: hist_util.h:116
static uint32_t SearchGroupIndFromRow(std::vector< bst_uint > const &group_ptr, size_t const base_rowid)
Definition: hist_util.h:140
void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end)
Copy hist from src to dst in range [begin, end)
void AddHistRow(bst_uint nid)
Definition: hist_util.h:465
Meta information about dataset, always sit in memory.
Definition: data.h:40
std::vector< bool > threads_to_nids_map_
Buffer for additional histograms for Parallel processing.
Definition: hist_util.h:658
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
util to compute quantiles
Definition: hist_util.h:215
The input data structure of xgboost.
T * data() const
Definition: hist_util.h:257
HistogramCuts & operator=(HistogramCuts const &that)
Definition: hist_util.h:67
GHistBuilder(size_t nthread, uint32_t nbins)
Definition: hist_util.h:673
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:322
void MatchNodeNidPairToHist()
Definition: hist_util.h:619
Internal data structured used by XGBoost during training. There are two ways to create a customized D...
Definition: data.h:451
size_t GetFirstDimension(size_t i) const
Definition: threading_utils.h:89
Cut configuration for dense dataset.
Definition: hist_util.h:190
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:211
bool IsDense() const
Definition: hist_util.h:353
HistogramCuts & operator=(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:77
Definition: hist_util.h:127
std::vector< size_t > hit_count
hit count of each index
Definition: hist_util.h:320
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:126
Definition: hist_util.h:216
builder for histograms of gradient statistics
Definition: hist_util.h:670
size_t OffsetSize() const
Definition: hist_util.h:263
std::vector< float > const & MinValues() const
Definition: hist_util.h:98
HostDeviceVector< bst_float > cut_values_
Definition: hist_util.h:48
GHistIndexBlock operator[](size_t i) const
Definition: hist_util.h:383
Quantile sketch use WQSummary.
Definition: quantile.h:831
XGBOOST_DEVICE constexpr pointer data() const __span_noexcept
Definition: span.h:526
Quick Utility to compute subset of rows.
GHistRow GetInitializedHist(size_t tid, size_t nid)
Definition: hist_util.h:531
HistogramCuts(HistogramCuts const &that)
Definition: hist_util.h:54
void Init(std::string label)
Definition: timer.h:82
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:253
size_t TotalBins() const
Definition: hist_util.h:100
HistogramCuts(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:63
std::vector< uint8_t >::const_iterator begin() const
Definition: hist_util.h:278
std::vector< uint8_t >::const_iterator end() const
Definition: hist_util.h:281
Cut configuration for sparse dataset.
Definition: hist_util.h:168
const size_t * row_ptr
Definition: hist_util.h:363
size_t Size() const
Definition: threading_utils.h:84
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:436
void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d &space, const std::vector< GHistRow > &targeted_hists)
Definition: hist_util.h:508
void MatchThreadsToNodes(const BlockedSpace2d &space)
Definition: hist_util.h:573
Definition: hist_util.h:377
HistogramCuts * p_cuts_
Definition: hist_util.h:134
std::vector< int > hist_was_used_
Marks which hists were used, it means that they should be merged. Contains only {true or false} value...
Definition: hist_util.h:655
GHistRow operator[](bst_uint nid) const
Definition: hist_util.h:439
common::Monitor monitor_
Definition: hist_util.h:45
std::vector< T > & HostVector()
void Build(DMatrix *dmat, uint32_t const max_num_bins)
void Resize(const size_t nBytesData)
Definition: hist_util.h:269
uint32_t operator[](size_t i) const
Definition: hist_util.h:228
Index()
Definition: hist_util.h:221
GHistIndexRow operator[](size_t i) const
Definition: hist_util.h:370
HistogramCuts DeviceSketch(int device, DMatrix *dmat, int max_bins, size_t sketch_batch_num_elements=0)
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:99
HostDeviceVector< float > min_vals_
Definition: hist_util.h:51
BinTypeSize
Definition: hist_util.h:214
SparseCuts(HistogramCuts *container)
Definition: hist_util.h:174
uint32_t BinIdx
Definition: hist_util.h:44
std::vector< GHistRow > targeted_hists_
Contains histograms for final results.
Definition: hist_util.h:660
namespace of xgboost
Definition: base.h:102
size_t Size() const
Definition: hist_util.h:266
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:24
uint32_t * Offset() const
Definition: hist_util.h:260
const std::vector< T > & ConstHostVector() const
Definition: threading_utils.h:54
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:47
std::vector< GHistRow > hist_memory_
Allocated memory for histograms used for construction.
Definition: hist_util.h:662
HistogramCuts AdapterDeviceSketch(AdapterT *adapter, int num_bins, float missing, size_t sketch_batch_num_elements=0)
size_t max_num_bins
Definition: hist_util.h:324
uint32_t GetNumBins() const
Definition: hist_util.h:689
CutsBuilder(HistogramCuts *p_cuts)
Definition: hist_util.h:137
void AddCutPoint(WQSketch::SummaryContainer const &summary, int max_bin)
Definition: hist_util.h:153
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:316
Stores temporary histograms to compute them in parallel Supports processing multiple tree-nodes for n...
Definition: hist_util.h:497
void AllocateAdditionalHistograms()
Definition: hist_util.h:595
Element from a sparse vector.
Definition: data.h:167
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Monitor monitor_
Definition: hist_util.h:192
HostDeviceVector< uint32_t > cut_ptrs_
Definition: hist_util.h:49
void Init(size_t nbins)
Definition: hist_util.h:499
void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, size_t begin, size_t end)
Compute Subtraction: dst = src1 - src2 in range [begin, end)
preprocessed global index matrix, in CSR format
Definition: hist_util.h:314
std::vector< float > const & Values() const
Definition: hist_util.h:97
bst_feature_t index
feature index
Definition: data.h:169
bst_float fvalue
feature value
Definition: data.h:171
const uint32_t * index
Definition: hist_util.h:364
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:343
void Resize(size_t new_size, T v=T())
std::map< std::pair< size_t, size_t >, size_t > tid_nid_to_hist_
map pair {tid, nid} to index of allocated histogram from hist_memory_
Definition: hist_util.h:664
void ResizeOffset(const size_t nDisps)
Definition: hist_util.h:273
Definition: hist_util.h:217
bool RowExists(bst_uint nid) const
Definition: hist_util.h:448
GHistIndexBlock(const size_t *row_ptr, const uint32_t *index)
Definition: hist_util.h:366
Definition: hist_util.h:36
DMatrix * p_fmat
Definition: hist_util.h:323