Go to the documentation of this file.
7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
8 #define XGBOOST_COMMON_HIST_UTIL_H_
27 class GHistIndexMatrix;
39 bool has_categorical_{
false};
40 float max_cat_{-1.0f};
50 std::swap(has_categorical_, that.has_categorical_);
61 has_categorical_ = that.has_categorical_;
62 max_cat_ = that.max_cat_;
75 this->
Swap(std::forward<HistogramCuts>(that));
84 this->
Swap(std::forward<HistogramCuts>(that));
105 has_categorical_ = has_cat;
114 std::vector<float>
const& values)
const {
115 auto end = ptrs[column_id + 1];
116 auto beg = ptrs[column_id];
117 auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
118 BinIdx idx = it - values.cbegin();
119 idx -= !!(idx == end);
138 auto const &ptrs = this->
Ptrs();
139 auto const &vals = this->
Values();
140 auto end = ptrs.at(e.
index + 1) + vals.cbegin();
141 auto beg = ptrs[e.
index] + vals.cbegin();
144 auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
145 if (bin_idx == ptrs.at(e.
index + 1)) {
159 bool use_sorted =
false,
Span<float> const hessian = {}) {
161 auto const& info = m->
Info();
162 std::vector<std::vector<bst_row_t>> column_sizes(n_threads);
163 for (
auto& column : column_sizes) {
164 column.resize(info.num_col_, 0);
166 std::vector<bst_row_t> reduced(info.num_col_, 0);
167 for (
auto const& page : m->
GetBatches<SparsePage>()) {
168 auto const& entries_per_column =
170 for (
size_t i = 0; i < entries_per_column.size(); ++i) {
171 reduced[i] += entries_per_column[i];
178 for (
auto const& page : m->
GetBatches<SparsePage>()) {
179 container.PushRowPage(page, info, hessian);
181 container.MakeCuts(&out);
183 SortedSketchContainer container{
185 for (
auto const& page : m->
GetBatches<SortedCSCPage>()) {
186 container.PushColPage(page, info, hessian);
188 container.MakeCuts(&out);
214 if (!bin_offset_.empty()) {
216 auto fidx = i % bin_offset_.size();
218 return func_(data_.data(), i) + bin_offset_[fidx];
220 return func_(data_.data(), i);
224 binTypeSize_ = binTypeSize;
225 switch (binTypeSize) {
227 func_ = &GetValueFromUint8;
230 func_ = &GetValueFromUint16;
233 func_ = &GetValueFromUint32;
243 template <
typename T>
245 return reinterpret_cast<T const*
>(data_.data());
247 template <
typename T>
249 return reinterpret_cast<T*
>(data_.data());
251 uint32_t
const*
Offset()
const {
return bin_offset_.data(); }
253 size_t Size()
const {
return data_.size() / (binTypeSize_); }
256 data_.resize(n_bytes);
260 bin_offset_.resize(cut_ptrs.size() - 1);
261 std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
263 std::vector<uint8_t>::const_iterator
begin()
const {
264 return data_.begin();
266 std::vector<uint8_t>::const_iterator
end()
const {
270 std::vector<uint8_t>::iterator
begin() {
271 return data_.begin();
273 std::vector<uint8_t>::iterator
end() {
279 static uint32_t GetValueFromUint8(uint8_t
const* t,
size_t i) {
return t[i]; }
280 static uint32_t GetValueFromUint16(uint8_t
const* t,
size_t i) {
281 return reinterpret_cast<uint16_t const*
>(t)[i];
283 static uint32_t GetValueFromUint32(uint8_t
const* t,
size_t i) {
284 return reinterpret_cast<uint32_t const*
>(t)[i];
287 using Func = uint32_t (*)(uint8_t
const*, size_t);
289 std::vector<uint8_t> data_;
292 std::vector<uint32_t> bin_offset_;
298 template <
typename GradientIndex>
300 GradientIndex
const &data,
301 uint32_t
const fidx_begin,
302 uint32_t
const fidx_end) {
303 size_t previous_middle = std::numeric_limits<size_t>::max();
304 while (end != begin) {
305 size_t middle = begin + (end - begin) / 2;
306 if (middle == previous_middle) {
309 previous_middle = middle;
312 auto gidx = data[middle];
314 if (gidx >= fidx_begin && gidx < fidx_end) {
316 return static_cast<int32_t
>(gidx);
317 }
else if (gidx < fidx_begin) {
329 template<
typename GradientSumT>
335 template<
typename GradientSumT>
341 template<
typename GradientSumT>
343 size_t begin,
size_t end);
348 template<
typename GradientSumT>
350 size_t begin,
size_t end);
355 template<
typename GradientSumT>
358 size_t begin,
size_t end);
363 template<
typename GradientSumT>
371 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
372 const size_t id = row_ptr_.at(nid);
375 if (contiguous_allocation_) {
376 ptr =
const_cast<GradientPairT*
>(data_[0].data() + nbins_*id);
380 return {ptr, nbins_};
385 const uint32_t k_max = std::numeric_limits<uint32_t>::max();
386 return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
391 if (nbins_ != nbins) {
402 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
403 if (nid >= row_ptr_.size()) {
404 row_ptr_.resize(nid + 1, kMax);
406 CHECK_EQ(row_ptr_[nid], kMax);
408 if (data_.size() < (nid + 1)) {
409 data_.resize((nid + 1));
412 row_ptr_[nid] = n_nodes_added_;
417 if (data_[row_ptr_[nid]].size() == 0) {
418 data_[row_ptr_[nid]].resize(nbins_, {0, 0});
423 const size_t new_size = nbins_*data_.size();
424 contiguous_allocation_ =
true;
425 if (data_[0].size() != new_size) {
426 data_[0].resize(new_size);
434 uint32_t n_nodes_added_ = 0;
436 bool contiguous_allocation_ =
false;
438 std::vector<std::vector<GradientPairT>> data_;
441 std::vector<size_t> row_ptr_;
449 template<
typename GradientSumT>
455 if (nbins != nbins_) {
456 hist_buffer_.Init(nbins);
464 const std::vector<GHistRowT>& targeted_hists) {
465 hist_buffer_.Init(nbins_);
466 tid_nid_to_hist_.clear();
467 threads_to_nids_map_.clear();
469 targeted_hists_ = targeted_hists;
471 CHECK_EQ(nodes, targeted_hists.size());
474 nthreads_ = nthreads;
478 MatchNodeNidPairToHist();
480 hist_was_used_.resize(nthreads * nodes_);
481 std::fill(hist_was_used_.begin(), hist_was_used_.end(),
static_cast<int>(
false));
486 CHECK_LT(nid, nodes_);
487 CHECK_LT(tid, nthreads_);
489 int idx = tid_nid_to_hist_.at({tid, nid});
491 hist_buffer_.AllocateData(idx);
493 GHistRowT hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
495 if (!hist_was_used_[tid * nodes_ + nid]) {
497 hist_was_used_[tid * nodes_ + nid] =
static_cast<int>(
true);
504 void ReduceHist(
size_t nid,
size_t begin,
size_t end)
const {
505 CHECK_GT(end, begin);
506 CHECK_LT(nid, nodes_);
510 bool is_updated =
false;
511 for (
size_t tid = 0; tid < nthreads_; ++tid) {
512 if (hist_was_used_[tid * nodes_ + nid]) {
515 int idx = tid_nid_to_hist_.at({tid, nid});
516 GHistRowT src = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx];
531 const size_t space_size = space.
Size();
532 const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
534 threads_to_nids_map_.resize(nthreads_ * nodes_,
false);
536 for (
size_t tid = 0; tid < nthreads_; ++tid) {
537 size_t begin = chunck_size * tid;
538 size_t end = std::min(begin + chunck_size, space_size);
540 if (begin < space_size) {
544 for (
size_t nid = nid_begin; nid <= nid_end; ++nid) {
546 threads_to_nids_map_[tid * nodes_ + nid] =
true;
553 size_t hist_allocated_additionally = 0;
555 for (
size_t nid = 0; nid < nodes_; ++nid) {
556 int nthreads_for_nid = 0;
558 for (
size_t tid = 0; tid < nthreads_; ++tid) {
559 if (threads_to_nids_map_[tid * nodes_ + nid]) {
568 hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
571 for (
size_t i = 0; i < hist_allocated_additionally; ++i) {
572 hist_buffer_.AddHistRow(i);
577 void MatchNodeNidPairToHist() {
578 size_t hist_allocated_additionally = 0;
580 for (
size_t nid = 0; nid < nodes_; ++nid) {
581 bool first_hist =
true;
582 for (
size_t tid = 0; tid < nthreads_; ++tid) {
583 if (threads_to_nids_map_[tid * nodes_ + nid]) {
585 tid_nid_to_hist_[{tid, nid}] = -1;
588 tid_nid_to_hist_[{tid, nid}] = hist_allocated_additionally++;
598 size_t nthreads_ = 0;
602 HistCollection<GradientSumT> hist_buffer_;
608 std::vector<int> hist_was_used_;
611 std::vector<bool> threads_to_nids_map_;
613 std::vector<GHistRowT> targeted_hists_;
618 std::map<std::pair<size_t, size_t>,
int> tid_nid_to_hist_;
624 template<
typename GradientSumT>
633 template <
bool any_missing>
634 void BuildHist(
const std::vector<GradientPair> &gpair,
635 const RowSetCollection::Elem row_indices,
636 const GHistIndexMatrix &gmat,
GHistRowT hist)
const;
643 uint32_t nbins_ { 0 };
647 #endif // XGBOOST_COMMON_HIST_UTIL_H_
bst_feature_t index
feature index
Definition: data.h:189
util to compute quantiles
virtual MetaInfo & Info()=0
meta information of the dataset
#define XGBOOST_HOST_DEV_INLINE
Definition: base.h:91
std::vector< uint8_t >::iterator end()
Definition: hist_util.h:273
void IncrementHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > add, size_t begin, size_t end)
Increment hist as dst += add in range [begin, end)
Implementation of gradient statistics pair. Template specialisation may be used to overload different...
Definition: base.h:141
void AllocateData(bst_uint nid)
Definition: hist_util.h:416
GHistRowT GetInitializedHist(size_t tid, size_t nid)
Definition: hist_util.h:485
BinIdx SearchBin(float value, bst_feature_t column_id, std::vector< uint32_t > const &ptrs, std::vector< float > const &values) const
Definition: hist_util.h:113
T * data()
Definition: hist_util.h:248
@ kUint32BinsTypeSize
Definition: hist_util.h:197
size_t TotalBins() const
Definition: hist_util.h:109
size_t Size() const
Definition: threading_utils.h:100
uint32_t FeatureBins(bst_feature_t feature) const
Definition: hist_util.h:88
HostDeviceVector< float > cut_values_
Definition: hist_util.h:66
HistogramCuts(HistogramCuts const &that)
Definition: hist_util.h:72
void swap(xgboost::IntrusivePtr< T > &x, xgboost::IntrusivePtr< T > &y) noexcept
Definition: intrusive_ptr.h:209
void AllocateAllData()
Definition: hist_util.h:422
float MaxCategory() const
Definition: hist_util.h:97
uint32_t BinIdx
Definition: hist_util.h:43
Element from a sparse vector.
Definition: data.h:187
Quick Utility to compute subset of rows.
Definition: hist_util.h:38
void Resize(const size_t n_bytes)
Definition: hist_util.h:255
std::vector< uint8_t >::iterator begin()
Definition: hist_util.h:270
void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d &space, const std::vector< GHistRowT > &targeted_hists)
Definition: hist_util.h:463
Optionally compressed gradient index. The compression works only with dense data.
Definition: hist_util.h:207
HistogramCuts(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:74
HostDeviceVector< uint32_t > cut_ptrs_
Definition: hist_util.h:67
size_t GetFirstDimension(size_t i) const
Definition: threading_utils.h:105
void SetCategorical(bool has_cat, float max_cat)
Set meta info about categorical features.
Definition: hist_util.h:104
static std::vector< bst_row_t > CalcColumnSize(SparsePage const &page, bst_feature_t const n_columns, size_t const nthreads)
void CopyHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src, size_t begin, size_t end)
Copy hist from src to dst in range [begin, end)
Internal data structured used by XGBoost during training.
Definition: data.h:475
Definition: threading_utils.h:70
uint32_t GetNumBins() const
Definition: hist_util.h:637
void BuildHist(const std::vector< GradientPair > &gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, GHistRowT hist) const
void AllocateAdditionalHistograms()
Definition: hist_util.h:552
uint32_t bst_feature_t
Type for data column (feature) index.
Definition: base.h:123
void SetBinTypeSize(BinTypeSize binTypeSize)
Definition: hist_util.h:223
size_t Size() const
Definition: hist_util.h:253
void AddHistRow(bst_uint nid)
Definition: hist_util.h:401
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:92
void Init(uint32_t nbins)
Definition: hist_util.h:390
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:364
HostDeviceVector< float > min_vals_
Definition: hist_util.h:69
GHistRowT operator[](bst_uint nid) const
Definition: hist_util.h:370
HistogramCuts & operator=(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:83
int32_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(size_t begin, size_t end, GradientIndex const &data, uint32_t const fidx_begin, uint32_t const fidx_end)
Definition: hist_util.h:299
Stores temporary histograms to compute them in parallel Supports processing multiple tree-nodes for n...
Definition: hist_util.h:450
@ kUint8BinsTypeSize
Definition: hist_util.h:195
uint32_t operator[](size_t i) const
Definition: hist_util.h:213
void ReduceHist(size_t nid, size_t begin, size_t end) const
Definition: hist_util.h:504
BatchSet< T > GetBatches()
Gets batches. Use range based for loop over BatchSet to access individual batches.
bool RowExists(bst_uint nid) const
Definition: hist_util.h:384
static bool UseGroup(MetaInfo const &info)
Definition: quantile.h:730
void InitilizeHistByZeroes(GHistRow< GradientSumT > hist, size_t begin, size_t end)
fill a histogram by zeros
std::vector< float > const & MinValues() const
Definition: hist_util.h:94
Index & operator=(Index i)=delete
BinIdx SearchBin(float value, bst_feature_t column_id) const
Definition: hist_util.h:123
void MatchThreadsToNodes(const BlockedSpace2d &space)
Definition: hist_util.h:530
void Copy(const HostDeviceVector< T > &other)
void SubtractionHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src1, const GHistRow< GradientSumT > src2, size_t begin, size_t end)
Compute Subtraction: dst = src1 - src2 in range [begin, end)
void Resize(size_t new_size, T v=T())
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:240
BinTypeSize
Definition: hist_util.h:194
T const * data() const
Definition: hist_util.h:244
GHistRow< GradientSumT > GHistRowT
Definition: hist_util.h:627
The input data structure of xgboost.
void Copy(HistogramCuts const &that)
Definition: hist_util.h:54
void SetBinOffset(std::vector< uint32_t > const &cut_ptrs)
Definition: hist_util.h:259
Index()
Definition: hist_util.h:208
void Init(size_t nbins)
Definition: hist_util.h:454
BinIdx SearchBin(Entry const &e) const
Search the bin index for numerical feature.
Definition: hist_util.h:130
const std::vector< T > & ConstHostVector() const
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:553
GHistBuilder(uint32_t nbins)
Definition: hist_util.h:630
BinIdx SearchCatBin(Entry const &e) const
Search the bin index for categorical feature.
Definition: hist_util.h:137
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:548
HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, bool use_sorted=false, Span< float > const hessian={})
Run CPU sketching on DMatrix.
Definition: hist_util.h:158
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:113
@ kUint16BinsTypeSize
Definition: hist_util.h:196
bst_float fvalue
feature value
Definition: data.h:191
std::vector< uint8_t >::const_iterator begin() const
Definition: hist_util.h:263
HistogramCuts & operator=(HistogramCuts const &that)
Definition: hist_util.h:78
std::vector< uint8_t >::const_iterator end() const
Definition: hist_util.h:266
std::vector< float > const & Values() const
Definition: hist_util.h:93
size_t OffsetSize() const
Definition: hist_util.h:252
bool HasCategorical() const
Definition: hist_util.h:96
builder for histograms of gradient statistics
Definition: hist_util.h:625
void Swap(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:45
uint32_t const * Offset() const
Definition: hist_util.h:251
XGBOOST_DEVICE bst_cat_t AsCat(T const &v)
Definition: categorical.h:24
namespace of xgboost
Definition: base.h:110