Go to the documentation of this file.
7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
8 #define XGBOOST_COMMON_HIST_UTIL_H_
22 #include "../tree/param.h"
25 #include "../include/rabit/rabit.h"
28 class GHistIndexMatrix;
60 *
this = std::forward<HistogramCuts&&>(that);
100 auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
101 BinIdx idx = it - values.cbegin();
116 auto const& info = m->
Info();
117 const auto threads = omp_get_max_threads();
118 std::vector<std::vector<bst_row_t>> column_sizes(threads);
119 for (
auto& column : column_sizes) {
120 column.resize(info.num_col_, 0);
122 std::vector<bst_row_t> reduced(info.num_col_, 0);
123 for (
auto const& page : m->
GetBatches<SparsePage>()) {
124 auto const &entries_per_column =
126 for (
size_t i = 0; i < entries_per_column.size(); ++i) {
127 reduced[i] += entries_per_column[i];
130 HostSketchContainer container(reduced, max_bins,
133 for (
auto const &page : m->
GetBatches<SparsePage>()) {
134 container.PushRowPage(page, info, hessian);
136 container.MakeCuts(&out);
155 if (offset_ptr_ !=
nullptr) {
156 return func_(data_ptr_, i) + offset_ptr_[i%p_];
158 return func_(data_ptr_, i);
162 binTypeSize_ = binTypeSize;
163 switch (binTypeSize) {
165 func_ = &GetValueFromUint8;
168 func_ = &GetValueFromUint16;
171 func_ = &GetValueFromUint32;
184 return static_cast<T*
>(data_ptr_);
190 return offset_.size();
193 return data_.size() / (binTypeSize_);
196 data_.resize(nBytesData);
197 data_ptr_ =
reinterpret_cast<void*
>(data_.data());
200 offset_.resize(nDisps);
201 offset_ptr_ = offset_.data();
204 std::vector<uint8_t>::const_iterator
begin()
const {
205 return data_.begin();
207 std::vector<uint8_t>::const_iterator
end()
const {
211 std::vector<uint8_t>::iterator
begin() {
212 return data_.begin();
214 std::vector<uint8_t>::iterator
end() {
219 static uint32_t GetValueFromUint8(
void *t,
size_t i) {
220 return reinterpret_cast<uint8_t*
>(t)[i];
222 static uint32_t GetValueFromUint16(
void* t,
size_t i) {
223 return reinterpret_cast<uint16_t*
>(t)[i];
225 static uint32_t GetValueFromUint32(
void* t,
size_t i) {
226 return reinterpret_cast<uint32_t*
>(t)[i];
229 using Func = uint32_t (*)(
void*, size_t);
231 std::vector<uint8_t> data_;
232 std::vector<uint32_t> offset_;
236 uint32_t* offset_ptr_ {
nullptr};
240 template <
typename GradientIndex>
242 GradientIndex
const &data,
243 uint32_t
const fidx_begin,
244 uint32_t
const fidx_end) {
245 size_t previous_middle = std::numeric_limits<size_t>::max();
246 while (end != begin) {
247 size_t middle = begin + (end - begin) / 2;
248 if (middle == previous_middle) {
251 previous_middle = middle;
253 auto gidx = data[middle];
255 if (gidx >= fidx_begin && gidx < fidx_end) {
256 return static_cast<int32_t
>(gidx);
257 }
else if (gidx < fidx_begin) {
269 template<
typename GradientSumT>
275 template<
typename GradientSumT>
281 template<
typename GradientSumT>
283 size_t begin,
size_t end);
288 template<
typename GradientSumT>
290 size_t begin,
size_t end);
295 template<
typename GradientSumT>
298 size_t begin,
size_t end);
303 template<
typename GradientSumT>
311 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
312 const size_t id = row_ptr_.at(nid);
315 if (contiguous_allocation_) {
316 ptr =
const_cast<GradientPairT*
>(data_[0].data() + nbins_*id);
320 return {ptr, nbins_};
325 const uint32_t k_max = std::numeric_limits<uint32_t>::max();
326 return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
331 if (nbins_ != nbins) {
342 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
343 if (nid >= row_ptr_.size()) {
344 row_ptr_.resize(nid + 1, kMax);
346 CHECK_EQ(row_ptr_[nid], kMax);
348 if (data_.size() < (nid + 1)) {
349 data_.resize((nid + 1));
352 row_ptr_[nid] = n_nodes_added_;
357 if (data_[row_ptr_[nid]].size() == 0) {
358 data_[row_ptr_[nid]].resize(nbins_, {0, 0});
363 const size_t new_size = nbins_*data_.size();
364 contiguous_allocation_ =
true;
365 if (data_[0].size() != new_size) {
366 data_[0].resize(new_size);
374 uint32_t n_nodes_added_ = 0;
376 bool contiguous_allocation_ =
false;
378 std::vector<std::vector<GradientPairT>> data_;
381 std::vector<size_t> row_ptr_;
389 template<
typename GradientSumT>
404 const std::vector<GHistRowT>& targeted_hists) {
411 CHECK_EQ(nodes, targeted_hists.size());
445 CHECK_GT(end, begin);
450 bool is_updated =
false;
451 for (
size_t tid = 0; tid <
nthreads_; ++tid) {
472 const size_t space_size = space.
Size();
477 for (
size_t tid = 0; tid <
nthreads_; ++tid) {
478 size_t begin = chunck_size * tid;
479 size_t end = std::min(begin + chunck_size, space_size);
481 if (begin < space_size) {
485 for (
size_t nid = nid_begin; nid <= nid_end; ++nid) {
494 size_t hist_allocated_additionally = 0;
496 for (
size_t nid = 0; nid <
nodes_; ++nid) {
497 int nthreads_for_nid = 0;
499 for (
size_t tid = 0; tid <
nthreads_; ++tid) {
509 hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
512 for (
size_t i = 0; i < hist_allocated_additionally; ++i) {
518 size_t hist_allocated_additionally = 0;
520 for (
size_t nid = 0; nid <
nodes_; ++nid) {
521 bool first_hist =
true;
522 for (
size_t tid = 0; tid <
nthreads_; ++tid) {
564 template<
typename GradientSumT>
570 GHistBuilder(
size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
573 template <
bool any_missing>
574 void BuildHist(
const std::vector<GradientPair>& gpair,
575 const RowSetCollection::Elem row_indices,
576 const GHistIndexMatrix& gmat,
589 size_t nthread_ { 0 };
591 uint32_t nbins_ { 0 };
599 template<
typename T,
size_t MaxStackSize>
607 if (MaxStackSize >= required_size_) {
610 ptr_ =
reinterpret_cast<T*
>(malloc(required_size_ *
sizeof(T)));
619 if (do_free_) free(ptr_);
625 bool do_free_ =
false;
626 size_t required_size_;
627 T stack_mem_[MaxStackSize];
631 #endif // XGBOOST_COMMON_HIST_UTIL_H_
bst_feature_t index
feature index
Definition: data.h:189
util to compute quantiles
virtual MetaInfo & Info()=0
meta information of the dataset
#define XGBOOST_HOST_DEV_INLINE
Definition: base.h:91
std::vector< uint8_t >::iterator end()
Definition: hist_util.h:214
void IncrementHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > add, size_t begin, size_t end)
Increment hist as dst += add in range [begin, end)
Implementation of gradient statistics pair. Template specialisation may be used to overload different...
Definition: base.h:141
void AllocateData(bst_uint nid)
Definition: hist_util.h:356
GHistRowT GetInitializedHist(size_t tid, size_t nid)
Definition: hist_util.h:425
@ kUint32BinsTypeSize
Definition: hist_util.h:143
size_t nodes_
number of nodes which will be processed in parallel
Definition: hist_util.h:540
size_t TotalBins() const
Definition: hist_util.h:92
size_t Size() const
Definition: threading_utils.h:102
HistogramCuts(HistogramCuts const &that)
Definition: hist_util.h:50
void AllocateAllData()
Definition: hist_util.h:362
uint32_t BinIdx
Definition: hist_util.h:41
Element from a sparse vector.
Definition: data.h:187
Quick Utility to compute subset of rows.
Definition: hist_util.h:39
std::vector< uint8_t >::iterator begin()
Definition: hist_util.h:211
std::vector< bool > threads_to_nids_map_
Buffer for additional histograms for Parallel processing
Definition: hist_util.h:551
void ReduceHist(size_t nid, size_t begin, size_t end)
Definition: hist_util.h:444
void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d &space, const std::vector< GHistRowT > &targeted_hists)
Definition: hist_util.h:403
~MemStackAllocator()
Definition: hist_util.h:618
void ResizeOffset(const size_t nDisps)
Definition: hist_util.h:199
Definition: hist_util.h:146
HistogramCuts(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:59
void Resize(const size_t nBytesData)
Definition: hist_util.h:195
common::Span< T const > ConstHostSpan() const
Definition: host_device_vector.h:114
HostDeviceVector< uint32_t > cut_ptrs_
Definition: hist_util.h:45
size_t GetFirstDimension(size_t i) const
Definition: threading_utils.h:107
BinIdx SearchBin(float value, uint32_t column_id) const
Definition: hist_util.h:96
std::map< std::pair< size_t, size_t >, int > tid_nid_to_hist_
map pair {tid, nid} to index of allocated histogram from hist_buffer_ and targeted_hists_,...
Definition: hist_util.h:558
HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, Span< float > const hessian={})
Definition: hist_util.h:113
void CopyHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src, size_t begin, size_t end)
Copy hist from src to dst in range [begin, end)
Internal data structured used by XGBoost during training.
Definition: data.h:469
static std::vector< bst_row_t > CalcColumnSize(SparsePage const &page, bst_feature_t const n_columns, size_t const nthreads)
Definition: threading_utils.h:72
uint32_t * Offset() const
Definition: hist_util.h:186
uint32_t GetNumBins() const
Definition: hist_util.h:583
void AllocateAdditionalHistograms()
Definition: hist_util.h:493
void SetBinTypeSize(BinTypeSize binTypeSize)
Definition: hist_util.h:161
size_t Size() const
Definition: hist_util.h:192
T * data() const
Definition: hist_util.h:183
void AddHistRow(bst_uint nid)
Definition: hist_util.h:341
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:88
void Init(uint32_t nbins)
Definition: hist_util.h:330
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:304
HostDeviceVector< float > min_vals_
Definition: hist_util.h:47
GHistRowT operator[](bst_uint nid) const
Definition: hist_util.h:310
HistogramCuts & operator=(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:73
int32_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(size_t begin, size_t end, GradientIndex const &data, uint32_t const fidx_begin, uint32_t const fidx_end)
Definition: hist_util.h:241
Stores temporary histograms to compute them in parallel Supports processing multiple tree-nodes for n...
Definition: hist_util.h:390
GHistBuilder(size_t nthread, uint32_t nbins)
Definition: hist_util.h:570
@ kUint8BinsTypeSize
Definition: hist_util.h:141
uint32_t operator[](size_t i) const
Definition: hist_util.h:154
bool RowExists(bst_uint nid) const
Definition: hist_util.h:324
void InitilizeHistByZeroes(GHistRow< GradientSumT > hist, size_t begin, size_t end)
fill a histogram by zeros
uint32_t FeatureBins(uint32_t feature) const
Definition: hist_util.h:80
std::vector< float > const & MinValues() const
Definition: hist_util.h:90
size_t nbins_
number of bins in each histogram
Definition: hist_util.h:536
Index & operator=(Index i)=delete
void MatchThreadsToNodes(const BlockedSpace2d &space)
Definition: hist_util.h:471
void Copy(const HostDeviceVector< T > &other)
void SubtractionHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src1, const GHistRow< GradientSumT > src2, size_t begin, size_t end)
Compute Subtraction: dst = src1 - src2 in range [begin, end)
size_t nthreads_
number of threads for parallel computation
Definition: hist_util.h:538
void MatchNodeNidPairToHist()
Definition: hist_util.h:517
void Resize(size_t new_size, T v=T())
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:179
MemStackAllocator(size_t required_size)
Definition: hist_util.h:602
static bool UseGroup(MetaInfo const &info)
Definition: quantile.h:731
BinTypeSize
Definition: hist_util.h:140
GHistRow< GradientSumT > GHistRowT
Definition: hist_util.h:567
The input data structure of xgboost.
Index()
Definition: hist_util.h:147
void Init(size_t nbins)
Definition: hist_util.h:394
BinIdx SearchBin(Entry const &e) const
Definition: hist_util.h:108
const std::vector< T > & ConstHostVector() const
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:547
A C-style array with in-stack allocation. As long as the array is smaller than MaxStackSize,...
Definition: hist_util.h:600
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:542
HostDeviceVector< bst_float > cut_values_
Definition: hist_util.h:44
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:113
@ kUint16BinsTypeSize
Definition: hist_util.h:142
std::vector< GHistRowT > targeted_hists_
Contains histograms for final results
Definition: hist_util.h:553
T * Get()
Definition: hist_util.h:605
bst_float fvalue
feature value
Definition: data.h:191
std::vector< uint8_t >::const_iterator begin() const
Definition: hist_util.h:204
HistCollection< GradientSumT > hist_buffer_
Buffer for additional histograms for Parallel processing
Definition: hist_util.h:542
BatchSet< T > GetBatches(const BatchParam ¶m={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
HistogramCuts & operator=(HistogramCuts const &that)
Definition: hist_util.h:63
std::vector< uint8_t >::const_iterator end() const
Definition: hist_util.h:207
std::vector< float > const & Values() const
Definition: hist_util.h:89
void SubtractionTrick(GHistRowT self, GHistRowT sibling, GHistRowT parent)
std::vector< int > hist_was_used_
Marks which hists were used, it means that they should be merged. Contains only {true or false} value...
Definition: hist_util.h:548
size_t OffsetSize() const
Definition: hist_util.h:189
builder for histograms of gradient statistics
Definition: hist_util.h:565
void BuildHist(const std::vector< GradientPair > &gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, GHistRowT hist)
namespace of xgboost
Definition: base.h:110