Go to the documentation of this file.
7 #ifndef XGBOOST_COMMON_QUANTILE_H_
8 #define XGBOOST_COMMON_QUANTILE_H_
10 #include <dmlc/base.h>
11 #include <xgboost/logging.h>
29 template<
typename DType,
typename RType>
51 CHECK(
rmin >= 0 &&
rmax >= 0 &&
wmin >= 0) <<
"nonneg constraint";
52 CHECK(
rmax-
rmin -
wmin > -eps) <<
"relation constraint: min/max";
64 os <<
"rmin: " << e.
rmin <<
", "
65 <<
"rmax: " << e.
rmax <<
", "
66 <<
"wmin: " << e.
wmin <<
", "
67 <<
"value: " << e.
value;
94 inline void Push(DType x, RType w) {
107 for (
size_t i = 0; i <
qtail;) {
109 RType w =
queue[i].weight;
111 w +=
queue[j].weight; ++j;
130 for (
size_t i = 1; i <
size; ++i) {
131 res = std::max(
data[i].RMaxPrev() -
data[i - 1].RMinNext(), res);
132 res = std::max(
data[i].rmax -
data[i].rmin -
data[i].wmin, res);
142 while (istart < size && qvalue >
data[istart].value) {
145 if (istart ==
size) {
147 return Entry(rmax, rmax, 0.0f, qvalue);
149 if (qvalue ==
data[istart].value) {
153 return Entry(0.0f, 0.0f, 0.0f, qvalue);
155 return Entry(
data[istart - 1].RMinNext(),
156 data[istart].RMaxPrev(),
171 CHECK_EQ(src.
size, 0);
176 CHECK_EQ(this->size, 0);
177 CHECK_EQ(src.
size, 0);
185 for (
size_t i = 0; i < n;) {
188 for (; j < n && entries[j].
value == entries[i].
value; ++j) {}
189 data[
size++] =
Entry(entries[i].rmin, entries[i].rmax, entries[i].wmin,
201 for (
size_t i = 0; i <
size; ++i) {
204 CHECK(
data[i].rmin >=
data[i - 1].rmin +
data[i - 1].wmin) <<
"rmin range constraint";
205 CHECK(
data[i].rmax >=
data[i - 1].rmax +
data[i].wmin) <<
"rmax range constraint";
217 if (src.
size <= maxsize) {
220 const RType begin = src.
data[0].
rmax;
222 const size_t n = maxsize - 1;
226 size_t i = 1, lastidx = 0;
227 for (
size_t k = 1; k < n; ++k) {
228 RType dx2 = 2 * ((k * range) / n + begin);
230 while (i < src.
size - 1
232 if (i == src.
size - 1)
break;
238 if (i + 1 != lastidx) {
243 if (lastidx != src.
size - 1) {
264 RType aprev_rmin = 0, bprev_rmin = 0;
266 while (a != a_end && b != b_end) {
290 RType brmax = (b_end - 1)->rmax;
294 }
while (a != a_end);
297 RType armax = (a_end - 1)->rmax;
301 }
while (b != b_end);
303 this->size = dst -
data;
304 const RType tol = 10;
305 RType err_mingap, err_maxgap, err_wgap;
306 this->
FixError(&err_mingap, &err_maxgap, &err_wgap);
307 if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
308 LOG(INFO) <<
"mingap=" << err_mingap
309 <<
", maxgap=" << err_maxgap
310 <<
", wgap=" << err_wgap;
316 for (
size_t i = 0; i < this->
size; ++i) {
317 LOG(CONSOLE) <<
"[" << i <<
"] rmin=" <<
data[i].
rmin
327 RType *err_wgap)
const {
331 RType prev_rmin = 0, prev_rmax = 0;
332 for (
size_t i = 0; i < this->
size; ++i) {
333 if (
data[i].rmin < prev_rmin) {
335 *err_mingap = std::max(*err_mingap, prev_rmin -
data[i].rmin);
339 if (
data[i].rmax < prev_rmax) {
341 *err_maxgap = std::max(*err_maxgap, prev_rmax -
data[i].rmax);
344 if (
data[i].rmax < rmin_next) {
346 *err_wgap = std::max(*err_wgap,
data[i].rmax - rmin_next);
352 inline bool Check(
const char *msg)
const {
353 const float tol = 10.0f;
354 for (
size_t i = 0; i < this->
size; ++i) {
356 data[i].rmin < -1e-6f ||
data[i].rmax < -1e-6f) {
357 LOG(INFO) <<
"---------- WQSummary::Check did not pass ----------";
367 template<
typename DType,
typename RType>
376 return e.RMinNext() > e.RMaxPrev() + chunk;
380 if (src.
size <= maxsize) {
383 RType begin = src.
data[0].rmax;
385 size_t n = maxsize - 2, nbig = 0;
387 RType range = src.
data[src.
size - 1].rmin - begin;
389 if (range == 0.0f || maxsize <= 2) {
396 range = std::max(range,
static_cast<RType
>(1e-3f));
400 const RType chunk = 2 * range / n;
407 for (
size_t i = 1; i < src.
size - 1; ++i) {
413 mrange += src.
data[i].RMaxPrev() - src.
data[bid].RMinNext();
418 if (bid != src.
size - 2) {
419 mrange += src.
data[src.
size-1].RMaxPrev() - src.
data[bid].RMinNext();
425 LOG(INFO) <<
" check quantile stats, nbig=" << nbig <<
", n=" << n;
426 LOG(INFO) <<
" srcsize=" << src.
size <<
", maxsize=" << maxsize
427 <<
", range=" << range <<
", chunk=" << chunk;
429 CHECK(nbig < n) <<
"quantile: too many large chunk";
436 size_t bid = 0, k = 1, lastidx = 0;
437 for (
size_t end = 1; end < src.
size; ++end) {
439 if (bid != end - 1) {
441 RType maxdx2 = src.
data[end].RMaxPrev() * 2;
443 RType dx2 = 2 * ((k * mrange) / n + begin);
444 if (dx2 >= maxdx2)
break;
446 dx2 >= src.
data[i + 1].rmax + src.
data[i + 1].rmin) ++i;
448 if (dx2 < src.
data[i].RMinNext() + src.
data[i + 1].RMaxPrev()) {
453 if (i + 1 != lastidx) {
454 this->
data[this->
size++] = src.
data[i + 1]; lastidx = i + 1;
459 if (lastidx != end) {
465 begin += src.
data[bid].RMinNext() - src.
data[bid].RMaxPrev();
477 template<
typename DType,
typename RType,
class TSummary>
491 this->space = src.
space;
492 this->
data = dmlc::BeginPtr(this->space);
498 if (size >
space.size()) {
510 this->
Reserve((max_nbyte -
sizeof(this->size)) /
sizeof(
Entry));
513 temp.SetCombine(*
this, src);
518 return sizeof(size_t) +
sizeof(
Entry) * nentry;
521 template<
typename TStream>
522 inline void Save(TStream &fo)
const {
523 fo.Write(&(this->size),
sizeof(this->size));
524 if (this->size != 0) {
525 fo.Write(this->
data, this->size *
sizeof(
Entry));
529 template<
typename TStream>
530 inline void Load(TStream &fi) {
531 CHECK_EQ(fi.Read(&this->size,
sizeof(this->size)),
sizeof(this->size));
533 if (this->size != 0) {
534 CHECK_EQ(fi.Read(this->data, this->size *
sizeof(
Entry)),
535 this->size *
sizeof(
Entry));
544 inline void Init(
size_t maxn,
double eps) {
554 (
size_t maxn,
double eps,
size_t* out_nlevel,
size_t* out_limit_size) {
555 size_t&
nlevel = *out_nlevel;
561 size_t n = (1ULL <<
nlevel);
566 size_t n = (1ULL <<
nlevel);
567 CHECK(n *
limit_size >= maxn) <<
"invalid init parameter";
568 CHECK(
nlevel <= std::max(
static_cast<size_t>(1),
static_cast<size_t>(
limit_size * eps)))
569 <<
"invalid init parameter";
577 inline void Push(DType x, RType w = 1) {
578 if (w ==
static_cast<RType
>(0))
return;
581 if (
inqueue.queue.size() == 1) {
603 for (
size_t l = 1;
true; ++l) {
606 if (
level[l].size == 0) {
625 if (
level.size() != 0) {
628 out->Reserve(
inqueue.queue.size());
631 if (
level.size() != 0) {
633 for (
size_t l = 1; l <
level.size(); ++l) {
634 if (
level[l].size == 0)
continue;
635 if (
level[0].size == 0) {
642 out->CopyFrom(
level[0]);
653 for (
size_t l = 1; l <
level.size(); ++l) {
654 level[l].CheckValid(eps);
662 for (
size_t l = 0; l <
level.size(); ++l) {
685 template<
typename DType,
typename RType =
unsigned>
695 template<
typename DType,
typename RType =
unsigned>
705 template <
typename WQSketch>
731 size_t const num_groups =
734 bool const use_group_ind =
736 return use_group_ind;
741 size_t const nthreads);
745 size_t const nthreads);
748 size_t const base_rowid) {
749 CHECK_LT(base_rowid, group_ptr.back())
750 <<
"Row: " << base_rowid <<
" is not found in any group.";
752 std::upper_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid) -
753 group_ptr.cbegin() - 1;
757 void GatherSketchInfo(std::vector<typename WQSketch::SummaryContainer>
const &reduced,
758 std::vector<bst_row_t> *p_worker_segments,
759 std::vector<bst_row_t> *p_sketches_scan,
760 std::vector<typename WQSketch::Entry> *p_global_sketches);
762 void AllReduce(std::vector<typename WQSketch::SummaryContainer> *p_reduced,
763 std::vector<int32_t> *p_num_cuts);
795 inline void Init(
unsigned max_size) {
824 CHECK_LT(
sketch->
temp.size, max_size) <<
"invalid maximum size max_size=" << max_size
835 LOG(DEBUG) <<
"INFO: rmax=" << rmax <<
", sum_total=" <<
sum_total
852 <<
"Finalize: invalid maximum size, max_size=" << max_size
865 std::vector<SortedQuantile> sketches_;
870 std::vector<size_t> columns_size,
bool use_group,
875 sketches_.resize(info.num_col_);
877 for (
auto &sketch : sketches_) {
880 auto eps = 2.0 / max_bins;
888 void PushColPage(SparsePage
const &page, MetaInfo
const &info, Span<float const> hessian);
892 #endif // XGBOOST_COMMON_QUANTILE_H_
void CopyFrom(const WQSummary &src)
copy content from src
Definition: quantile.h:169
void MakeSummary(WQSummary *out)
Definition: quantile.h:101
bool Check(const char *msg) const
Definition: quantile.h:352
static void LimitSizeLevel(size_t maxn, double eps, size_t *out_nlevel, size_t *out_limit_size)
Definition: quantile.h:554
void FixError(RType *err_mingap, RType *err_maxgap, RType *err_wgap) const
Definition: quantile.h:325
void Reserve(size_t size)
reserve space for summary
Definition: quantile.h:497
void CheckValid(RType eps) const
Definition: quantile.h:652
Definition: quantile.h:771
void GatherSketchInfo(std::vector< typename WQSketch::SummaryContainer > const &reduced, std::vector< bst_row_t > *p_worker_segments, std::vector< bst_row_t > *p_sketches_scan, std::vector< typename WQSketch::Entry > *p_global_sketches)
void MakeFromSorted(const Entry *entries, size_t n)
Definition: quantile.h:183
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:271
WXQSummary(Entry *data, size_t size)
Definition: quantile.h:372
int32_t n_threads_
Definition: quantile.h:715
void Save(TStream &fo) const
save the data structure into stream
Definition: quantile.h:522
double wmin
Definition: quantile.h:787
XGBOOST_DEVICE RType RMaxPrev() const
Definition: quantile.h:59
Definition: hist_util.h:38
const std::vector< FeatureType > feature_types_
Definition: quantile.h:710
size_t limit_size
Definition: quantile.h:671
size_t qtail
Definition: quantile.h:92
RType rmax
maximum rank
Definition: quantile.h:36
void PushColPage(SparsePage const &page, MetaInfo const &info, Span< float const > hessian)
Push a sorted CSC page.
std::vector< Summary > level
Definition: quantile.h:673
try to do efficient pruning
Definition: quantile.h:368
static size_t CalcMemCost(size_t nentry)
return the number of bytes this data structure cost in serialization
Definition: quantile.h:517
void GetSummary(SummaryContainer *out)
get the summary after finalize
Definition: quantile.h:624
std::vector< std::set< float > > categories_
Definition: quantile.h:709
std::vector< Entry > space
Definition: quantile.h:489
void CheckValid(RType eps=0) const
debug function, check Valid
Definition: quantile.h:50
Quantile sketch use WQSummary.
Definition: quantile.h:686
common::Span< T const > ConstHostSpan() const
Definition: host_device_vector.h:114
void Push(DType x, RType w)
Definition: quantile.h:94
double rmin
statistics used in the sketch
Definition: quantile.h:787
void Print() const
Definition: quantile.h:315
same as summary, but use STL to backup the space
Definition: quantile.h:488
static std::vector< bst_row_t > CalcColumnSize(SparsePage const &page, bst_feature_t const n_columns, size_t const nthreads)
XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
Definition: quantile.h:44
void AllReduce(std::vector< typename WQSketch::SummaryContainer > *p_reduced, std::vector< int32_t > *p_num_cuts)
std::vector< bst_row_t > columns_size_
Definition: quantile.h:712
Definition: quantile.h:864
HostSketchContainer(int32_t max_bins, MetaInfo const &info, std::vector< size_t > columns_size, bool use_group, Span< float const > hessian, int32_t n_threads)
bool use_group_ind_
Definition: quantile.h:714
void Push(DType x, RType w=1)
add an element to a sketch
Definition: quantile.h:577
double sum_total
total sum of amount to be met
Definition: quantile.h:785
Definition: quantile.h:706
void Finalize(unsigned max_size)
push final unfinished value to the sketch
Definition: quantile.h:848
bool operator<(const QEntry &b) const
Definition: quantile.h:85
XGBOOST_DEVICE Entry()
Definition: quantile.h:42
Monitor monitor_
Definition: quantile.h:717
uint32_t bst_feature_t
Type for data column (feature) index.
Definition: base.h:123
void InitLevel(size_t nlevel)
Definition: quantile.h:658
friend std::ostream & operator<<(std::ostream &os, Entry const &e)
Definition: quantile.h:63
uint32_t bst_group_t
Type for ranking group index.
Definition: base.h:134
RType rmin
minimum rank
Definition: quantile.h:34
static uint32_t SearchGroupIndFromRow(std::vector< bst_uint > const &group_ptr, size_t const base_rowid)
Definition: quantile.h:747
static bool CheckLarge(const Entry &e, RType chunk)
Definition: quantile.h:375
RType weight
Definition: quantile.h:78
SortedSketchContainer(int32_t max_bins, MetaInfo const &info, std::vector< size_t > columns_size, bool use_group, Span< float const > hessian, int32_t n_threads)
Definition: quantile.h:869
RType MaxRank() const
Definition: quantile.h:162
void CheckValid(RType eps) const
debug function, validate whether the summary run consistency check to check if it is a valid summary
Definition: quantile.h:200
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:47
an entry in the sketch summary
Definition: quantile.h:32
static constexpr float kFactor
Definition: quantile.h:480
SketchContainerImpl(std::vector< bst_row_t > columns_size, int32_t max_bins, common::Span< FeatureType const > feature_types, bool use_group, int32_t n_threads)
static bool UseGroup(MetaInfo const &info)
Definition: quantile.h:730
QEntry(DType value, RType weight)
Definition: quantile.h:82
common::WXQuantileSketch< bst_float, bst_float > * sketch
Definition: quantile.h:793
TSummary Summary
type of summary type
Definition: quantile.h:484
typename Summary::Entry Entry
the entry type
Definition: quantile.h:486
void SetCombine(const WQSummary &sa, const WQSummary &sb)
set current summary to be merged summary of sa and sb
Definition: quantile.h:252
SummaryContainer(const SummaryContainer &src)
Definition: quantile.h:490
WQSummary(Entry *data, size_t size)
Definition: quantile.h:123
DType value
the value of data
Definition: quantile.h:40
double next_goal
current size of sketch
Definition: quantile.h:791
Quantile structure accepts sorted data, extracted from histmaker.
Definition: quantile.h:783
void MakeCuts(HistogramCuts *cuts)
template for all quantile sketch algorithm that uses merge/prune scheme
Definition: quantile.h:478
void Reduce(const Summary &src, size_t max_nbyte)
do elementwise combination of summary array this[i] = combine(this[i], src[i]) for each i
Definition: quantile.h:509
input data queue before entering the summary
Definition: quantile.h:72
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:148
void SetPrune(const WQSummary< DType, RType > &src, size_t maxsize)
Definition: quantile.h:379
void PushSummary(const Summary &summary)
Definition: quantile.h:594
RType wmin
maximum weight
Definition: quantile.h:38
The input data structure of xgboost.
size_t size
number of elements in the summary
Definition: quantile.h:121
SummaryContainer temp
Definition: quantile.h:677
experimental wsummary
Definition: quantile.h:30
bst_float last_fvalue
last seen feature value
Definition: quantile.h:789
void PushRowPage(SparsePage const &page, MetaInfo const &info, Span< float const > hessian={})
RType MaxError() const
Definition: quantile.h:128
void Init(std::string label)
Definition: timer.h:80
void PushTemp()
push up temp
Definition: quantile.h:601
Entry Query(DType qvalue, size_t &istart) const
query qvalue, start from istart
Definition: quantile.h:141
std::vector< Entry > data
Definition: quantile.h:675
Entry * data
data field
Definition: quantile.h:119
void Push(bst_float fvalue, bst_float w, unsigned max_size)
push a new element to sketch
Definition: quantile.h:807
Quantile sketch use WXQSummary.
Definition: quantile.h:696
Summary::Queue inqueue
Definition: quantile.h:667
XGBOOST_DEVICE RType RMinNext() const
Definition: quantile.h:55
std::vector< WQSketch > sketches_
Definition: quantile.h:708
void SetPrune(const WQSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:216
typename WQSummary< DType, RType >::Entry Entry
Definition: quantile.h:370
Definition: quantile.h:74
SummaryContainer()
Definition: quantile.h:494
void Init(unsigned max_size)
Definition: quantile.h:795
void Load(TStream &fi)
load data structure from input stream
Definition: quantile.h:530
int32_t max_bins_
Definition: quantile.h:713
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:84
DType value
Definition: quantile.h:76
size_t nlevel
Definition: quantile.h:669
static std::vector< bst_feature_t > LoadBalance(SparsePage const &page, bst_feature_t n_columns, size_t const nthreads)
void Init(size_t maxn, double eps)
initialize the quantile sketch, given the performance specification
Definition: quantile.h:544
std::vector< QEntry > queue
Definition: quantile.h:90
bool has_categorical_
Definition: quantile.h:716
namespace of xgboost
Definition: base.h:110
float bst_float
float type, used for storing statistics
Definition: base.h:119