Go to the documentation of this file.
7 #ifndef XGBOOST_COMMON_QUANTILE_H_
8 #define XGBOOST_COMMON_QUANTILE_H_
10 #include <dmlc/base.h>
11 #include <xgboost/logging.h>
28 template<
typename DType,
typename RType>
50 CHECK(
rmin >= 0 &&
rmax >= 0 &&
wmin >= 0) <<
"nonneg constraint";
51 CHECK(
rmax-
rmin -
wmin > -eps) <<
"relation constraint: min/max";
63 os <<
"rmin: " << e.
rmin <<
", "
64 <<
"rmax: " << e.
rmax <<
", "
65 <<
"wmin: " << e.
wmin <<
", "
66 <<
"value: " << e.
value;
93 inline void Push(DType x, RType w) {
106 for (
size_t i = 0; i <
qtail;) {
108 RType w =
queue[i].weight;
110 w +=
queue[j].weight; ++j;
129 for (
size_t i = 1; i <
size; ++i) {
130 res = std::max(
data[i].RMaxPrev() -
data[i - 1].RMinNext(), res);
131 res = std::max(
data[i].rmax -
data[i].rmin -
data[i].wmin, res);
141 while (istart < size && qvalue >
data[istart].value) {
144 if (istart ==
size) {
146 return Entry(rmax, rmax, 0.0f, qvalue);
148 if (qvalue ==
data[istart].value) {
152 return Entry(0.0f, 0.0f, 0.0f, qvalue);
154 return Entry(
data[istart - 1].RMinNext(),
155 data[istart].RMaxPrev(),
170 CHECK_EQ(src.
size, 0);
175 CHECK_EQ(this->size, 0);
176 CHECK_EQ(src.
size, 0);
184 for (
size_t i = 0; i < n;) {
187 for (; j < n && entries[j].
value == entries[i].
value; ++j) {}
188 data[
size++] =
Entry(entries[i].rmin, entries[i].rmax, entries[i].wmin,
200 for (
size_t i = 0; i <
size; ++i) {
203 CHECK(
data[i].rmin >=
data[i - 1].rmin +
data[i - 1].wmin) <<
"rmin range constraint";
204 CHECK(
data[i].rmax >=
data[i - 1].rmax +
data[i].wmin) <<
"rmax range constraint";
216 if (src.
size <= maxsize) {
219 const RType begin = src.
data[0].
rmax;
221 const size_t n = maxsize - 1;
225 size_t i = 1, lastidx = 0;
226 for (
size_t k = 1; k < n; ++k) {
227 RType dx2 = 2 * ((k * range) / n + begin);
229 while (i < src.
size - 1
231 if (i == src.
size - 1)
break;
237 if (i + 1 != lastidx) {
242 if (lastidx != src.
size - 1) {
263 RType aprev_rmin = 0, bprev_rmin = 0;
265 while (a != a_end && b != b_end) {
289 RType brmax = (b_end - 1)->rmax;
293 }
while (a != a_end);
296 RType armax = (a_end - 1)->rmax;
300 }
while (b != b_end);
302 this->size = dst -
data;
303 const RType tol = 10;
304 RType err_mingap, err_maxgap, err_wgap;
305 this->
FixError(&err_mingap, &err_maxgap, &err_wgap);
306 if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
307 LOG(INFO) <<
"mingap=" << err_mingap
308 <<
", maxgap=" << err_maxgap
309 <<
", wgap=" << err_wgap;
315 for (
size_t i = 0; i < this->
size; ++i) {
316 LOG(CONSOLE) <<
"[" << i <<
"] rmin=" <<
data[i].
rmin
326 RType *err_wgap)
const {
330 RType prev_rmin = 0, prev_rmax = 0;
331 for (
size_t i = 0; i < this->
size; ++i) {
332 if (
data[i].rmin < prev_rmin) {
334 *err_mingap = std::max(*err_mingap, prev_rmin -
data[i].rmin);
338 if (
data[i].rmax < prev_rmax) {
340 *err_maxgap = std::max(*err_maxgap, prev_rmax -
data[i].rmax);
343 if (
data[i].rmax < rmin_next) {
345 *err_wgap = std::max(*err_wgap,
data[i].rmax - rmin_next);
351 inline bool Check(
const char *msg)
const {
352 const float tol = 10.0f;
353 for (
size_t i = 0; i < this->
size; ++i) {
355 data[i].rmin < -1e-6f ||
data[i].rmax < -1e-6f) {
356 LOG(INFO) <<
"---------- WQSummary::Check did not pass ----------";
366 template<
typename DType,
typename RType>
375 return e.RMinNext() > e.RMaxPrev() + chunk;
379 if (src.
size <= maxsize) {
382 RType begin = src.
data[0].rmax;
384 size_t n = maxsize - 2, nbig = 0;
386 RType range = src.
data[src.
size - 1].rmin - begin;
388 if (range == 0.0f || maxsize <= 2) {
395 range = std::max(range,
static_cast<RType
>(1e-3f));
399 const RType chunk = 2 * range / n;
406 for (
size_t i = 1; i < src.
size - 1; ++i) {
412 mrange += src.
data[i].RMaxPrev() - src.
data[bid].RMinNext();
417 if (bid != src.
size - 2) {
418 mrange += src.
data[src.
size-1].RMaxPrev() - src.
data[bid].RMinNext();
424 LOG(INFO) <<
" check quantile stats, nbig=" << nbig <<
", n=" << n;
425 LOG(INFO) <<
" srcsize=" << src.
size <<
", maxsize=" << maxsize
426 <<
", range=" << range <<
", chunk=" << chunk;
428 CHECK(nbig < n) <<
"quantile: too many large chunk";
435 size_t bid = 0, k = 1, lastidx = 0;
436 for (
size_t end = 1; end < src.
size; ++end) {
438 if (bid != end - 1) {
440 RType maxdx2 = src.
data[end].RMaxPrev() * 2;
442 RType dx2 = 2 * ((k * mrange) / n + begin);
443 if (dx2 >= maxdx2)
break;
445 dx2 >= src.
data[i + 1].rmax + src.
data[i + 1].rmin) ++i;
447 if (dx2 < src.
data[i].RMinNext() + src.
data[i + 1].RMaxPrev()) {
452 if (i + 1 != lastidx) {
453 this->
data[this->
size++] = src.
data[i + 1]; lastidx = i + 1;
458 if (lastidx != end) {
464 begin += src.
data[bid].RMinNext() - src.
data[bid].RMaxPrev();
476 template<
typename DType,
typename RType,
class TSummary>
490 this->space = src.
space;
491 this->
data = dmlc::BeginPtr(this->space);
497 if (size >
space.size()) {
509 this->
Reserve((max_nbyte -
sizeof(this->size)) /
sizeof(
Entry));
512 temp.SetCombine(*
this, src);
517 return sizeof(size_t) +
sizeof(
Entry) * nentry;
520 template<
typename TStream>
521 inline void Save(TStream &fo)
const {
522 fo.Write(&(this->size),
sizeof(this->size));
523 if (this->size != 0) {
524 fo.Write(this->
data, this->size *
sizeof(
Entry));
528 template<
typename TStream>
529 inline void Load(TStream &fi) {
530 CHECK_EQ(fi.Read(&this->size,
sizeof(this->size)),
sizeof(this->size));
532 if (this->size != 0) {
533 CHECK_EQ(fi.Read(this->data, this->size *
sizeof(
Entry)),
534 this->size *
sizeof(
Entry));
543 inline void Init(
size_t maxn,
double eps) {
553 (
size_t maxn,
double eps,
size_t* out_nlevel,
size_t* out_limit_size) {
554 size_t&
nlevel = *out_nlevel;
560 size_t n = (1ULL <<
nlevel);
565 size_t n = (1ULL <<
nlevel);
566 CHECK(n *
limit_size >= maxn) <<
"invalid init parameter";
567 CHECK(
nlevel <= std::max(
static_cast<size_t>(1),
static_cast<size_t>(
limit_size * eps)))
568 <<
"invalid init parameter";
576 inline void Push(DType x, RType w = 1) {
577 if (w ==
static_cast<RType
>(0))
return;
580 if (
inqueue.queue.size() == 1) {
602 for (
size_t l = 1;
true; ++l) {
605 if (
level[l].size == 0) {
624 if (
level.size() != 0) {
627 out->Reserve(
inqueue.queue.size());
630 if (
level.size() != 0) {
632 for (
size_t l = 1; l <
level.size(); ++l) {
633 if (
level[l].size == 0)
continue;
634 if (
level[0].size == 0) {
641 out->CopyFrom(
level[0]);
652 for (
size_t l = 1; l <
level.size(); ++l) {
653 level[l].CheckValid(eps);
661 for (
size_t l = 0; l <
level.size(); ++l) {
684 template<
typename DType,
typename RType =
unsigned>
694 template<
typename DType,
typename RType =
unsigned>
709 std::vector<WQSketch> sketches_;
710 std::vector<bst_row_t> columns_size_;
712 bool use_group_ind_{
false};
726 size_t const num_groups =
729 bool const use_group_ind =
731 return use_group_ind;
736 size_t const nthreads);
740 size_t const nthreads);
743 size_t const base_rowid) {
744 CHECK_LT(base_rowid, group_ptr.back())
745 <<
"Row: " << base_rowid <<
" is not found in any group.";
747 std::upper_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid) -
748 group_ptr.cbegin() - 1;
752 void GatherSketchInfo(std::vector<WQSketch::SummaryContainer>
const &reduced,
753 std::vector<bst_row_t> *p_worker_segments,
754 std::vector<bst_row_t> *p_sketches_scan,
755 std::vector<WQSketch::Entry> *p_global_sketches);
757 void AllReduce(std::vector<WQSketch::SummaryContainer> *p_reduced,
758 std::vector<int32_t>* p_num_cuts);
767 #endif // XGBOOST_COMMON_QUANTILE_H_
void MakeCuts(HistogramCuts *cuts)
void CopyFrom(const WQSummary &src)
copy content from src
Definition: quantile.h:168
void MakeSummary(WQSummary *out)
Definition: quantile.h:100
void AllReduce(std::vector< WQSketch::SummaryContainer > *p_reduced, std::vector< int32_t > *p_num_cuts)
bool Check(const char *msg) const
Definition: quantile.h:351
static void LimitSizeLevel(size_t maxn, double eps, size_t *out_nlevel, size_t *out_limit_size)
Definition: quantile.h:553
void FixError(RType *err_mingap, RType *err_maxgap, RType *err_wgap) const
Definition: quantile.h:324
void Reserve(size_t size)
reserve space for summary
Definition: quantile.h:496
void CheckValid(RType eps) const
Definition: quantile.h:651
Definition: quantile.h:704
void MakeFromSorted(const Entry *entries, size_t n)
Definition: quantile.h:182
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:243
WXQSummary(Entry *data, size_t size)
Definition: quantile.h:371
void Save(TStream &fo) const
save the data structure into stream
Definition: quantile.h:521
XGBOOST_DEVICE RType RMaxPrev() const
Definition: quantile.h:58
Definition: hist_util.h:37
static uint32_t SearchGroupIndFromRow(std::vector< bst_uint > const &group_ptr, size_t const base_rowid)
Definition: quantile.h:742
size_t limit_size
Definition: quantile.h:670
size_t qtail
Definition: quantile.h:91
RType rmax
maximum rank
Definition: quantile.h:35
std::vector< Summary > level
Definition: quantile.h:672
try to do efficient pruning
Definition: quantile.h:367
static size_t CalcMemCost(size_t nentry)
return the number of bytes this data structure cost in serialization
Definition: quantile.h:516
void GetSummary(SummaryContainer *out)
get the summary after finalize
Definition: quantile.h:623
std::vector< Entry > space
Definition: quantile.h:488
void CheckValid(RType eps=0) const
debug function, check Valid
Definition: quantile.h:49
Quantile sketch use WQSummary.
Definition: quantile.h:685
void Push(DType x, RType w)
Definition: quantile.h:93
void Print() const
Definition: quantile.h:314
same as summary, but use STL to backup the space
Definition: quantile.h:487
XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
Definition: quantile.h:43
static std::vector< bst_row_t > CalcColumnSize(SparsePage const &page, bst_feature_t const n_columns, size_t const nthreads)
void Push(DType x, RType w=1)
add an element to a sketch
Definition: quantile.h:576
bool operator<(const QEntry &b) const
Definition: quantile.h:84
XGBOOST_DEVICE Entry()
Definition: quantile.h:41
uint32_t bst_feature_t
Type for data column (feature) index.
Definition: base.h:123
void InitLevel(size_t nlevel)
Definition: quantile.h:657
friend std::ostream & operator<<(std::ostream &os, Entry const &e)
Definition: quantile.h:62
uint32_t bst_group_t
Type for ranking group index.
Definition: base.h:134
RType rmin
minimum rank
Definition: quantile.h:33
static bool CheckLarge(const Entry &e, RType chunk)
Definition: quantile.h:374
RType weight
Definition: quantile.h:77
RType MaxRank() const
Definition: quantile.h:161
void CheckValid(RType eps) const
debug function, validate whether the summary run consistency check to check if it is a valid summary
Definition: quantile.h:199
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:47
an entry in the sketch summary
Definition: quantile.h:31
static constexpr float kFactor
Definition: quantile.h:479
QEntry(DType value, RType weight)
Definition: quantile.h:81
void GatherSketchInfo(std::vector< WQSketch::SummaryContainer > const &reduced, std::vector< bst_row_t > *p_worker_segments, std::vector< bst_row_t > *p_sketches_scan, std::vector< WQSketch::Entry > *p_global_sketches)
HostSketchContainer(std::vector< bst_row_t > columns_size, int32_t max_bins, bool use_group)
TSummary Summary
type of summary type
Definition: quantile.h:483
typename Summary::Entry Entry
the entry type
Definition: quantile.h:485
void SetCombine(const WQSummary &sa, const WQSummary &sb)
set current summary to be merged summary of sa and sb
Definition: quantile.h:251
SummaryContainer(const SummaryContainer &src)
Definition: quantile.h:489
static bool UseGroup(MetaInfo const &info)
Definition: quantile.h:725
WQSummary(Entry *data, size_t size)
Definition: quantile.h:122
DType value
the value of data
Definition: quantile.h:39
template for all quantile sketch algorithm that uses merge/prune scheme
Definition: quantile.h:477
static std::vector< bst_feature_t > LoadBalance(SparsePage const &page, bst_feature_t n_columns, size_t const nthreads)
void Reduce(const Summary &src, size_t max_nbyte)
do elementwise combination of summary array this[i] = combine(this[i], src[i]) for each i
Definition: quantile.h:508
input data queue before entering the summary
Definition: quantile.h:71
void SetPrune(const WQSummary< DType, RType > &src, size_t maxsize)
Definition: quantile.h:378
void PushSummary(const Summary &summary)
Definition: quantile.h:593
RType wmin
maximum weight
Definition: quantile.h:37
The input data structure of xgboost.
size_t size
number of elements in the summary
Definition: quantile.h:120
SummaryContainer temp
Definition: quantile.h:676
experimental wsummary
Definition: quantile.h:29
RType MaxError() const
Definition: quantile.h:127
void PushRowPage(SparsePage const &page, MetaInfo const &info)
void PushTemp()
push up temp
Definition: quantile.h:600
Entry Query(DType qvalue, size_t &istart) const
query qvalue, start from istart
Definition: quantile.h:140
std::vector< Entry > data
Definition: quantile.h:674
Entry * data
data field
Definition: quantile.h:118
Quantile sketch use WXQSummary.
Definition: quantile.h:695
Summary::Queue inqueue
Definition: quantile.h:666
XGBOOST_DEVICE RType RMinNext() const
Definition: quantile.h:54
void SetPrune(const WQSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:215
typename WQSummary< DType, RType >::Entry Entry
Definition: quantile.h:369
Definition: quantile.h:73
SummaryContainer()
Definition: quantile.h:493
void Load(TStream &fi)
load data structure from input stream
Definition: quantile.h:529
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:84
DType value
Definition: quantile.h:75
size_t nlevel
Definition: quantile.h:668
void Init(size_t maxn, double eps)
initialize the quantile sketch, given the performance specification
Definition: quantile.h:543
std::vector< QEntry > queue
Definition: quantile.h:89
namespace of xgboost
Definition: base.h:110