7 #ifndef XGBOOST_COMMON_QUANTILE_H_ 8 #define XGBOOST_COMMON_QUANTILE_H_ 10 #include <dmlc/base.h> 11 #include <xgboost/logging.h> 25 template<
typename DType,
typename RType>
41 : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
47 CHECK(rmin >= 0 && rmax >= 0 && wmin >= 0) <<
"nonneg constraint";
48 CHECK(rmax- rmin - wmin > -eps) <<
"relation constraint: min/max";
60 os <<
"rmin: " << e.
rmin <<
", " 61 <<
"rmax: " << e.
rmax <<
", " 62 <<
"wmin: " << e.
wmin <<
", " 63 <<
"value: " << e.
value;
79 : value(value), weight(weight) {}
82 return value < b.
value;
90 inline void Push(DType x, RType w) {
91 if (qtail == 0 || queue[qtail - 1].
value != x) {
92 queue[qtail++] =
QEntry(x, w);
94 queue[qtail - 1].weight += w;
98 std::sort(queue.begin(), queue.begin() + qtail);
103 for (
size_t i = 0; i < qtail;) {
105 RType w = queue[i].weight;
106 while (j < qtail && queue[j].
value == queue[i].
value) {
107 w += queue[j].weight; ++j;
109 out->
data[out->
size++] =
Entry(wsum, wsum + w, w, queue[i].value);
120 : data(data), size(size) {}
126 for (
size_t i = 1; i <
size; ++i) {
128 res = std::max(data[i].
rmax - data[i].
rmin - data[i].
wmin, res);
138 while (istart < size && qvalue > data[istart].
value) {
141 if (istart == size) {
143 return Entry(rmax, rmax, 0.0f, qvalue);
145 if (qvalue == data[istart].value) {
149 return Entry(0.0f, 0.0f, 0.0f, qvalue);
159 return data[size - 1].
rmax;
167 std::memcpy(data, src.
data,
sizeof(
Entry) * size);
171 for (
size_t i = 0; i < n;) {
174 for (; j < n && entries[j].
value == entries[i].
value; ++j) {}
187 for (
size_t i = 0; i <
size; ++i) {
190 CHECK(data[i].
rmin >= data[i - 1].
rmin + data[i - 1].
wmin) <<
"rmin range constraint";
191 CHECK(data[i].
rmax >= data[i - 1].
rmax + data[i].
wmin) <<
"rmax range constraint";
203 if (src.
size <= maxsize) {
206 const RType begin = src.
data[0].
rmax;
208 const size_t n = maxsize - 1;
209 data[0] = src.
data[0];
212 size_t i = 1, lastidx = 0;
213 for (
size_t k = 1; k < n; ++k) {
214 RType dx2 = 2 * ((k * range) / n + begin);
216 while (i < src.
size - 1
218 if (i == src.
size - 1)
break;
221 data[size++] = src.
data[i]; lastidx = i;
224 if (i + 1 != lastidx) {
225 data[size++] = src.
data[i + 1]; lastidx = i + 1;
229 if (lastidx != src.
size - 1) {
230 data[size++] = src.
data[src.
size - 1];
250 RType aprev_rmin = 0, bprev_rmin = 0;
252 while (a != a_end && b != b_end) {
276 RType brmax = (b_end - 1)->
rmax;
280 }
while (a != a_end);
283 RType armax = (a_end - 1)->
rmax;
287 }
while (b != b_end);
289 this->size = dst -
data;
290 const RType tol = 10;
291 RType err_mingap, err_maxgap, err_wgap;
292 this->
FixError(&err_mingap, &err_maxgap, &err_wgap);
293 if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
294 LOG(INFO) <<
"mingap=" << err_mingap
295 <<
", maxgap=" << err_maxgap
296 <<
", wgap=" << err_wgap;
298 CHECK(size <= sa.
size + sb.
size) <<
"bug in combine";
302 for (
size_t i = 0; i < this->
size; ++i) {
303 LOG(CONSOLE) <<
"[" << i <<
"] rmin=" << data[i].
rmin 304 <<
", rmax=" << data[i].
rmax 305 <<
", wmin=" << data[i].
wmin 306 <<
", v=" << data[i].
value;
313 RType *err_wgap)
const {
317 RType prev_rmin = 0, prev_rmax = 0;
318 for (
size_t i = 0; i < this->
size; ++i) {
319 if (data[i].
rmin < prev_rmin) {
320 data[i].
rmin = prev_rmin;
321 *err_mingap = std::max(*err_mingap, prev_rmin - data[i].
rmin);
323 prev_rmin = data[i].
rmin;
325 if (data[i].
rmax < prev_rmax) {
326 data[i].
rmax = prev_rmax;
327 *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].
rmax);
329 RType rmin_next = data[i].
RMinNext();
330 if (data[i].
rmax < rmin_next) {
331 data[i].
rmax = rmin_next;
332 *err_wgap = std::max(*err_wgap, data[i].
rmax - rmin_next);
334 prev_rmax = data[i].
rmax;
338 inline bool Check(
const char *msg)
const {
339 const float tol = 10.0f;
340 for (
size_t i = 0; i < this->
size; ++i) {
342 data[i].
rmin < -1e-6f || data[i].
rmax < -1e-6f) {
343 LOG(INFO) <<
"---------- WQSummary::Check did not pass ----------";
353 template<
typename DType,
typename RType>
362 return e.RMinNext() > e.RMaxPrev() + chunk;
366 if (src.
size <= maxsize) {
369 RType begin = src.
data[0].rmax;
371 size_t n = maxsize - 2, nbig = 0;
373 RType range = src.
data[src.
size - 1].rmin - begin;
375 if (range == 0.0f || maxsize <= 2) {
382 range = std::max(range, static_cast<RType>(1e-3f));
386 const RType chunk = 2 * range / n;
393 for (
size_t i = 1; i < src.
size - 1; ++i) {
396 if (CheckLarge(src.
data[i], chunk)) {
399 mrange += src.
data[i].RMaxPrev() - src.
data[bid].RMinNext();
404 if (bid != src.
size - 2) {
405 mrange += src.
data[src.
size-1].RMaxPrev() - src.
data[bid].RMinNext();
411 LOG(INFO) <<
" check quantile stats, nbig=" << nbig <<
", n=" << n;
412 LOG(INFO) <<
" srcsize=" << src.
size <<
", maxsize=" << maxsize
413 <<
", range=" << range <<
", chunk=" << chunk;
415 CHECK(nbig < n) <<
"quantile: too many large chunk";
422 size_t bid = 0, k = 1, lastidx = 0;
423 for (
size_t end = 1; end < src.
size; ++end) {
424 if (end == src.
size - 1 || CheckLarge(src.
data[end], chunk)) {
425 if (bid != end - 1) {
427 RType maxdx2 = src.
data[end].RMaxPrev() * 2;
429 RType dx2 = 2 * ((k * mrange) / n + begin);
430 if (dx2 >= maxdx2)
break;
432 dx2 >= src.
data[i + 1].rmax + src.
data[i + 1].rmin) ++i;
434 if (dx2 < src.
data[i].RMinNext() + src.
data[i + 1].RMaxPrev()) {
439 if (i + 1 != lastidx) {
440 this->
data[this->
size++] = src.
data[i + 1]; lastidx = i + 1;
445 if (lastidx != end) {
451 begin += src.
data[bid].RMinNext() - src.
data[bid].RMaxPrev();
463 template<
typename DType,
typename RType,
class TSummary>
466 static float constexpr kFactor = 8.0;
477 this->space = src.
space;
478 this->
data = dmlc::BeginPtr(this->space);
484 if (size > space.size()) {
486 this->
data = dmlc::BeginPtr(space);
496 this->Reserve((max_nbyte -
sizeof(this->
size)) /
sizeof(
Entry));
499 temp.SetCombine(*
this, src);
504 return sizeof(size_t) +
sizeof(
Entry) * nentry;
507 template<
typename TStream>
508 inline void Save(TStream &fo)
const {
509 fo.Write(&(this->
size),
sizeof(this->
size));
510 if (this->
size != 0) {
515 template<
typename TStream>
516 inline void Load(TStream &fi) {
517 CHECK_EQ(fi.Read(&this->size,
sizeof(this->size)),
sizeof(this->
size));
518 this->Reserve(this->
size);
519 if (this->
size != 0) {
520 CHECK_EQ(fi.Read(this->data, this->size *
sizeof(
Entry)),
530 inline void Init(
size_t maxn,
double eps) {
531 LimitSizeLevel(maxn, eps, &nlevel, &limit_size);
533 inqueue.queue.resize(1);
539 inline static void LimitSizeLevel
540 (
size_t maxn,
double eps,
size_t* out_nlevel,
size_t* out_limit_size) {
541 size_t& nlevel = *out_nlevel;
542 size_t& limit_size = *out_limit_size;
545 limit_size =
static_cast<size_t>(ceil(nlevel / eps)) + 1;
546 limit_size = std::min(maxn, limit_size);
547 size_t n = (1ULL << nlevel);
548 if (n * limit_size >= maxn)
break;
552 size_t n = (1ULL << nlevel);
553 CHECK(n * limit_size >= maxn) <<
"invalid init parameter";
554 CHECK(nlevel <= std::max(static_cast<size_t>(1), static_cast<size_t>(limit_size * eps)))
555 <<
"invalid init parameter";
563 inline void Push(DType x, RType w = 1) {
564 if (w == static_cast<RType>(0))
return;
565 if (inqueue.qtail == inqueue.queue.size()) {
567 if (inqueue.queue.size() == 1) {
568 inqueue.queue.resize(limit_size * 2);
570 temp.Reserve(limit_size * 2);
571 inqueue.MakeSummary(&temp);
581 temp.Reserve(limit_size * 2);
582 temp.SetPrune(summary, limit_size * 2);
588 temp.Reserve(limit_size * 2);
589 for (
size_t l = 1;
true; ++l) {
590 this->InitLevel(l + 1);
592 if (level[l].
size == 0) {
593 level[l].SetPrune(temp, limit_size);
597 level[0].SetPrune(temp, limit_size);
598 temp.SetCombine(level[0], level[l]);
599 if (temp.size > limit_size) {
604 level[l].CopyFrom(temp);
break;
611 if (level.size() != 0) {
612 out->Reserve(limit_size * 2);
614 out->Reserve(inqueue.queue.size());
616 inqueue.MakeSummary(out);
617 if (level.size() != 0) {
618 level[0].SetPrune(*out, limit_size);
619 for (
size_t l = 1; l < level.size(); ++l) {
620 if (level[l].
size == 0)
continue;
621 if (level[0].
size == 0) {
622 level[0].CopyFrom(level[l]);
624 out->SetCombine(level[0], level[l]);
625 level[0].SetPrune(*out, limit_size);
628 out->CopyFrom(level[0]);
630 if (out->size > limit_size) {
631 temp.Reserve(limit_size);
632 temp.SetPrune(*out, limit_size);
639 for (
size_t l = 1; l < level.size(); ++l) {
640 level[l].CheckValid(eps);
645 if (level.size() >= nlevel)
return;
646 data.resize(limit_size * nlevel);
647 level.resize(nlevel,
Summary(
nullptr, 0));
648 for (
size_t l = 0; l < level.size(); ++l) {
649 level[l].data = dmlc::BeginPtr(
data) + l * limit_size;
671 template<
typename DType,
typename RType =
unsigned>
681 template<
typename DType,
typename RType =
unsigned>
687 #endif // XGBOOST_COMMON_QUANTILE_H_ void MakeSummary(WQSummary *out)
Definition: quantile.h:97
WQSummary(Entry *data, size_t size)
Definition: quantile.h:119
SummaryContainer(const SummaryContainer &src)
Definition: quantile.h:476
void SetCombine(const WQSummary &sa, const WQSummary &sb)
set current summary to be merged summary of sa and sb
Definition: quantile.h:238
typename WQSummary< DType, RType >::Entry Entry
Definition: quantile.h:356
bool operator<(const QEntry &b) const
Definition: quantile.h:81
std::vector< QEntry > queue
Definition: quantile.h:86
size_t qtail
Definition: quantile.h:88
WXQSummary(Entry *data, size_t size)
Definition: quantile.h:358
std::vector< Entry > space
Definition: quantile.h:475
size_t nlevel
Definition: quantile.h:655
RType MaxRank() const
Definition: quantile.h:158
void Reserve(size_t size)
reserve space for summary
Definition: quantile.h:483
void Save(TStream &fo) const
save the data structure into stream
Definition: quantile.h:508
SummaryContainer temp
Definition: quantile.h:663
template for all quantile sketch algorithm that uses merge/prune scheme
Definition: quantile.h:464
static size_t CalcMemCost(size_t nentry)
return the number of bytes this data structure cost in serialization
Definition: quantile.h:503
experimental wsummary
Definition: quantile.h:26
void CheckValid(RType eps) const
debug function, validate whether the summary run consistency check to check if it is a valid summary ...
Definition: quantile.h:186
void Push(DType x, RType w)
Definition: quantile.h:90
typename Summary::Entry Entry
the entry type
Definition: quantile.h:472
void Push(DType x, RType w=1)
add an element to a sketch
Definition: quantile.h:563
DType value
the value of data
Definition: quantile.h:36
void Init(size_t maxn, double eps)
initialize the quantile sketch, given the performance specification
Definition: quantile.h:530
RType wmin
maximum weight
Definition: quantile.h:34
bool Check(const char *msg) const
Definition: quantile.h:338
Definition: quantile.h:70
void CopyFrom(const WQSummary &src)
copy content from src
Definition: quantile.h:165
Entry Query(DType qvalue, size_t &istart) const
query qvalue, start from istart
Definition: quantile.h:137
size_t limit_size
Definition: quantile.h:657
void FixError(RType *err_mingap, RType *err_maxgap, RType *err_wgap) const
Definition: quantile.h:311
Quantile sketch use WXQSummary.
Definition: quantile.h:682
size_t size
number of elements in the summary
Definition: quantile.h:117
static bool CheckLarge(const Entry &e, RType chunk)
Definition: quantile.h:361
RType weight
Definition: quantile.h:74
void SetPrune(const WQSummary< DType, RType > &src, size_t maxsize)
Definition: quantile.h:365
XGBOOST_DEVICE RType RMinNext() const
Definition: quantile.h:51
Summary::Queue inqueue
Definition: quantile.h:653
Quantile sketch use WQSummary.
Definition: quantile.h:672
void PushTemp()
push up temp
Definition: quantile.h:587
void SetPrune(const WQSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:202
void Load(TStream &fi)
load data structure from input stream
Definition: quantile.h:516
input data queue before entering the summary
Definition: quantile.h:68
void PushSummary(const Summary &summary)
Definition: quantile.h:580
QEntry(DType value, RType weight)
Definition: quantile.h:78
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:84
same as summary, but use STL to backup the space
Definition: quantile.h:474
DType value
Definition: quantile.h:72
void InitLevel(size_t nlevel)
Definition: quantile.h:644
std::vector< Entry > data
Definition: quantile.h:661
RType MaxError() const
Definition: quantile.h:124
namespace of xgboost
Definition: base.h:102
RType rmax
maximum rank
Definition: quantile.h:32
SummaryContainer()
Definition: quantile.h:480
void GetSummary(SummaryContainer *out)
get the summary after finalize
Definition: quantile.h:610
void MakeFromSorted(const Entry *entries, size_t n)
Definition: quantile.h:169
void CheckValid(RType eps) const
Definition: quantile.h:638
std::vector< Summary > level
Definition: quantile.h:659
void Reduce(const Summary &src, size_t max_nbyte)
do elementwise combination of summary array this[i] = combine(this[i], src[i]) for each i ...
Definition: quantile.h:495
XGBOOST_DEVICE RType RMaxPrev() const
Definition: quantile.h:55
void CheckValid(RType eps=0) const
debug function, check Valid
Definition: quantile.h:46
void Print() const
Definition: quantile.h:301
try to do efficient pruning
Definition: quantile.h:354
XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
Definition: quantile.h:40
friend std::ostream & operator<<(std::ostream &os, Entry const &e)
Definition: quantile.h:59
XGBOOST_DEVICE Entry()
Definition: quantile.h:38
Entry * data
data field
Definition: quantile.h:115
an entry in the sketch summary
Definition: quantile.h:28
TSummary Summary
type of summary type
Definition: quantile.h:470
RType rmin
minimum rank
Definition: quantile.h:30