7 #ifndef XGBOOST_COMMON_QUANTILE_H_ 8 #define XGBOOST_COMMON_QUANTILE_H_ 10 #include <dmlc/base.h> 11 #include <xgboost/logging.h> 25 template<
typename DType,
typename RType>
41 : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
47 CHECK(rmin >= 0 && rmax >= 0 && wmin >= 0) <<
"nonneg constraint";
48 CHECK(rmax- rmin - wmin > -eps) <<
"relation constraint: min/max";
71 : value(value), weight(weight) {}
74 return value < b.
value;
82 inline void Push(DType x, RType w) {
83 if (qtail == 0 || queue[qtail - 1].
value != x) {
84 queue[qtail++] =
QEntry(x, w);
86 queue[qtail - 1].weight += w;
90 std::sort(queue.begin(), queue.begin() + qtail);
95 for (
size_t i = 0; i < qtail;) {
97 RType w = queue[i].weight;
98 while (j < qtail && queue[j].
value == queue[i].
value) {
99 w += queue[j].weight; ++j;
101 out->
data[out->
size++] =
Entry(wsum, wsum + w, w, queue[i].value);
112 : data(data), size(size) {}
118 for (
size_t i = 1; i <
size; ++i) {
120 res = std::max(data[i].
rmax - data[i].
rmin - data[i].
wmin, res);
130 while (istart < size && qvalue > data[istart].
value) {
133 if (istart == size) {
135 return Entry(rmax, rmax, 0.0f, qvalue);
137 if (qvalue == data[istart].value) {
141 return Entry(0.0f, 0.0f, 0.0f, qvalue);
151 return data[size - 1].
rmax;
159 std::memcpy(data, src.
data,
sizeof(
Entry) * size);
163 for (
size_t i = 0; i < n;) {
166 for (; j < n && entries[j].
value == entries[i].
value; ++j) {}
179 for (
size_t i = 0; i <
size; ++i) {
182 CHECK(data[i].
rmin >= data[i - 1].
rmin + data[i - 1].
wmin) <<
"rmin range constraint";
183 CHECK(data[i].
rmax >= data[i - 1].
rmax + data[i].
wmin) <<
"rmax range constraint";
195 if (src.
size <= maxsize) {
198 const RType begin = src.
data[0].
rmax;
200 const size_t n = maxsize - 1;
201 data[0] = src.
data[0];
204 size_t i = 1, lastidx = 0;
205 for (
size_t k = 1; k < n; ++k) {
206 RType dx2 = 2 * ((k * range) / n + begin);
208 while (i < src.
size - 1
210 CHECK(i != src.
size - 1);
213 data[size++] = src.
data[i]; lastidx = i;
216 if (i + 1 != lastidx) {
217 data[size++] = src.
data[i + 1]; lastidx = i + 1;
221 if (lastidx != src.
size - 1) {
222 data[size++] = src.
data[src.
size - 1];
242 RType aprev_rmin = 0, bprev_rmin = 0;
244 while (a != a_end && b != b_end) {
268 RType brmax = (b_end - 1)->
rmax;
272 }
while (a != a_end);
275 RType armax = (a_end - 1)->
rmax;
279 }
while (b != b_end);
281 this->size = dst -
data;
282 const RType tol = 10;
283 RType err_mingap, err_maxgap, err_wgap;
284 this->
FixError(&err_mingap, &err_maxgap, &err_wgap);
285 if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
286 LOG(INFO) <<
"mingap=" << err_mingap
287 <<
", maxgap=" << err_maxgap
288 <<
", wgap=" << err_wgap;
290 CHECK(size <= sa.
size + sb.
size) <<
"bug in combine";
294 for (
size_t i = 0; i < this->
size; ++i) {
295 LOG(CONSOLE) <<
"[" << i <<
"] rmin=" << data[i].
rmin 296 <<
", rmax=" << data[i].
rmax 297 <<
", wmin=" << data[i].
wmin 298 <<
", v=" << data[i].
value;
305 RType *err_wgap)
const {
309 RType prev_rmin = 0, prev_rmax = 0;
310 for (
size_t i = 0; i < this->
size; ++i) {
311 if (data[i].
rmin < prev_rmin) {
312 data[i].
rmin = prev_rmin;
313 *err_mingap = std::max(*err_mingap, prev_rmin - data[i].
rmin);
315 prev_rmin = data[i].
rmin;
317 if (data[i].
rmax < prev_rmax) {
318 data[i].
rmax = prev_rmax;
319 *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].
rmax);
321 RType rmin_next = data[i].
RMinNext();
322 if (data[i].
rmax < rmin_next) {
323 data[i].
rmax = rmin_next;
324 *err_wgap = std::max(*err_wgap, data[i].
rmax - rmin_next);
326 prev_rmax = data[i].
rmax;
330 inline bool Check(
const char *msg)
const {
331 const float tol = 10.0f;
332 for (
size_t i = 0; i < this->
size; ++i) {
334 data[i].
rmin < -1e-6f || data[i].
rmax < -1e-6f) {
335 LOG(INFO) <<
"---------- WQSummary::Check did not pass ----------";
345 template<
typename DType,
typename RType>
354 return e.RMinNext() > e.RMaxPrev() + chunk;
358 if (src.
size <= maxsize) {
361 RType begin = src.
data[0].rmax;
363 size_t n = maxsize - 2, nbig = 0;
365 RType range = src.
data[src.
size - 1].rmin - begin;
367 if (range == 0.0f || maxsize <= 2) {
374 range = std::max(range, static_cast<RType>(1e-3f));
378 const RType chunk = 2 * range / n;
385 for (
size_t i = 1; i < src.
size - 1; ++i) {
388 if (CheckLarge(src.
data[i], chunk)) {
391 mrange += src.
data[i].RMaxPrev() - src.
data[bid].RMinNext();
396 if (bid != src.
size - 2) {
397 mrange += src.
data[src.
size-1].RMaxPrev() - src.
data[bid].RMinNext();
403 LOG(INFO) <<
" check quantile stats, nbig=" << nbig <<
", n=" << n;
404 LOG(INFO) <<
" srcsize=" << src.
size <<
", maxsize=" << maxsize
405 <<
", range=" << range <<
", chunk=" << chunk;
407 CHECK(nbig < n) <<
"quantile: too many large chunk";
414 size_t bid = 0, k = 1, lastidx = 0;
415 for (
size_t end = 1; end < src.
size; ++end) {
416 if (end == src.
size - 1 || CheckLarge(src.
data[end], chunk)) {
417 if (bid != end - 1) {
419 RType maxdx2 = src.
data[end].RMaxPrev() * 2;
421 RType dx2 = 2 * ((k * mrange) / n + begin);
422 if (dx2 >= maxdx2)
break;
424 dx2 >= src.
data[i + 1].rmax + src.
data[i + 1].rmin) ++i;
426 if (dx2 < src.
data[i].RMinNext() + src.
data[i + 1].RMaxPrev()) {
431 if (i + 1 != lastidx) {
432 this->
data[this->
size++] = src.
data[i + 1]; lastidx = i + 1;
437 if (lastidx != end) {
443 begin += src.
data[bid].RMinNext() - src.
data[bid].RMaxPrev();
451 template<
typename DType,
typename RType>
464 Entry(RType rmin, RType rmax, DType value)
465 : rmin(rmin), rmax(rmax), value(value) {}
474 inline void Push(DType x, RType w) {
478 std::sort(queue.begin(), queue.begin() + qtail);
480 for (
size_t i = 0; i < qtail; ++i) {
481 out->
data[i] =
Entry(i + 1, i + 1, queue[i]);
490 : data(data), size(size) {}
494 for (
size_t i = 1; i <
size; ++i) {
495 res = std::max(data[i].
rmax - data[i-1].
rmin, res);
501 return data[size - 1].
rmax;
509 std::memcpy(data, src.
data,
sizeof(
Entry) * size);
516 for (
size_t i = 0; i <
size; ++i) {
517 LOG(CONSOLE) <<
"x=" << data[i].
value <<
"\t" 518 <<
"[" << data[i].
rmin <<
"," << data[i].
rmax <<
"]";
528 if (src.
size <= maxsize) {
531 const RType max_rank = src.
MaxRank();
532 this->size = maxsize;
533 data[0] = src.
data[0];
534 size_t n = maxsize - 1;
536 for (
size_t i = 1; i < n; ++i) {
537 RType k = (i * max_rank) / n;
538 while (k > src.
data[top + 1].
rmax) ++top;
542 data[i] = src.
data[top];
544 data[i] = src.
data[top + 1];
557 CHECK(sa.
size > 0 && sb.
size > 0) <<
"invalid input for merge";
561 RType aprev_rmin = 0, bprev_rmin = 0;
563 while (a != a_end && b != b_end) {
567 aprev_rmin = a->
rmin;
572 bprev_rmin = b->
rmin;
577 RType bprev_rmax = (b_end - 1)->
rmax;
581 }
while (a != a_end);
584 RType aprev_rmax = (a_end - 1)->
rmax;
588 }
while (b != b_end);
590 CHECK(dst == data + size) <<
"bug in combine";
601 template<
typename DType,
typename RType,
class TSummary>
612 this->space = src.
space;
613 this->
data = dmlc::BeginPtr(this->space);
619 if (size > space.size()) {
621 this->
data = dmlc::BeginPtr(space);
631 CHECK(begin < end) <<
"can not set combine to empty instance";
632 size_t len = end - begin;
634 this->Reserve(begin[0].
size);
636 }
else if (len == 2) {
637 this->Reserve(begin[0].
size + begin[1].
size);
638 this->SetMerge(begin[0], begin[1]);
642 lhs.SetCombine(begin, begin + len / 2);
643 rhs.SetCombine(begin + len / 2, end);
644 this->Reserve(lhs.size + rhs.size);
655 this->Reserve((max_nbyte -
sizeof(this->
size)) /
sizeof(
Entry));
658 temp.SetCombine(*
this, src);
663 return sizeof(size_t) +
sizeof(
Entry) * nentry;
666 template<
typename TStream>
667 inline void Save(TStream &fo)
const {
668 fo.Write(&(this->
size),
sizeof(this->
size));
669 if (this->
size != 0) {
674 template<
typename TStream>
675 inline void Load(TStream &fi) {
676 CHECK_EQ(fi.Read(&this->size,
sizeof(this->size)),
sizeof(this->
size));
677 this->Reserve(this->
size);
678 if (this->
size != 0) {
679 CHECK_EQ(fi.Read(this->data, this->size *
sizeof(
Entry)),
689 inline void Init(
size_t maxn,
double eps) {
690 LimitSizeLevel(maxn, eps, &nlevel, &limit_size);
692 inqueue.queue.resize(1);
698 inline static void LimitSizeLevel
699 (
size_t maxn,
double eps,
size_t* out_nlevel,
size_t* out_limit_size) {
700 size_t& nlevel = *out_nlevel;
701 size_t& limit_size = *out_limit_size;
704 limit_size =
static_cast<size_t>(ceil(nlevel / eps)) + 1;
705 size_t n = (1ULL << nlevel);
706 if (n * limit_size >= maxn)
break;
710 size_t n = (1ULL << nlevel);
711 CHECK(n * limit_size >= maxn) <<
"invalid init parameter";
712 CHECK(nlevel <= limit_size * eps) <<
"invalid init parameter";
720 inline void Push(DType x, RType w = 1) {
721 if (w == static_cast<RType>(0))
return;
722 if (inqueue.qtail == inqueue.queue.size()) {
724 if (inqueue.queue.size() == 1) {
725 inqueue.queue.resize(limit_size * 2);
727 temp.Reserve(limit_size * 2);
728 inqueue.MakeSummary(&temp);
738 temp.Reserve(limit_size * 2);
739 temp.SetPrune(summary, limit_size * 2);
745 temp.Reserve(limit_size * 2);
746 for (
size_t l = 1;
true; ++l) {
747 this->InitLevel(l + 1);
749 if (level[l].
size == 0) {
750 level[l].SetPrune(temp, limit_size);
754 level[0].SetPrune(temp, limit_size);
755 temp.SetCombine(level[0], level[l]);
756 if (temp.size > limit_size) {
761 level[l].CopyFrom(temp);
break;
768 if (level.size() != 0) {
769 out->Reserve(limit_size * 2);
771 out->Reserve(inqueue.queue.size());
773 inqueue.MakeSummary(out);
774 if (level.size() != 0) {
775 level[0].SetPrune(*out, limit_size);
776 for (
size_t l = 1; l < level.size(); ++l) {
777 if (level[l].
size == 0)
continue;
778 if (level[0].
size == 0) {
779 level[0].CopyFrom(level[l]);
781 out->SetCombine(level[0], level[l]);
782 level[0].SetPrune(*out, limit_size);
785 out->CopyFrom(level[0]);
787 if (out->size > limit_size) {
788 temp.Reserve(limit_size);
789 temp.SetPrune(*out, limit_size);
796 for (
size_t l = 1; l < level.size(); ++l) {
797 level[l].CheckValid(eps);
802 if (level.size() >= nlevel)
return;
803 data.resize(limit_size * nlevel);
804 level.resize(nlevel,
Summary(
nullptr, 0));
805 for (
size_t l = 0; l < level.size(); ++l) {
806 level[l].data = dmlc::BeginPtr(
data) + l * limit_size;
828 template<
typename DType,
typename RType =
unsigned>
838 template<
typename DType,
typename RType =
unsigned>
847 template<
typename DType,
typename RType =
unsigned>
853 #endif // XGBOOST_COMMON_QUANTILE_H_ input data queue before entering the summary
Definition: quantile.h:468
void MakeSummary(WQSummary *out)
Definition: quantile.h:89
WQSummary(Entry *data, size_t size)
Definition: quantile.h:111
SummaryContainer(const SummaryContainer &src)
Definition: quantile.h:611
void SetCombine(const WQSummary &sa, const WQSummary &sb)
set current summary to be merged summary of sa and sb
Definition: quantile.h:230
RType rmin
minimum rank
Definition: quantile.h:456
typename WQSummary< DType, RType >::Entry Entry
Definition: quantile.h:348
bool operator<(const QEntry &b) const
Definition: quantile.h:73
std::vector< QEntry > queue
Definition: quantile.h:78
size_t qtail
Definition: quantile.h:80
WXQSummary(Entry *data, size_t size)
Definition: quantile.h:350
std::vector< Entry > space
Definition: quantile.h:610
std::vector< DType > queue
Definition: quantile.h:470
size_t nlevel
Definition: quantile.h:812
RType MaxRank() const
Definition: quantile.h:150
void Reserve(size_t size)
reserve space for summary
Definition: quantile.h:618
void Save(TStream &fo) const
save the data structure into stream
Definition: quantile.h:667
SummaryContainer temp
Definition: quantile.h:820
void MakeSummary(GKSummary *out)
Definition: quantile.h:477
template for all quantile sketch algorithm that uses merge/prune scheme
Definition: quantile.h:602
static size_t CalcMemCost(size_t nentry)
return the number of bytes this data structure cost in serialization
Definition: quantile.h:662
experimental wsummary
Definition: quantile.h:26
void CheckValid(RType eps) const
debug function, validate whether the summary run consistency check to check if it is a valid summary ...
Definition: quantile.h:178
void Push(DType x, RType w)
Definition: quantile.h:82
typename Summary::Entry Entry
the entry type
Definition: quantile.h:607
Entry * data
data field
Definition: quantile.h:486
void Push(DType x, RType w=1)
add an element to a sketch
Definition: quantile.h:720
DType value
the value of data
Definition: quantile.h:36
void Push(DType x, RType w)
Definition: quantile.h:474
Quantile sketch use WQSummary.
Definition: quantile.h:848
void Init(size_t maxn, double eps)
initialize the quantile sketch, given the performance specification
Definition: quantile.h:689
RType wmin
maximum weight
Definition: quantile.h:34
void SetMerge(const Summary *begin, const Summary *end)
set the space to be merge of all Summary arrays
Definition: quantile.h:629
bool Check(const char *msg) const
Definition: quantile.h:330
Definition: quantile.h:62
void CopyFrom(const WQSummary &src)
copy content from src
Definition: quantile.h:157
Entry Query(DType qvalue, size_t &istart) const
query qvalue, start from istart
Definition: quantile.h:129
size_t limit_size
Definition: quantile.h:814
void FixError(RType *err_mingap, RType *err_maxgap, RType *err_wgap) const
Definition: quantile.h:303
Quantile sketch use WXQSummary.
Definition: quantile.h:839
size_t size
number of elements in the summary
Definition: quantile.h:109
static bool CheckLarge(const Entry &e, RType chunk)
Definition: quantile.h:353
RType weight
Definition: quantile.h:66
void SetPrune(const WQSummary< DType, RType > &src, size_t maxsize)
Definition: quantile.h:357
XGBOOST_DEVICE RType RMinNext() const
Definition: quantile.h:51
Summary::Queue inqueue
Definition: quantile.h:810
Quantile sketch use WQSummary.
Definition: quantile.h:829
void PushTemp()
push up temp
Definition: quantile.h:744
void SetPrune(const WQSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:194
void Load(TStream &fi)
load data structure from input stream
Definition: quantile.h:675
input data queue before entering the summary
Definition: quantile.h:60
GKSummary(Entry *data, size_t size)
Definition: quantile.h:489
void PushSummary(const Summary &summary)
Definition: quantile.h:737
QEntry(DType value, RType weight)
Definition: quantile.h:70
void Print() const
used for debug purpose, print the summary
Definition: quantile.h:515
Entry(RType rmin, RType rmax, DType value)
Definition: quantile.h:464
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:75
same as summary, but use STL to backup the space
Definition: quantile.h:609
DType value
Definition: quantile.h:64
void InitLevel(size_t nlevel)
Definition: quantile.h:801
RType MaxRank() const
Definition: quantile.h:500
std::vector< Entry > data
Definition: quantile.h:818
RType MaxError() const
Definition: quantile.h:116
namespace of xgboost
Definition: base.h:79
size_t qtail
Definition: quantile.h:472
RType MaxError() const
the maximum error of the summary
Definition: quantile.h:492
void SetPrune(const GKSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:527
RType rmax
maximum rank
Definition: quantile.h:32
SummaryContainer()
Definition: quantile.h:615
void GetSummary(SummaryContainer *out)
get the summary after finalize
Definition: quantile.h:767
void MakeFromSorted(const Entry *entries, size_t n)
Definition: quantile.h:161
void CheckValid(RType eps) const
Definition: quantile.h:795
std::vector< Summary > level
Definition: quantile.h:816
void Reduce(const Summary &src, size_t max_nbyte)
do elementwise combination of summary array this[i] = combine(this[i], src[i]) for each i ...
Definition: quantile.h:654
DType value
the value of data
Definition: quantile.h:460
an entry in the sketch summary
Definition: quantile.h:454
XGBOOST_DEVICE RType RMaxPrev() const
Definition: quantile.h:55
void CopyFrom(const GKSummary &src)
copy content from src
Definition: quantile.h:507
void CheckValid(RType eps=0) const
debug function, check Valid
Definition: quantile.h:46
void Print() const
Definition: quantile.h:293
try to do efficient pruning
Definition: quantile.h:346
XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
Definition: quantile.h:40
size_t size
number of elements in the summary
Definition: quantile.h:488
void SetCombine(const GKSummary &sa, const GKSummary &sb)
Definition: quantile.h:549
XGBOOST_DEVICE Entry()
Definition: quantile.h:38
void CheckValid(RType eps) const
Definition: quantile.h:511
Entry * data
data field
Definition: quantile.h:107
an entry in the sketch summary
Definition: quantile.h:28
TSummary Summary
type of summary type
Definition: quantile.h:605
traditional GK summary
Definition: quantile.h:452
RType rmin
minimum rank
Definition: quantile.h:30
RType rmax
maximum rank
Definition: quantile.h:458