xgboost
quantile.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_QUANTILE_H_
8 #define XGBOOST_COMMON_QUANTILE_H_
9 
10 #include <dmlc/base.h>
11 #include <xgboost/logging.h>
12 #include <cmath>
13 #include <vector>
14 #include <cstring>
15 #include <algorithm>
16 #include <iostream>
17 
18 namespace xgboost {
19 namespace common {
25 template<typename DType, typename RType>
26 struct WQSummary {
28  struct Entry {
30  RType rmin;
32  RType rmax;
34  RType wmin;
36  DType value;
37  // constructor
38  XGBOOST_DEVICE Entry() {} // NOLINT
39  // constructor
40  XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
41  : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
46  inline void CheckValid(RType eps = 0) const {
47  CHECK(rmin >= 0 && rmax >= 0 && wmin >= 0) << "nonneg constraint";
48  CHECK(rmax- rmin - wmin > -eps) << "relation constraint: min/max";
49  }
51  XGBOOST_DEVICE inline RType RMinNext() const {
52  return rmin + wmin;
53  }
55  XGBOOST_DEVICE inline RType RMaxPrev() const {
56  return rmax - wmin;
57  }
58  };
60  struct Queue {
61  // entry in the queue
62  struct QEntry {
63  // value of the instance
64  DType value;
65  // weight of instance
66  RType weight;
67  // default constructor
68  QEntry() = default;
69  // constructor
70  QEntry(DType value, RType weight)
71  : value(value), weight(weight) {}
72  // comparator on value
73  inline bool operator<(const QEntry &b) const {
74  return value < b.value;
75  }
76  };
77  // the input queue
78  std::vector<QEntry> queue;
79  // end of the queue
80  size_t qtail;
81  // push data to the queue
82  inline void Push(DType x, RType w) {
83  if (qtail == 0 || queue[qtail - 1].value != x) {
84  queue[qtail++] = QEntry(x, w);
85  } else {
86  queue[qtail - 1].weight += w;
87  }
88  }
89  inline void MakeSummary(WQSummary *out) {
90  std::sort(queue.begin(), queue.begin() + qtail);
91  out->size = 0;
92  // start update sketch
93  RType wsum = 0;
94  // construct data with unique weights
95  for (size_t i = 0; i < qtail;) {
96  size_t j = i + 1;
97  RType w = queue[i].weight;
98  while (j < qtail && queue[j].value == queue[i].value) {
99  w += queue[j].weight; ++j;
100  }
101  out->data[out->size++] = Entry(wsum, wsum + w, w, queue[i].value);
102  wsum += w; i = j;
103  }
104  }
105  };
109  size_t size;
110  // constructor
111  WQSummary(Entry *data, size_t size)
112  : data(data), size(size) {}
116  inline RType MaxError() const {
117  RType res = data[0].rmax - data[0].rmin - data[0].wmin;
118  for (size_t i = 1; i < size; ++i) {
119  res = std::max(data[i].RMaxPrev() - data[i - 1].RMinNext(), res);
120  res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
121  }
122  return res;
123  }
129  inline Entry Query(DType qvalue, size_t &istart) const { // NOLINT(*)
130  while (istart < size && qvalue > data[istart].value) {
131  ++istart;
132  }
133  if (istart == size) {
134  RType rmax = data[size - 1].rmax;
135  return Entry(rmax, rmax, 0.0f, qvalue);
136  }
137  if (qvalue == data[istart].value) {
138  return data[istart];
139  } else {
140  if (istart == 0) {
141  return Entry(0.0f, 0.0f, 0.0f, qvalue);
142  } else {
143  return Entry(data[istart - 1].RMinNext(),
144  data[istart].RMaxPrev(),
145  0.0f, qvalue);
146  }
147  }
148  }
150  inline RType MaxRank() const {
151  return data[size - 1].rmax;
152  }
157  inline void CopyFrom(const WQSummary &src) {
158  size = src.size;
159  std::memcpy(data, src.data, sizeof(Entry) * size);
160  }
161  inline void MakeFromSorted(const Entry* entries, size_t n) {
162  size = 0;
163  for (size_t i = 0; i < n;) {
164  size_t j = i + 1;
165  // ignore repeated values
166  for (; j < n && entries[j].value == entries[i].value; ++j) {}
167  data[size++] = Entry(entries[i].rmin, entries[i].rmax, entries[i].wmin,
168  entries[i].value);
169  i = j;
170  }
171  }
178  inline void CheckValid(RType eps) const {
179  for (size_t i = 0; i < size; ++i) {
180  data[i].CheckValid(eps);
181  if (i != 0) {
182  CHECK(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin) << "rmin range constraint";
183  CHECK(data[i].rmax >= data[i - 1].rmax + data[i].wmin) << "rmax range constraint";
184  }
185  }
186  }
194  inline void SetPrune(const WQSummary &src, size_t maxsize) {
195  if (src.size <= maxsize) {
196  this->CopyFrom(src); return;
197  }
198  const RType begin = src.data[0].rmax;
199  const RType range = src.data[src.size - 1].rmin - src.data[0].rmax;
200  const size_t n = maxsize - 1;
201  data[0] = src.data[0];
202  this->size = 1;
203  // lastidx is used to avoid duplicated records
204  size_t i = 1, lastidx = 0;
205  for (size_t k = 1; k < n; ++k) {
206  RType dx2 = 2 * ((k * range) / n + begin);
207  // find first i such that d < (rmax[i+1] + rmin[i+1]) / 2
208  while (i < src.size - 1
209  && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
210  CHECK(i != src.size - 1);
211  if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
212  if (i != lastidx) {
213  data[size++] = src.data[i]; lastidx = i;
214  }
215  } else {
216  if (i + 1 != lastidx) {
217  data[size++] = src.data[i + 1]; lastidx = i + 1;
218  }
219  }
220  }
221  if (lastidx != src.size - 1) {
222  data[size++] = src.data[src.size - 1];
223  }
224  }
230  inline void SetCombine(const WQSummary &sa,
231  const WQSummary &sb) {
232  if (sa.size == 0) {
233  this->CopyFrom(sb); return;
234  }
235  if (sb.size == 0) {
236  this->CopyFrom(sa); return;
237  }
238  CHECK(sa.size > 0 && sb.size > 0);
239  const Entry *a = sa.data, *a_end = sa.data + sa.size;
240  const Entry *b = sb.data, *b_end = sb.data + sb.size;
241  // extended rmin value
242  RType aprev_rmin = 0, bprev_rmin = 0;
243  Entry *dst = this->data;
244  while (a != a_end && b != b_end) {
245  // duplicated value entry
246  if (a->value == b->value) {
247  *dst = Entry(a->rmin + b->rmin,
248  a->rmax + b->rmax,
249  a->wmin + b->wmin, a->value);
250  aprev_rmin = a->RMinNext();
251  bprev_rmin = b->RMinNext();
252  ++dst; ++a; ++b;
253  } else if (a->value < b->value) {
254  *dst = Entry(a->rmin + bprev_rmin,
255  a->rmax + b->RMaxPrev(),
256  a->wmin, a->value);
257  aprev_rmin = a->RMinNext();
258  ++dst; ++a;
259  } else {
260  *dst = Entry(b->rmin + aprev_rmin,
261  b->rmax + a->RMaxPrev(),
262  b->wmin, b->value);
263  bprev_rmin = b->RMinNext();
264  ++dst; ++b;
265  }
266  }
267  if (a != a_end) {
268  RType brmax = (b_end - 1)->rmax;
269  do {
270  *dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
271  ++dst; ++a;
272  } while (a != a_end);
273  }
274  if (b != b_end) {
275  RType armax = (a_end - 1)->rmax;
276  do {
277  *dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
278  ++dst; ++b;
279  } while (b != b_end);
280  }
281  this->size = dst - data;
282  const RType tol = 10;
283  RType err_mingap, err_maxgap, err_wgap;
284  this->FixError(&err_mingap, &err_maxgap, &err_wgap);
285  if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
286  LOG(INFO) << "mingap=" << err_mingap
287  << ", maxgap=" << err_maxgap
288  << ", wgap=" << err_wgap;
289  }
290  CHECK(size <= sa.size + sb.size) << "bug in combine";
291  }
292  // helper function to print the current content of sketch
293  inline void Print() const {
294  for (size_t i = 0; i < this->size; ++i) {
295  LOG(CONSOLE) << "[" << i << "] rmin=" << data[i].rmin
296  << ", rmax=" << data[i].rmax
297  << ", wmin=" << data[i].wmin
298  << ", v=" << data[i].value;
299  }
300  }
301  // try to fix rounding error
302  // and re-establish invariance
303  inline void FixError(RType *err_mingap,
304  RType *err_maxgap,
305  RType *err_wgap) const {
306  *err_mingap = 0;
307  *err_maxgap = 0;
308  *err_wgap = 0;
309  RType prev_rmin = 0, prev_rmax = 0;
310  for (size_t i = 0; i < this->size; ++i) {
311  if (data[i].rmin < prev_rmin) {
312  data[i].rmin = prev_rmin;
313  *err_mingap = std::max(*err_mingap, prev_rmin - data[i].rmin);
314  } else {
315  prev_rmin = data[i].rmin;
316  }
317  if (data[i].rmax < prev_rmax) {
318  data[i].rmax = prev_rmax;
319  *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
320  }
321  RType rmin_next = data[i].RMinNext();
322  if (data[i].rmax < rmin_next) {
323  data[i].rmax = rmin_next;
324  *err_wgap = std::max(*err_wgap, data[i].rmax - rmin_next);
325  }
326  prev_rmax = data[i].rmax;
327  }
328  }
329  // check consistency of the summary
330  inline bool Check(const char *msg) const {
331  const float tol = 10.0f;
332  for (size_t i = 0; i < this->size; ++i) {
333  if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
334  data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
335  LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
336  this->Print();
337  return false;
338  }
339  }
340  return true;
341  }
342 };
343 
345 template<typename DType, typename RType>
346 struct WXQSummary : public WQSummary<DType, RType> {
347  // redefine entry type
349  // constructor
351  : WQSummary<DType, RType>(data, size) {}
352  // check if the block is large chunk
353  inline static bool CheckLarge(const Entry &e, RType chunk) {
354  return e.RMinNext() > e.RMaxPrev() + chunk;
355  }
356  // set prune
357  inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
358  if (src.size <= maxsize) {
359  this->CopyFrom(src); return;
360  }
361  RType begin = src.data[0].rmax;
362  // n is number of points exclude the min/max points
363  size_t n = maxsize - 2, nbig = 0;
364  // these is the range of data exclude the min/max point
365  RType range = src.data[src.size - 1].rmin - begin;
366  // prune off zero weights
367  if (range == 0.0f || maxsize <= 2) {
368  // special case, contain only two effective data pts
369  this->data[0] = src.data[0];
370  this->data[1] = src.data[src.size - 1];
371  this->size = 2;
372  return;
373  } else {
374  range = std::max(range, static_cast<RType>(1e-3f));
375  }
376  // Get a big enough chunk size, bigger than range / n
377  // (multiply by 2 is a safe factor)
378  const RType chunk = 2 * range / n;
379  // minimized range
380  RType mrange = 0;
381  {
382  // first scan, grab all the big chunk
383  // moving block index, exclude the two ends.
384  size_t bid = 0;
385  for (size_t i = 1; i < src.size - 1; ++i) {
386  // detect big chunk data point in the middle
387  // always save these data points.
388  if (CheckLarge(src.data[i], chunk)) {
389  if (bid != i - 1) {
390  // accumulate the range of the rest points
391  mrange += src.data[i].RMaxPrev() - src.data[bid].RMinNext();
392  }
393  bid = i; ++nbig;
394  }
395  }
396  if (bid != src.size - 2) {
397  mrange += src.data[src.size-1].RMaxPrev() - src.data[bid].RMinNext();
398  }
399  }
400  // assert: there cannot be more than n big data points
401  if (nbig >= n) {
402  // see what was the case
403  LOG(INFO) << " check quantile stats, nbig=" << nbig << ", n=" << n;
404  LOG(INFO) << " srcsize=" << src.size << ", maxsize=" << maxsize
405  << ", range=" << range << ", chunk=" << chunk;
406  src.Print();
407  CHECK(nbig < n) << "quantile: too many large chunk";
408  }
409  this->data[0] = src.data[0];
410  this->size = 1;
411  // The counter on the rest of points, to be selected equally from small chunks.
412  n = n - nbig;
413  // find the rest of point
414  size_t bid = 0, k = 1, lastidx = 0;
415  for (size_t end = 1; end < src.size; ++end) {
416  if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
417  if (bid != end - 1) {
418  size_t i = bid;
419  RType maxdx2 = src.data[end].RMaxPrev() * 2;
420  for (; k < n; ++k) {
421  RType dx2 = 2 * ((k * mrange) / n + begin);
422  if (dx2 >= maxdx2) break;
423  while (i < end &&
424  dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
425  if (i == end) break;
426  if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
427  if (i != lastidx) {
428  this->data[this->size++] = src.data[i]; lastidx = i;
429  }
430  } else {
431  if (i + 1 != lastidx) {
432  this->data[this->size++] = src.data[i + 1]; lastidx = i + 1;
433  }
434  }
435  }
436  }
437  if (lastidx != end) {
438  this->data[this->size++] = src.data[end];
439  lastidx = end;
440  }
441  bid = end;
442  // shift base by the gap
443  begin += src.data[bid].RMinNext() - src.data[bid].RMaxPrev();
444  }
445  }
446  }
447 };
451 template<typename DType, typename RType>
452 struct GKSummary {
454  struct Entry {
456  RType rmin;
458  RType rmax;
460  DType value;
461  // constructor
462  Entry() = default;
463  // constructor
464  Entry(RType rmin, RType rmax, DType value)
465  : rmin(rmin), rmax(rmax), value(value) {}
466  };
468  struct Queue {
469  // the input queue
470  std::vector<DType> queue;
471  // end of the queue
472  size_t qtail;
473  // push data to the queue
474  inline void Push(DType x, RType w) {
475  queue[qtail++] = x;
476  }
477  inline void MakeSummary(GKSummary *out) {
478  std::sort(queue.begin(), queue.begin() + qtail);
479  out->size = qtail;
480  for (size_t i = 0; i < qtail; ++i) {
481  out->data[i] = Entry(i + 1, i + 1, queue[i]);
482  }
483  }
484  };
488  size_t size;
489  GKSummary(Entry *data, size_t size)
490  : data(data), size(size) {}
492  inline RType MaxError() const {
493  RType res = 0;
494  for (size_t i = 1; i < size; ++i) {
495  res = std::max(data[i].rmax - data[i-1].rmin, res);
496  }
497  return res;
498  }
500  inline RType MaxRank() const {
501  return data[size - 1].rmax;
502  }
507  inline void CopyFrom(const GKSummary &src) {
508  size = src.size;
509  std::memcpy(data, src.data, sizeof(Entry) * size);
510  }
511  inline void CheckValid(RType eps) const {
512  // assume always valid
513  }
515  inline void Print() const {
516  for (size_t i = 0; i < size; ++i) {
517  LOG(CONSOLE) << "x=" << data[i].value << "\t"
518  << "[" << data[i].rmin << "," << data[i].rmax << "]";
519  }
520  }
527  inline void SetPrune(const GKSummary &src, size_t maxsize) {
528  if (src.size <= maxsize) {
529  this->CopyFrom(src); return;
530  }
531  const RType max_rank = src.MaxRank();
532  this->size = maxsize;
533  data[0] = src.data[0];
534  size_t n = maxsize - 1;
535  RType top = 1;
536  for (size_t i = 1; i < n; ++i) {
537  RType k = (i * max_rank) / n;
538  while (k > src.data[top + 1].rmax) ++top;
539  // assert src.data[top].rmin <= k
540  // because k > src.data[top].rmax >= src.data[top].rmin
541  if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
542  data[i] = src.data[top];
543  } else {
544  data[i] = src.data[top + 1];
545  }
546  }
547  data[n] = src.data[src.size - 1];
548  }
549  inline void SetCombine(const GKSummary &sa,
550  const GKSummary &sb) {
551  if (sa.size == 0) {
552  this->CopyFrom(sb); return;
553  }
554  if (sb.size == 0) {
555  this->CopyFrom(sa); return;
556  }
557  CHECK(sa.size > 0 && sb.size > 0) << "invalid input for merge";
558  const Entry *a = sa.data, *a_end = sa.data + sa.size;
559  const Entry *b = sb.data, *b_end = sb.data + sb.size;
560  this->size = sa.size + sb.size;
561  RType aprev_rmin = 0, bprev_rmin = 0;
562  Entry *dst = this->data;
563  while (a != a_end && b != b_end) {
564  if (a->value < b->value) {
565  *dst = Entry(bprev_rmin + a->rmin,
566  a->rmax + b->rmax - 1, a->value);
567  aprev_rmin = a->rmin;
568  ++dst; ++a;
569  } else {
570  *dst = Entry(aprev_rmin + b->rmin,
571  b->rmax + a->rmax - 1, b->value);
572  bprev_rmin = b->rmin;
573  ++dst; ++b;
574  }
575  }
576  if (a != a_end) {
577  RType bprev_rmax = (b_end - 1)->rmax;
578  do {
579  *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
580  ++dst; ++a;
581  } while (a != a_end);
582  }
583  if (b != b_end) {
584  RType aprev_rmax = (a_end - 1)->rmax;
585  do {
586  *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
587  ++dst; ++b;
588  } while (b != b_end);
589  }
590  CHECK(dst == data + size) << "bug in combine";
591  }
592 };
593 
601 template<typename DType, typename RType, class TSummary>
603  public:
605  using Summary = TSummary;
607  using Entry = typename Summary::Entry;
609  struct SummaryContainer : public Summary {
610  std::vector<Entry> space;
611  SummaryContainer(const SummaryContainer &src) : Summary(nullptr, src.size) {
612  this->space = src.space;
613  this->data = dmlc::BeginPtr(this->space);
614  }
615  SummaryContainer() : Summary(nullptr, 0) {
616  }
618  inline void Reserve(size_t size) {
619  if (size > space.size()) {
620  space.resize(size);
621  this->data = dmlc::BeginPtr(space);
622  }
623  }
629  inline void SetMerge(const Summary *begin,
630  const Summary *end) {
631  CHECK(begin < end) << "can not set combine to empty instance";
632  size_t len = end - begin;
633  if (len == 1) {
634  this->Reserve(begin[0].size);
635  this->CopyFrom(begin[0]);
636  } else if (len == 2) {
637  this->Reserve(begin[0].size + begin[1].size);
638  this->SetMerge(begin[0], begin[1]);
639  } else {
640  // recursive merge
641  SummaryContainer lhs, rhs;
642  lhs.SetCombine(begin, begin + len / 2);
643  rhs.SetCombine(begin + len / 2, end);
644  this->Reserve(lhs.size + rhs.size);
645  this->SetCombine(lhs, rhs);
646  }
647  }
654  inline void Reduce(const Summary &src, size_t max_nbyte) {
655  this->Reserve((max_nbyte - sizeof(this->size)) / sizeof(Entry));
656  SummaryContainer temp;
657  temp.Reserve(this->size + src.size);
658  temp.SetCombine(*this, src);
659  this->SetPrune(temp, space.size());
660  }
662  inline static size_t CalcMemCost(size_t nentry) {
663  return sizeof(size_t) + sizeof(Entry) * nentry;
664  }
666  template<typename TStream>
667  inline void Save(TStream &fo) const { // NOLINT(*)
668  fo.Write(&(this->size), sizeof(this->size));
669  if (this->size != 0) {
670  fo.Write(this->data, this->size * sizeof(Entry));
671  }
672  }
674  template<typename TStream>
675  inline void Load(TStream &fi) { // NOLINT(*)
676  CHECK_EQ(fi.Read(&this->size, sizeof(this->size)), sizeof(this->size));
677  this->Reserve(this->size);
678  if (this->size != 0) {
679  CHECK_EQ(fi.Read(this->data, this->size * sizeof(Entry)),
680  this->size * sizeof(Entry));
681  }
682  }
683  };
689  inline void Init(size_t maxn, double eps) {
690  LimitSizeLevel(maxn, eps, &nlevel, &limit_size);
691  // lazy reserve the space, if there is only one value, no need to allocate space
692  inqueue.queue.resize(1);
693  inqueue.qtail = 0;
694  data.clear();
695  level.clear();
696  }
697 
698  inline static void LimitSizeLevel
699  (size_t maxn, double eps, size_t* out_nlevel, size_t* out_limit_size) {
700  size_t& nlevel = *out_nlevel;
701  size_t& limit_size = *out_limit_size;
702  nlevel = 1;
703  while (true) {
704  limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
705  size_t n = (1ULL << nlevel);
706  if (n * limit_size >= maxn) break;
707  ++nlevel;
708  }
709  // check invariant
710  size_t n = (1ULL << nlevel);
711  CHECK(n * limit_size >= maxn) << "invalid init parameter";
712  CHECK(nlevel <= limit_size * eps) << "invalid init parameter";
713  }
714 
720  inline void Push(DType x, RType w = 1) {
721  if (w == static_cast<RType>(0)) return;
722  if (inqueue.qtail == inqueue.queue.size()) {
723  // jump from lazy one value to limit_size * 2
724  if (inqueue.queue.size() == 1) {
725  inqueue.queue.resize(limit_size * 2);
726  } else {
727  temp.Reserve(limit_size * 2);
728  inqueue.MakeSummary(&temp);
729  // cleanup queue
730  inqueue.qtail = 0;
731  this->PushTemp();
732  }
733  }
734  inqueue.Push(x, w);
735  }
736 
737  inline void PushSummary(const Summary& summary) {
738  temp.Reserve(limit_size * 2);
739  temp.SetPrune(summary, limit_size * 2);
740  PushTemp();
741  }
742 
744  inline void PushTemp() {
745  temp.Reserve(limit_size * 2);
746  for (size_t l = 1; true; ++l) {
747  this->InitLevel(l + 1);
748  // check if level l is empty
749  if (level[l].size == 0) {
750  level[l].SetPrune(temp, limit_size);
751  break;
752  } else {
753  // level 0 is actually temp space
754  level[0].SetPrune(temp, limit_size);
755  temp.SetCombine(level[0], level[l]);
756  if (temp.size > limit_size) {
757  // try next level
758  level[l].size = 0;
759  } else {
760  // if merged record is still smaller, no need to send to next level
761  level[l].CopyFrom(temp); break;
762  }
763  }
764  }
765  }
767  inline void GetSummary(SummaryContainer *out) {
768  if (level.size() != 0) {
769  out->Reserve(limit_size * 2);
770  } else {
771  out->Reserve(inqueue.queue.size());
772  }
773  inqueue.MakeSummary(out);
774  if (level.size() != 0) {
775  level[0].SetPrune(*out, limit_size);
776  for (size_t l = 1; l < level.size(); ++l) {
777  if (level[l].size == 0) continue;
778  if (level[0].size == 0) {
779  level[0].CopyFrom(level[l]);
780  } else {
781  out->SetCombine(level[0], level[l]);
782  level[0].SetPrune(*out, limit_size);
783  }
784  }
785  out->CopyFrom(level[0]);
786  } else {
787  if (out->size > limit_size) {
788  temp.Reserve(limit_size);
789  temp.SetPrune(*out, limit_size);
790  out->CopyFrom(temp);
791  }
792  }
793  }
794  // used for debug, check if the sketch is valid
795  inline void CheckValid(RType eps) const {
796  for (size_t l = 1; l < level.size(); ++l) {
797  level[l].CheckValid(eps);
798  }
799  }
800  // initialize level space to at least nlevel
801  inline void InitLevel(size_t nlevel) {
802  if (level.size() >= nlevel) return;
803  data.resize(limit_size * nlevel);
804  level.resize(nlevel, Summary(nullptr, 0));
805  for (size_t l = 0; l < level.size(); ++l) {
806  level[l].data = dmlc::BeginPtr(data) + l * limit_size;
807  }
808  }
809  // input data queue
810  typename Summary::Queue inqueue;
811  // number of levels
812  size_t nlevel;
813  // size of summary in each level
814  size_t limit_size;
815  // the level of each summaries
816  std::vector<Summary> level;
817  // content of the summary
818  std::vector<Entry> data;
819  // temporal summary, used for temp-merge
820  SummaryContainer temp;
821 };
822 
828 template<typename DType, typename RType = unsigned>
830  public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> > {
831 };
832 
838 template<typename DType, typename RType = unsigned>
840  public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> > {
841 };
847 template<typename DType, typename RType = unsigned>
849  public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> > {
850 };
851 } // namespace common
852 } // namespace xgboost
853 #endif // XGBOOST_COMMON_QUANTILE_H_
input data queue before entering the summary
Definition: quantile.h:468
void MakeSummary(WQSummary *out)
Definition: quantile.h:89
WQSummary(Entry *data, size_t size)
Definition: quantile.h:111
SummaryContainer(const SummaryContainer &src)
Definition: quantile.h:611
void SetCombine(const WQSummary &sa, const WQSummary &sb)
set current summary to be merged summary of sa and sb
Definition: quantile.h:230
RType rmin
minimum rank
Definition: quantile.h:456
typename WQSummary< DType, RType >::Entry Entry
Definition: quantile.h:348
bool operator<(const QEntry &b) const
Definition: quantile.h:73
std::vector< QEntry > queue
Definition: quantile.h:78
size_t qtail
Definition: quantile.h:80
WXQSummary(Entry *data, size_t size)
Definition: quantile.h:350
std::vector< Entry > space
Definition: quantile.h:610
std::vector< DType > queue
Definition: quantile.h:470
size_t nlevel
Definition: quantile.h:812
RType MaxRank() const
Definition: quantile.h:150
void Reserve(size_t size)
reserve space for summary
Definition: quantile.h:618
void Save(TStream &fo) const
save the data structure into stream
Definition: quantile.h:667
SummaryContainer temp
Definition: quantile.h:820
void MakeSummary(GKSummary *out)
Definition: quantile.h:477
template for all quantile sketch algorithm that uses merge/prune scheme
Definition: quantile.h:602
static size_t CalcMemCost(size_t nentry)
return the number of bytes this data structure cost in serialization
Definition: quantile.h:662
experimental wsummary
Definition: quantile.h:26
void CheckValid(RType eps) const
debug function, validate whether the summary run consistency check to check if it is a valid summary ...
Definition: quantile.h:178
void Push(DType x, RType w)
Definition: quantile.h:82
typename Summary::Entry Entry
the entry type
Definition: quantile.h:607
Entry * data
data field
Definition: quantile.h:486
void Push(DType x, RType w=1)
add an element to a sketch
Definition: quantile.h:720
DType value
the value of data
Definition: quantile.h:36
void Push(DType x, RType w)
Definition: quantile.h:474
Quantile sketch use WQSummary.
Definition: quantile.h:848
void Init(size_t maxn, double eps)
initialize the quantile sketch, given the performance specification
Definition: quantile.h:689
RType wmin
maximum weight
Definition: quantile.h:34
void SetMerge(const Summary *begin, const Summary *end)
set the space to be merge of all Summary arrays
Definition: quantile.h:629
bool Check(const char *msg) const
Definition: quantile.h:330
void CopyFrom(const WQSummary &src)
copy content from src
Definition: quantile.h:157
Entry Query(DType qvalue, size_t &istart) const
query qvalue, start from istart
Definition: quantile.h:129
size_t limit_size
Definition: quantile.h:814
void FixError(RType *err_mingap, RType *err_maxgap, RType *err_wgap) const
Definition: quantile.h:303
Quantile sketch use WXQSummary.
Definition: quantile.h:839
size_t size
number of elements in the summary
Definition: quantile.h:109
static bool CheckLarge(const Entry &e, RType chunk)
Definition: quantile.h:353
RType weight
Definition: quantile.h:66
void SetPrune(const WQSummary< DType, RType > &src, size_t maxsize)
Definition: quantile.h:357
XGBOOST_DEVICE RType RMinNext() const
Definition: quantile.h:51
Summary::Queue inqueue
Definition: quantile.h:810
Quantile sketch use WQSummary.
Definition: quantile.h:829
void PushTemp()
push up temp
Definition: quantile.h:744
void SetPrune(const WQSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:194
void Load(TStream &fi)
load data structure from input stream
Definition: quantile.h:675
input data queue before entering the summary
Definition: quantile.h:60
GKSummary(Entry *data, size_t size)
Definition: quantile.h:489
void PushSummary(const Summary &summary)
Definition: quantile.h:737
QEntry(DType value, RType weight)
Definition: quantile.h:70
void Print() const
used for debug purpose, print the summary
Definition: quantile.h:515
Entry(RType rmin, RType rmax, DType value)
Definition: quantile.h:464
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:84
same as summary, but use STL to backup the space
Definition: quantile.h:609
DType value
Definition: quantile.h:64
void InitLevel(size_t nlevel)
Definition: quantile.h:801
RType MaxRank() const
Definition: quantile.h:500
std::vector< Entry > data
Definition: quantile.h:818
RType MaxError() const
Definition: quantile.h:116
namespace of xgboost
Definition: base.h:102
size_t qtail
Definition: quantile.h:472
RType MaxError() const
the maximum error of the summary
Definition: quantile.h:492
void SetPrune(const GKSummary &src, size_t maxsize)
set current summary to be pruned summary of src assume data field is already allocated to be at least...
Definition: quantile.h:527
RType rmax
maximum rank
Definition: quantile.h:32
void GetSummary(SummaryContainer *out)
get the summary after finalize
Definition: quantile.h:767
void MakeFromSorted(const Entry *entries, size_t n)
Definition: quantile.h:161
void CheckValid(RType eps) const
Definition: quantile.h:795
std::vector< Summary > level
Definition: quantile.h:816
void Reduce(const Summary &src, size_t max_nbyte)
do elementwise combination of summary array this[i] = combine(this[i], src[i]) for each i ...
Definition: quantile.h:654
DType value
the value of data
Definition: quantile.h:460
an entry in the sketch summary
Definition: quantile.h:454
XGBOOST_DEVICE RType RMaxPrev() const
Definition: quantile.h:55
void CopyFrom(const GKSummary &src)
copy content from src
Definition: quantile.h:507
void CheckValid(RType eps=0) const
debug function, check Valid
Definition: quantile.h:46
void Print() const
Definition: quantile.h:293
try to do efficient pruning
Definition: quantile.h:346
XGBOOST_DEVICE Entry(RType rmin, RType rmax, RType wmin, DType value)
Definition: quantile.h:40
size_t size
number of elements in the summary
Definition: quantile.h:488
void SetCombine(const GKSummary &sa, const GKSummary &sb)
Definition: quantile.h:549
XGBOOST_DEVICE Entry()
Definition: quantile.h:38
void CheckValid(RType eps) const
Definition: quantile.h:511
Entry * data
data field
Definition: quantile.h:107
an entry in the sketch summary
Definition: quantile.h:28
TSummary Summary
type of summary type
Definition: quantile.h:605
traditional GK summary
Definition: quantile.h:452
RType rmin
minimum rank
Definition: quantile.h:30
RType rmax
maximum rank
Definition: quantile.h:458