xgboost
data.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_DATA_H_
8 #define XGBOOST_DATA_H_
9 
10 #include <dmlc/base.h>
11 #include <dmlc/data.h>
12 #include <dmlc/serializer.h>
13 #include <xgboost/base.h>
15 #include <xgboost/linalg.h>
16 #include <xgboost/span.h>
17 #include <xgboost/string_view.h>
18 
19 #include <algorithm>
20 #include <limits>
21 #include <memory>
22 #include <numeric>
23 #include <string>
24 #include <utility>
25 #include <vector>
26 
27 namespace xgboost {
28 // forward declare dmatrix.
29 class DMatrix;
30 struct Context;
31 
// Data types accepted by the xgboost interface.
// The enumerators carry explicit values starting at 1 and are stored in a uint8_t.
33 enum class DataType : uint8_t {
34  kFloat32 = 1,
35  kDouble = 2,
36  kUInt32 = 3,
37  kUInt64 = 4,
38  kStr = 5
39 };
40 
// Kind of a single feature (column): numerical or categorical.
41 enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
42 
// How the data is split: row-wise (kRow) or column-wise (kCol).
// kRow is the default used by DMatrix::Load and DMatrix::Create.
43 enum class DataSplitMode : int { kRow = 0, kCol = 1 };
44 
48 class MetaInfo {
49  public:
51  static constexpr uint64_t kNumField = 12;
52 
54  uint64_t num_row_{0}; // NOLINT
56  uint64_t num_col_{0}; // NOLINT
58  uint64_t num_nonzero_{0}; // NOLINT
67  std::vector<bst_group_t> group_ptr_; // NOLINT
84 
88  std::vector<std::string> feature_type_names;
92  std::vector<std::string> feature_names;
93  /*
94  * \brief Type of each feature. Automatically set when feature_type_names is specifed.
95  */
97  /*
98  * \brief Weight of each feature, used to define the probability of each feature being
99  * selected when using column sampling.
100  */
102 
104  MetaInfo() = default;
105  MetaInfo(MetaInfo&& that) = default;
106  MetaInfo& operator=(MetaInfo&& that) = default;
107  MetaInfo& operator=(MetaInfo const& that) = delete;
108 
112  void Validate(int32_t device) const;
113 
115 
116  MetaInfo Copy() const;
117 
123  inline bst_float GetWeight(size_t i) const {
124  return weights_.Size() != 0 ? weights_.HostVector()[i] : 1.0f;
125  }
127  const std::vector<size_t>& LabelAbsSort(Context const* ctx) const;
129  void Clear();
134  void LoadBinary(dmlc::Stream* fi);
139  void SaveBinary(dmlc::Stream* fo) const;
147  void SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype, size_t num);
153  void SetInfo(Context const& ctx, StringView key, StringView interface_str);
154 
155  void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
156  const void** out_dptr) const;
157 
158  void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
159  void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;
160 
161  /*
162  * \brief Extend with other MetaInfo.
163  *
164  * \param that The other MetaInfo object.
165  *
166  * \param accumulate_rows Whether rows need to be accumulated in this function. If
167  * client code knows number of rows in advance, set this
168  * parameter to false.
169  * \param check_column Whether the extend method should check the consistency of
170  * columns.
171  */
172  void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);
173 
182 
184  bool IsRowSplit() const {
186  }
187 
191  bool IsRanking() const { return !group_ptr_.empty(); }
192 
197  bool IsVerticalFederated() const;
198 
205  bool ShouldHaveLabels() const;
206 
207  private:
208  void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
209  void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
210 
212  mutable std::vector<size_t> label_order_cache_;
213 };
214 
216 struct Entry {
222  Entry() = default;
230  inline static bool CmpValue(const Entry& a, const Entry& b) {
231  return a.fvalue < b.fvalue;
232  }
233  static bool CmpIndex(Entry const& a, Entry const& b) {
234  return a.index < b.index;
235  }
236  inline bool operator==(const Entry& other) const {
237  return (this->index == other.index && this->fvalue == other.fvalue);
238  }
239 };
240 
244 struct BatchParam {
257  bool regen{false};
261  bool forbid_regen{false};
265  double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
266 
270  BatchParam() = default;
283  : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
284 
285  [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
286  // Check non-floating parameters.
287  bool cond = max_bin != other.max_bin;
288  // Check sparse thresh.
289  bool l_nan = std::isnan(sparse_thresh);
290  bool r_nan = std::isnan(other.sparse_thresh);
291  bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
292  cond |= st_chg;
293 
294  return cond;
295  }
296  [[nodiscard]] bool Initialized() const { return max_bin != 0; }
300  [[nodiscard]] BatchParam MakeCache() const {
301  auto p = *this;
302  // These parameters have nothing to do with how the gradient index was generated in the
303  // first place.
304  p.regen = false;
305  p.forbid_regen = false;
306  return p;
307  }
308 };
309 
312 
315 
316  Inst operator[](size_t i) const {
317  auto size = *(offset.data() + i + 1) - *(offset.data() + i);
318  return {data.data() + *(offset.data() + i),
319  static_cast<Inst::index_type>(size)};
320  }
321 
322  [[nodiscard]] size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
323 };
324 
328 class SparsePage {
329  public:
330  // Offset for each row.
334 
335  size_t base_rowid {0};
336 
339 
340  [[nodiscard]] HostSparsePageView GetView() const {
341  return {offset.ConstHostSpan(), data.ConstHostSpan()};
342  }
343 
346  this->Clear();
347  }
348 
349  SparsePage(SparsePage const& that) = delete;
350  SparsePage(SparsePage&& that) = default;
351  SparsePage& operator=(SparsePage const& that) = delete;
352  SparsePage& operator=(SparsePage&& that) = default;
353  virtual ~SparsePage() = default;
354 
356  [[nodiscard]] size_t Size() const {
357  return offset.Size() == 0 ? 0 : offset.Size() - 1;
358  }
359 
361  [[nodiscard]] size_t MemCostBytes() const {
362  return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
363  }
364 
366  inline void Clear() {
367  base_rowid = 0;
368  auto& offset_vec = offset.HostVector();
369  offset_vec.clear();
370  offset_vec.push_back(0);
371  data.HostVector().clear();
372  }
373 
375  inline void SetBaseRowId(size_t row_id) {
376  base_rowid = row_id;
377  }
378 
379  [[nodiscard]] SparsePage GetTranspose(int num_columns, int32_t n_threads) const;
380 
384  void SortIndices(int32_t n_threads);
388  [[nodiscard]] bool IsIndicesSorted(int32_t n_threads) const;
392  void Reindex(uint64_t feature_offset, int32_t n_threads);
393 
394  void SortRows(int32_t n_threads);
395 
406  template <typename AdapterBatchT>
407  uint64_t Push(const AdapterBatchT& batch, float missing, int nthread);
408 
413  void Push(const SparsePage &batch);
418  void PushCSC(const SparsePage& batch);
419 };
420 
421 class CSCPage: public SparsePage {
422  public:
424  explicit CSCPage(SparsePage page) : SparsePage(std::move(page)) {}
425 };
426 
432  public:
433  std::shared_ptr<SparsePage const> page;
434  explicit ExtSparsePage(std::shared_ptr<SparsePage const> p) : page{std::move(p)} {}
435 };
436 
437 class SortedCSCPage : public SparsePage {
438  public:
440  explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
441 };
442 
443 class EllpackPage;
444 class GHistIndexMatrix;
445 
/*!
 * \brief Abstract interface behind BatchIterator.
 *
 * \tparam T Page type produced by the iterator.
 */
template <typename T>
class BatchIteratorImpl {
 public:
  using iterator_category = std::forward_iterator_tag;  // NOLINT
  virtual ~BatchIteratorImpl() = default;
  /*! \brief Access the current page. */
  virtual const T& operator*() const = 0;
  /*! \brief Advance to the next page. */
  virtual BatchIteratorImpl& operator++() = 0;
  /*! \brief Whether the iteration is exhausted. */
  [[nodiscard]] virtual bool AtEnd() const = 0;
  /*! \brief Shared handle to the current page. */
  virtual std::shared_ptr<T const> Page() const = 0;
};
456 
457 template<typename T>
459  public:
460  using iterator_category = std::forward_iterator_tag; // NOLINT
461  explicit BatchIterator(BatchIteratorImpl<T>* impl) { impl_.reset(impl); }
462  explicit BatchIterator(std::shared_ptr<BatchIteratorImpl<T>> impl) { impl_ = impl; }
463 
465  CHECK(impl_ != nullptr);
466  ++(*impl_);
467  return *this;
468  }
469 
470  const T& operator*() const {
471  CHECK(impl_ != nullptr);
472  return *(*impl_);
473  }
474 
475  bool operator!=(const BatchIterator&) const {
476  CHECK(impl_ != nullptr);
477  return !impl_->AtEnd();
478  }
479 
480  [[nodiscard]] bool AtEnd() const {
481  CHECK(impl_ != nullptr);
482  return impl_->AtEnd();
483  }
484 
485  [[nodiscard]] std::shared_ptr<T const> Page() const {
486  return impl_->Page();
487  }
488 
489  private:
490  std::shared_ptr<BatchIteratorImpl<T>> impl_;
491 };
492 
493 template<typename T>
494 class BatchSet {
495  public:
496  explicit BatchSet(BatchIterator<T> begin_iter) : begin_iter_(std::move(begin_iter)) {}
497  BatchIterator<T> begin() { return begin_iter_; } // NOLINT
498  BatchIterator<T> end() { return BatchIterator<T>(nullptr); } // NOLINT
499 
500  private:
501  BatchIterator<T> begin_iter_;
502 };
503 
504 struct XGBAPIThreadLocalEntry;
505 
509 class DMatrix {
510  public:
512  DMatrix() = default;
514  virtual MetaInfo& Info() = 0;
515  virtual void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
516  auto const& ctx = *this->Ctx();
517  this->Info().SetInfo(ctx, key, dptr, dtype, num);
518  }
519  virtual void SetInfo(const char* key, std::string const& interface_str) {
520  auto const& ctx = *this->Ctx();
521  this->Info().SetInfo(ctx, key, StringView{interface_str});
522  }
524  [[nodiscard]] virtual const MetaInfo& Info() const = 0;
525 
527  [[nodiscard]] XGBAPIThreadLocalEntry& GetThreadLocal() const;
532  [[nodiscard]] virtual Context const* Ctx() const = 0;
533 
537  template <typename T>
539  template <typename T>
541  template <typename T>
542  BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
543  template <typename T>
544  [[nodiscard]] bool PageExists() const;
545 
546  // the following are column meta data, should be able to answer them fast.
548  [[nodiscard]] virtual bool SingleColBlock() const = 0;
550  virtual ~DMatrix();
551 
553  [[nodiscard]] bool IsDense() const {
554  return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
555  }
556 
566  static DMatrix* Load(const std::string& uri, bool silent = true,
567  DataSplitMode data_split_mode = DataSplitMode::kRow);
568 
581  template <typename AdapterT>
582  static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
583  const std::string& cache_prefix = "",
584  DataSplitMode data_split_mode = DataSplitMode::kRow);
585 
605  template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
606  typename XGDMatrixCallbackNext>
607  static DMatrix* Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr<DMatrix> ref,
608  DataIterResetCallback* reset, XGDMatrixCallbackNext* next, float missing,
609  int nthread, bst_bin_t max_bin);
610 
629  template <typename DataIterHandle, typename DMatrixHandle,
630  typename DataIterResetCallback, typename XGDMatrixCallbackNext>
632  DataIterResetCallback *reset,
633  XGDMatrixCallbackNext *next, float missing,
634  int32_t nthread, std::string cache);
635 
637 
645  virtual DMatrix *SliceCol(int num_slices, int slice_id) = 0;
646 
647  protected:
649  virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
651  virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
653  BatchParam const& param) = 0;
654  virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
655 
656  [[nodiscard]] virtual bool EllpackExists() const = 0;
657  [[nodiscard]] virtual bool GHistIndexExists() const = 0;
658  [[nodiscard]] virtual bool SparsePageExists() const = 0;
659 };
660 
661 template <>
663  return GetRowBatches();
664 }
665 
666 template <>
667 inline bool DMatrix::PageExists<EllpackPage>() const {
668  return this->EllpackExists();
669 }
670 
671 template <>
672 inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
673  return this->GHistIndexExists();
674 }
675 
676 template <>
677 inline bool DMatrix::PageExists<SparsePage>() const {
678  return this->SparsePageExists();
679 }
680 
681 template <>
683  return GetRowBatches();
684 }
685 
686 template <>
687 inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
688  return GetColumnBatches(ctx);
689 }
690 
691 template <>
692 inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
693  return GetSortedColumnBatches(ctx);
694 }
695 
696 template <>
698  return GetEllpackBatches(ctx, param);
699 }
700 
701 template <>
702 inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
703  return GetGradientIndex(ctx, param);
704 }
705 
706 template <>
707 inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
708  return GetExtBatches(ctx, param);
709 }
710 } // namespace xgboost
711 
713 
714 namespace dmlc {
716 
717 namespace serializer {
718 
719 template <>
720 struct Handler<xgboost::Entry> {
721  inline static void Write(Stream* strm, const xgboost::Entry& data) {
722  strm->Write(data.index);
723  strm->Write(data.fvalue);
724  }
725 
726  inline static bool Read(Stream* strm, xgboost::Entry* data) {
727  return strm->Read(&data->index) && strm->Read(&data->fvalue);
728  }
729 };
730 
731 } // namespace serializer
732 } // namespace dmlc
733 #endif // XGBOOST_DATA_H_
Defines configuration macros and basic types for xgboost.
#define XGBOOST_DEVICE
Tag function as usable by device.
Definition: base.h:64
Definition: data.h:447
virtual BatchIteratorImpl & operator++()=0
std::forward_iterator_tag iterator_category
Definition: data.h:449
virtual std::shared_ptr< T const > Page() const =0
virtual bool AtEnd() const =0
virtual const T & operator*() const =0
virtual ~BatchIteratorImpl()=default
Definition: data.h:458
BatchIterator(std::shared_ptr< BatchIteratorImpl< T >> impl)
Definition: data.h:462
std::forward_iterator_tag iterator_category
Definition: data.h:460
BatchIterator(BatchIteratorImpl< T > *impl)
Definition: data.h:461
const T & operator*() const
Definition: data.h:470
std::shared_ptr< T const > Page() const
Definition: data.h:485
BatchIterator & operator++()
Definition: data.h:464
bool operator!=(const BatchIterator &) const
Definition: data.h:475
bool AtEnd() const
Definition: data.h:480
Definition: data.h:494
BatchSet(BatchIterator< T > begin_iter)
Definition: data.h:496
BatchIterator< T > begin()
Definition: data.h:497
BatchIterator< T > end()
Definition: data.h:498
Definition: data.h:421
CSCPage()
Definition: data.h:423
CSCPage(SparsePage page)
Definition: data.h:424
Internal data structure used by XGBoost during training.
Definition: data.h:509
virtual BatchSet< EllpackPage > GetEllpackBatches(Context const *ctx, BatchParam const &param)=0
static DMatrix * Load(const std::string &uri, bool silent=true, DataSplitMode data_split_mode=DataSplitMode::kRow)
Load DMatrix from URI.
virtual BatchSet< SparsePage > GetRowBatches()=0
virtual BatchSet< GHistIndexMatrix > GetGradientIndex(Context const *ctx, BatchParam const &param)=0
virtual void SetInfo(const char *key, std::string const &interface_str)
Definition: data.h:519
static DMatrix * Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr< DMatrix > ref, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int nthread, bst_bin_t max_bin)
Create a new Quantile based DMatrix used for histogram based algorithm.
virtual BatchSet< ExtSparsePage > GetExtBatches(Context const *ctx, BatchParam const &param)=0
bool PageExists() const
BatchSet< T > GetBatches(Context const *ctx)
virtual ~DMatrix()
virtual destructor
virtual MetaInfo & Info()=0
meta information of the dataset
virtual void SetInfo(const char *key, const void *dptr, DataType dtype, size_t num)
Definition: data.h:515
static DMatrix * Create(AdapterT *adapter, float missing, int nthread, const std::string &cache_prefix="", DataSplitMode data_split_mode=DataSplitMode::kRow)
Creates a new DMatrix from an external data adapter.
virtual DMatrix * SliceCol(int num_slices, int slice_id)=0
Slice a DMatrix by columns.
virtual bool GHistIndexExists() const =0
XGBAPIThreadLocalEntry & GetThreadLocal() const
Get thread local memory for returning data from DMatrix.
virtual bool SparsePageExists() const =0
static DMatrix * Create(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int32_t nthread, std::string cache)
Create an external memory DMatrix with callbacks.
virtual DMatrix * Slice(common::Span< int32_t const > ridxs)=0
virtual Context const * Ctx() const =0
Get the context object of this DMatrix. The context is created during construction of DMatrix with us...
virtual bool SingleColBlock() const =0
BatchSet< T > GetBatches()
Gets batches. Use range based for loop over BatchSet to access individual batches.
virtual const MetaInfo & Info() const =0
meta information of the dataset
virtual bool EllpackExists() const =0
virtual BatchSet< CSCPage > GetColumnBatches(Context const *ctx)=0
virtual BatchSet< SortedCSCPage > GetSortedColumnBatches(Context const *ctx)=0
BatchSet< T > GetBatches(Context const *ctx, const BatchParam &param)
bool IsDense() const
Whether the matrix is dense.
Definition: data.h:553
DMatrix()=default
default constructor
Sparse page for exporting DMatrix. Same as SparsePage, just a different type to prevent being used in...
Definition: data.h:431
ExtSparsePage(std::shared_ptr< SparsePage const > p)
Definition: data.h:434
std::shared_ptr< SparsePage const > page
Definition: data.h:433
common::Span< T const > ConstHostSpan() const
Definition: host_device_vector.h:115
std::vector< T > & HostVector()
Data structure representing JSON format.
Definition: json.h:357
Meta information about the dataset; always kept in memory.
Definition: data.h:48
linalg::Tensor< float, 2 > base_margin_
initialized margins, if specified, xgboost will start from this init margin can be used to specify in...
Definition: data.h:75
std::vector< std::string > feature_names
Name for each feature.
Definition: data.h:92
MetaInfo(MetaInfo &&that)=default
HostDeviceVector< bst_float > labels_upper_bound_
upper bound of the label, to be used for survival analysis (censored regression)
Definition: data.h:83
void Validate(int32_t device) const
Validate all metainfo.
uint64_t num_col_
number of columns in the data
Definition: data.h:56
std::vector< std::string > feature_type_names
Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q".
Definition: data.h:88
HostDeviceVector< bst_float > weights_
weights of each instance, optional
Definition: data.h:69
bool IsVerticalFederated() const
A convenient method to check if we are doing vertical federated learning, which requires some special...
MetaInfo & operator=(MetaInfo const &that)=delete
void GetInfo(char const *key, bst_ulong *out_len, DataType dtype, const void **out_dptr) const
void SynchronizeNumberOfColumns()
Synchronize the number of columns across all workers.
bool IsColumnSplit() const
Whether the data is split column-wise.
Definition: data.h:189
bst_float GetWeight(size_t i) const
Get the weight of an instance.
Definition: data.h:123
HostDeviceVector< FeatureType > feature_types
Definition: data.h:96
DataSplitMode data_split_mode
data split mode
Definition: data.h:62
void LoadBinary(dmlc::Stream *fi)
Load the Meta info from binary stream.
std::vector< bst_group_t > group_ptr_
the index of begin and end of a group needed when the learning task is ranking.
Definition: data.h:67
HostDeviceVector< float > feature_weights
Definition: data.h:101
void GetFeatureInfo(const char *field, std::vector< std::string > *out_str_vecs) const
uint64_t num_row_
number of rows in the data
Definition: data.h:54
MetaInfo & operator=(MetaInfo &&that)=default
void Clear()
clear all the information
void Extend(MetaInfo const &that, bool accumulate_rows, bool check_column)
bool IsRanking() const
Whether this is a learning to rank data.
Definition: data.h:191
uint64_t num_nonzero_
number of nonzero entries in the data
Definition: data.h:58
MetaInfo Slice(common::Span< int32_t const > ridxs) const
MetaInfo Copy() const
linalg::Tensor< float, 2 > labels
label of each instance
Definition: data.h:60
void SaveBinary(dmlc::Stream *fo) const
Save the Meta info to binary stream.
bool ShouldHaveLabels() const
A convenient method to check if the MetaInfo should contain labels.
MetaInfo()=default
default constructor
void SetInfo(Context const &ctx, const char *key, const void *dptr, DataType dtype, size_t num)
Set information in the meta info.
void SetInfo(Context const &ctx, StringView key, StringView interface_str)
Set information in the meta info with array interface.
static constexpr uint64_t kNumField
number of data fields in MetaInfo
Definition: data.h:51
bool IsRowSplit() const
Whether the data is split row-wise.
Definition: data.h:184
HostDeviceVector< bst_float > labels_lower_bound_
lower bound of the label, to be used for survival analysis (censored regression)
Definition: data.h:79
const std::vector< size_t > & LabelAbsSort(Context const *ctx) const
get sorted indexes (argsort) of labels by absolute value (used by cox loss)
void SetFeatureInfo(const char *key, const char **info, const bst_ulong size)
Definition: data.h:437
SortedCSCPage(SparsePage page)
Definition: data.h:440
SortedCSCPage()
Definition: data.h:439
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:328
void Push(const SparsePage &batch)
Push a sparse page.
SparsePage()
constructor
Definition: data.h:345
SparsePage GetTranspose(int num_columns, int32_t n_threads) const
void SetBaseRowId(size_t row_id)
Set the base row id for this page.
Definition: data.h:375
void Reindex(uint64_t feature_offset, int32_t n_threads)
Reindex the column index with an offset.
uint64_t Push(const AdapterBatchT &batch, float missing, int nthread)
Pushes external data batch onto this page.
void PushCSC(const SparsePage &batch)
Push a SparsePage stored in CSC format.
bool IsIndicesSorted(int32_t n_threads) const
Check whether the column index is sorted.
virtual ~SparsePage()=default
void SortIndices(int32_t n_threads)
Sort the column index.
HostDeviceVector< Entry > data
the data of the segments
Definition: data.h:333
HostSparsePageView GetView() const
Definition: data.h:340
HostDeviceVector< bst_row_t > offset
Definition: data.h:331
SparsePage & operator=(SparsePage const &that)=delete
size_t MemCostBytes() const
Definition: data.h:361
void Clear()
clear the page
Definition: data.h:366
SparsePage(SparsePage const &that)=delete
size_t Size() const
Definition: data.h:356
void SortRows(int32_t n_threads)
SparsePage & operator=(SparsePage &&that)=default
SparsePage(SparsePage &&that)=default
size_t base_rowid
Definition: data.h:335
span class implementation, based on ISO C++20 span<T>. The interface should be the same.
Definition: span.h:424
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:549
std::size_t index_type
Definition: span.h:428
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:554
DECLARE_FIELD_ENUM_CLASS(xgboost::DataSplitMode)
void * DMatrixHandle
handle to DMatrix
Definition: c_api.h:50
int XGDMatrixCallbackNext(DataIterHandle iter)
Callback function prototype for getting next batch of data.
Definition: c_api.h:422
void * DataIterHandle
handle to an external data iterator
Definition: c_api.h:335
void DataIterResetCallback(DataIterHandle handle)
Callback function prototype for resetting external iterator.
Definition: c_api.h:427
A device-and-host vector abstraction layer.
Linear algebra related utilities.
Definition: data.h:714
DMLC_DECLARE_TRAITS(is_pod, xgboost::Entry, true)
Definition: intrusive_ptr.h:207
namespace of xgboost
Definition: base.h:90
uint32_t bst_feature_t
Type for data column (feature) index.
Definition: base.h:101
FeatureType
Definition: data.h:41
DataSplitMode
Definition: data.h:43
uint64_t bst_ulong
unsigned long integers
Definition: base.h:95
int32_t bst_bin_t
Type for histogram bin index.
Definition: base.h:103
DataType
data type accepted by xgboost interface
Definition: data.h:33
float bst_float
float type, used for storing statistics
Definition: base.h:97
static void Write(Stream *strm, const xgboost::Entry &data)
Definition: data.h:721
static bool Read(Stream *strm, xgboost::Entry *data)
Definition: data.h:726
Parameters for constructing histogram index batches.
Definition: data.h:244
bool forbid_regen
Forbid regenerating the gradient index. Used for internal validation.
Definition: data.h:261
bst_bin_t max_bin
Maximum number of bins per feature for histograms.
Definition: data.h:248
common::Span< float const > hess
Hessian, used for sketching with future approx implementation.
Definition: data.h:252
bool regen
Whether we should force DMatrix to regenerate the batch. Only used for GHistIndex.
Definition: data.h:257
bool ParamNotEqual(BatchParam const &other) const
Definition: data.h:285
BatchParam()=default
Exact or others that don't need histogram.
double sparse_thresh
Parameter used to generate column matrix for hist.
Definition: data.h:265
bool Initialized() const
Definition: data.h:296
BatchParam(bst_bin_t max_bin, common::Span< float const > hessian, bool regenerate)
Used by the approx tree method.
Definition: data.h:282
BatchParam MakeCache() const
Make a copy of self for DMatrix to describe how its existing index was generated.
Definition: data.h:300
BatchParam(bst_bin_t max_bin, double sparse_thresh)
Used by the hist tree method.
Definition: data.h:274
Runtime context for XGBoost. Contains information like threads and device.
Definition: context.h:84
Element from a sparse vector.
Definition: data.h:216
XGBOOST_DEVICE Entry(bst_feature_t index, bst_float fvalue)
constructor with index and value
Definition: data.h:228
Entry()=default
default constructor
bst_feature_t index
feature index
Definition: data.h:218
static bool CmpIndex(Entry const &a, Entry const &b)
Definition: data.h:233
bst_float fvalue
feature value
Definition: data.h:220
bool operator==(const Entry &other) const
Definition: data.h:236
static bool CmpValue(const Entry &a, const Entry &b)
reversely compare feature values
Definition: data.h:230
Definition: data.h:310
size_t Size() const
Definition: data.h:322
common::Span< Entry const > data
Definition: data.h:314
Inst operator[](size_t i) const
Definition: data.h:316
common::Span< bst_row_t const > offset
Definition: data.h:313
Definition: string_view.h:15