xgboost
data.h
#ifndef XGBOOST_DATA_H_
#define XGBOOST_DATA_H_

#include <dmlc/base.h>
#include <dmlc/data.h>
#include <rabit/rabit.h>
#include <cstring>
#include <memory>
#include <numeric>
#include <algorithm>
#include <string>
#include <vector>
#include "./base.h"
#include "../../src/common/span.h"
#include "../../src/common/group_data.h"

#include "../../src/common/host_device_vector.h"
namespace xgboost {
// forward declare learner.
class LearnerImpl;

/*! \brief data type accepted by the xgboost interface */
enum DataType {
  kFloat32 = 1,
  kDouble = 2,
  kUInt32 = 3,
  kUInt64 = 4
};

/*! \brief Meta information about the dataset, always sits in memory. */
class MetaInfo {
 public:
  /*! \brief number of rows in the data */
  uint64_t num_row_{0};
  /*! \brief number of columns in the data */
  uint64_t num_col_{0};
  /*! \brief number of nonzero entries in the data */
  uint64_t num_nonzero_{0};
  /*! \brief label of each instance */
  HostDeviceVector<bst_float> labels_;
  /*! \brief specified root index of each instance, can be used for multi task setting */
  std::vector<bst_uint> root_index_;
  /*! \brief the index of begin and end of a group, needed when the learning task is ranking */
  std::vector<bst_uint> group_ptr_;
  /*! \brief weights of each instance, optional */
  HostDeviceVector<bst_float> weights_;
  /*! \brief session-id of each instance, optional */
  std::vector<uint64_t> qids_;
  /*! \brief initialized margins; if specified, xgboost will start from this init margin */
  HostDeviceVector<bst_float> base_margin_;
  /*! \brief version flag, used to check version of this info */
  static const int kVersion = 2;
  /*! \brief version that introduced the qid field */
  static const int kVersionQidAdded = 2;
  /*! \brief default constructor */
  MetaInfo() = default;
  /*! \brief Get the weight of the i-th instance; defaults to 1.0f when no weights are set. */
  inline bst_float GetWeight(size_t i) const {
    return weights_.Size() != 0 ? weights_.HostVector()[i] : 1.0f;
  }
  /*! \brief Get the root index of the i-th instance; defaults to 0 when unset. */
  inline unsigned GetRoot(size_t i) const {
    return root_index_.size() != 0 ? root_index_[i] : 0U;
  }
  /*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
  inline const std::vector<size_t>& LabelAbsSort() const {
    if (label_order_cache_.size() == labels_.Size()) {
      return label_order_cache_;
    }
    label_order_cache_.resize(labels_.Size());
    std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
    const auto& l = labels_.HostVector();
    XGBOOST_PARALLEL_SORT(label_order_cache_.begin(), label_order_cache_.end(),
                          [&l](size_t i1, size_t i2) { return std::abs(l[i1]) < std::abs(l[i2]); });
    return label_order_cache_;
  }
  /*! \brief clear all the information */
  void Clear();
  /*! \brief Load the meta info from a binary stream. */
  void LoadBinary(dmlc::Stream* fi);
  /*! \brief Save the meta info to a binary stream. */
  void SaveBinary(dmlc::Stream* fo) const;
  /*!
   * \brief Set information in the meta info.
   * \param key key of the information field
   * \param dptr pointer to the source array
   * \param dtype type of the elements in the source array
   * \param num number of elements in the source array
   */
  void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num);

 private:
  /*! \brief argsort of labels, cached for LabelAbsSort() */
  mutable std::vector<size_t> label_order_cache_;
};
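
// Illustrative usage sketch, not part of data.h: filling a MetaInfo through
// SetInfo() and reading per-instance values back.  The key string "label" and
// the literal values are example assumptions; check SetInfo()'s implementation
// for the full list of accepted keys.
inline void ExampleFillMetaInfo() {
  MetaInfo info;
  std::vector<bst_float> labels = {0.0f, 1.0f, 1.0f};
  info.SetInfo("label", labels.data(), kFloat32, labels.size());
  // With no weights and no root_index_ set, the accessors fall back to defaults.
  bst_float w0 = info.GetWeight(0);  // 1.0f, because weights_ is empty
  unsigned r0 = info.GetRoot(0);     // 0, because root_index_ is empty
  (void)w0; (void)r0;
}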

/*! \brief Element from a sparse vector. */
struct Entry {
  /*! \brief feature index */
  bst_uint index;
  /*! \brief feature value */
  bst_float fvalue;
  /*! \brief default constructor */
  Entry() = default;
  /*! \brief constructor with index and value */
  Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
  /*! \brief comparator that orders entries by feature value */
  inline static bool CmpValue(const Entry& a, const Entry& b) {
    return a.fvalue < b.fvalue;
  }
  inline bool operator==(const Entry& other) const {
    return (this->index == other.index && this->fvalue == other.fvalue);
  }
};
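
// Illustrative usage sketch, not part of data.h: Entry::CmpValue is the
// comparator SparsePage::SortRows() uses below; it orders entries by feature
// value rather than by feature index.
inline void ExampleSortEntriesByValue(std::vector<Entry>* entries) {
  std::sort(entries->begin(), entries->end(), Entry::CmpValue);
}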

/*! \brief In-memory storage unit of a sparse batch, stored in CSR format. */
class SparsePage {
 public:
  // Offset for each row.
  HostDeviceVector<size_t> offset;
  /*! \brief the data of the segments */
  HostDeviceVector<Entry> data;

  size_t base_rowid;

  /*! \brief an instance of a sparse vector in the batch */
  using Inst = common::Span<Entry const>;

  /*! \brief get i-th row from the batch */
  inline Inst operator[](size_t i) const {
    const auto& data_vec = data.HostVector();
    const auto& offset_vec = offset.HostVector();
    size_t size;
    // In distributed mode, some partitions may not get any instance for a feature,
    // so the size should be set to zero.
    if (rabit::IsDistributed() && i + 1 >= offset_vec.size()) {
      size = 0;
    } else {
      size = offset_vec[i + 1] - offset_vec[i];
    }
    return {data_vec.data() + offset_vec[i],
            static_cast<Inst::index_type>(size)};
  }

  /*! \brief constructor */
  SparsePage() {
    this->Clear();
  }
  /*! \return number of instances in the page */
  inline size_t Size() const {
    return offset.Size() - 1;
  }
  /*! \return estimated memory cost of this page in bytes */
  inline size_t MemCostBytes() const {
    return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
  }
  /*! \brief clear the page */
  inline void Clear() {
    base_rowid = 0;
    auto& offset_vec = offset.HostVector();
    offset_vec.clear();
    offset_vec.push_back(0);
    data.HostVector().clear();
  }

  /*! \brief Transpose this CSR page into a CSC-style page with num_columns groups. */
  SparsePage GetTranspose(int num_columns) const {
    SparsePage transpose;
    common::ParallelGroupBuilder<Entry> builder(&transpose.offset.HostVector(),
                                                &transpose.data.HostVector());
    const int nthread = omp_get_max_threads();
    builder.InitBudget(num_columns, nthread);
    long batch_size = static_cast<long>(this->Size());  // NOLINT(*)
#pragma omp parallel for schedule(static)
    for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
      int tid = omp_get_thread_num();
      auto inst = (*this)[i];
      for (bst_uint j = 0; j < inst.size(); ++j) {
        builder.AddBudget(inst[j].index, tid);
      }
    }
    builder.InitStorage();
#pragma omp parallel for schedule(static)
    for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
      int tid = omp_get_thread_num();
      auto inst = (*this)[i];
      for (bst_uint j = 0; j < inst.size(); ++j) {
        builder.Push(inst[j].index,
                     Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue),
                     tid);
      }
    }
    return transpose;
  }

  /*! \brief Sort the entries of every row by feature value. */
  void SortRows() {
    auto ncol = static_cast<bst_omp_uint>(this->Size());
#pragma omp parallel for schedule(dynamic, 1)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
        std::sort(this->data.HostVector().begin() + this->offset.HostVector()[i],
                  this->data.HostVector().begin() + this->offset.HostVector()[i + 1],
                  Entry::CmpValue);
      }
    }
  }

  /*! \brief Push a dmlc row block into the page. */
  void Push(const dmlc::RowBlock<uint32_t>& batch);
  /*! \brief Push another SparsePage onto this page. */
  void Push(const SparsePage& batch);
  /*! \brief Push a SparsePage stored in CSC format. */
  void PushCSC(const SparsePage& batch);
  /*! \brief Push one instance into the page. */
  inline void Push(const Inst& inst) {
    auto& data_vec = data.HostVector();
    auto& offset_vec = offset.HostVector();
    offset_vec.push_back(offset_vec.back() + inst.size());
    size_t begin = data_vec.size();
    data_vec.resize(begin + inst.size());
    if (inst.size() != 0) {
      std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
                  sizeof(Entry) * inst.size());
    }
  }

  size_t Size() { return offset.Size() - 1; }
};
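
// Illustrative usage sketch, not part of data.h: walking a populated page row
// by row through operator[].  `page` is assumed to have been filled elsewhere
// (e.g. via one of the Push() overloads above).
inline size_t ExampleCountEntries(const SparsePage& page) {
  size_t n = 0;
  for (size_t i = 0; i < page.Size(); ++i) {
    SparsePage::Inst row = page[i];        // Span view over one CSR row
    for (bst_uint j = 0; j < row.size(); ++j) {
      if (row[j].fvalue != 0.0f) ++n;      // Entry exposes feature index/value
    }
  }
  return n;
}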

/*! \brief Interface a batch iterator implementation must provide. */
class BatchIteratorImpl {
 public:
  virtual ~BatchIteratorImpl() {}
  virtual BatchIteratorImpl* Clone() = 0;
  virtual SparsePage& operator*() = 0;
  virtual const SparsePage& operator*() const = 0;
  virtual void operator++() = 0;
  virtual bool AtEnd() const = 0;
};

/*! \brief Forward iterator over SparsePage batches; wraps a BatchIteratorImpl. */
class BatchIterator {
 public:
  using iterator_category = std::forward_iterator_tag;
  explicit BatchIterator(BatchIteratorImpl* impl) { impl_.reset(impl); }

  BatchIterator(const BatchIterator& other) {
    if (other.impl_) {
      impl_.reset(other.impl_->Clone());
    } else {
      impl_.reset();
    }
  }

  void operator++() {
    CHECK(impl_ != nullptr);
    ++(*impl_);
  }

  SparsePage& operator*() {
    CHECK(impl_ != nullptr);
    return *(*impl_);
  }

  const SparsePage& operator*() const {
    CHECK(impl_ != nullptr);
    return *(*impl_);
  }

  // The end() iterator is a sentinel: inequality only checks whether this
  // iterator has reached the end; rhs itself is not inspected.
  bool operator!=(const BatchIterator& rhs) const {
    CHECK(impl_ != nullptr);
    return !impl_->AtEnd();
  }

  bool AtEnd() const {
    CHECK(impl_ != nullptr);
    return impl_->AtEnd();
  }

 private:
  std::unique_ptr<BatchIteratorImpl> impl_;
};

/*! \brief A range of SparsePage batches, usable in range-based for loops. */
class BatchSet {
 public:
  explicit BatchSet(BatchIterator begin_iter) : begin_iter_(begin_iter) {}
  BatchIterator begin() { return begin_iter_; }
  BatchIterator end() { return BatchIterator(nullptr); }

 private:
  BatchIterator begin_iter_;
};
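
// Illustrative usage sketch, not part of data.h: BatchIterator/BatchSet provide
// just enough of the iterator protocol for range-based for loops, which is how
// batches are normally consumed.  `dmat` stands for a DMatrix (declared further
// below in this header):
//
//   for (const SparsePage& batch : dmat->GetRowBatches()) {
//     // batch.Size() rows, starting at global row index batch.base_rowid
//   }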

/*!
 * \brief A data structure the user can pass to DMatrix::Create
 *  to create a DMatrix for training.
 */
class DataSource : public dmlc::DataIter<SparsePage> {
 public:
  /*!
   * \brief Meta information about the dataset.
   *  The subclass needs to be able to load this correctly from data.
   */
  MetaInfo info;
};
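
// Illustrative sketch, not part of data.h: a minimal DataSource that serves a
// single (here still empty) SparsePage.  The BeforeFirst/Next/Value interface
// is assumed from dmlc::DataIter in dmlc-core; double-check it against your
// dmlc version.  A real source would also populate `page_` and the inherited
// `info` field before being handed to DMatrix::Create below.
class ExampleSingleBatchSource : public DataSource {
 public:
  void BeforeFirst() override { served_ = false; }
  bool Next() override {
    if (served_) return false;
    served_ = true;  // serve the page exactly once per pass
    return true;
  }
  const SparsePage& Value() const override { return page_; }

 private:
  SparsePage page_;  // would normally be populated via SparsePage::Push()
  bool served_{false};
};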

/*!
 * \brief A vector-like structure that represents a set of rows,
 *  but saves memory when all rows are in the set.
 */
class RowSet {
 public:
  /*! \return the i-th row index in the set */
  inline bst_uint operator[](size_t i) const;
  /*! \return number of rows in the set */
  inline size_t Size() const;
  /*! \brief push an index back to the set */
  inline void PushBack(bst_uint i);
  /*! \brief clear the set */
  inline void Clear();
  /*! \brief save the rowset to file */
  inline void Save(dmlc::Stream* fo) const;
  /*! \brief load the rowset from file, \return whether the read succeeded */
  inline bool Load(dmlc::Stream* fi);
  /*! \brief default constructor */
  RowSet() = default;

 private:
  /*! \brief number of rows in the set */
  uint64_t size_{0};
  /*! \brief explicit row indices; left empty while the set is the contiguous range [0, size_) */
  std::vector<bst_uint> rows_;
};

/*!
 * \brief Internal data structure used by XGBoost during training.
 *  A customized DMatrix can be created either from a DataSource or from an
 *  external parser, via the two Create() overloads below.
 */
class DMatrix {
 public:
  /*! \brief default constructor */
  DMatrix() = default;
  /*! \brief meta information of the dataset */
  virtual MetaInfo& Info() = 0;
  /*! \brief meta information of the dataset */
  virtual const MetaInfo& Info() const = 0;
  /*! \brief Iterators over row batches, sorted column batches and column batches. */
  virtual BatchSet GetRowBatches() = 0;
  virtual BatchSet GetSortedColumnBatches() = 0;
  virtual BatchSet GetColumnBatches() = 0;
  // the following are column meta data, should be able to answer them fast.
  /*! \return whether the columns are stored in a single column block */
  virtual bool SingleColBlock() const = 0;
  /*! \return the density (fraction of present values) of the cidx-th column */
  virtual float GetColDensity(size_t cidx) = 0;
  /*! \brief virtual destructor */
  virtual ~DMatrix() = default;
  /*!
   * \brief Save the DMatrix to a local file.
   * \param fname name of the file to save to
   */
  virtual void SaveToLocalFile(const std::string& fname);
  /*!
   * \brief Load a DMatrix from a URI.
   * \param uri URI of the input data
   * \param silent whether to suppress messages during loading
   * \param load_row_split whether to read only part of the rows, divided among the workers in distributed mode
   * \param file_format format of the file; "auto" detects it from the URI
   * \param page_size page size for external memory
   */
  static DMatrix* Load(const std::string& uri,
                       bool silent,
                       bool load_row_split,
                       const std::string& file_format = "auto",
                       const size_t page_size = kPageSize);
  /*!
   * \brief Create a DMatrix from a DataSource.
   * \param source the data source; ownership is transferred to the DMatrix
   * \param cache_prefix prefix of the temporary cache files used in external memory mode
   */
  static DMatrix* Create(std::unique_ptr<DataSource>&& source,
                         const std::string& cache_prefix = "");
  /*!
   * \brief Create a DMatrix by reading data from a parser.
   * \param parser the input data parser
   * \param cache_prefix prefix of the temporary cache files used in external memory mode
   * \param page_size page size for external memory
   */
  static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
                         const std::string& cache_prefix = "",
                         const size_t page_size = kPageSize);

  /*! \brief default page size, 32 MB */
  static const size_t kPageSize = 32UL << 20UL;
};
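
// Illustrative usage sketch, not part of data.h: loading a DMatrix from a file
// URI.  The flag values are example choices; "auto" (the default) lets the
// loader detect the file format, and the caller is responsible for the
// returned pointer.
inline DMatrix* ExampleLoadDMatrix(const std::string& uri) {
  bool silent = true;           // suppress loading messages
  bool load_row_split = false;  // do not split rows across distributed workers
  return DMatrix::Load(uri, silent, load_row_split);
}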

// implementation of inline functions
inline bst_uint RowSet::operator[](size_t i) const {
  return rows_.size() == 0 ? static_cast<bst_uint>(i) : rows_[i];
}

inline size_t RowSet::Size() const {
  return size_;
}

inline void RowSet::Clear() {
  rows_.clear();
  size_ = 0;
}

inline void RowSet::PushBack(bst_uint i) {
  if (rows_.size() == 0) {
    if (i == size_) {
      // still the contiguous range [0, size_): just extend it
      ++size_;
      return;
    } else {
      // a gap appeared: materialise the indices collected so far
      rows_.resize(size_);
      for (size_t j = 0; j < size_; ++j) {
        rows_[j] = static_cast<bst_uint>(j);
      }
    }
  }
  rows_.push_back(i);
  ++size_;
}

inline void RowSet::Save(dmlc::Stream* fo) const {
  fo->Write(rows_);
  fo->Write(&size_, sizeof(size_));
}

inline bool RowSet::Load(dmlc::Stream* fi) {
  if (!fi->Read(&rows_)) return false;
  if (rows_.size() != 0) return true;
  return fi->Read(&size_, sizeof(size_)) == sizeof(size_);
}
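
// Illustrative usage sketch, not part of data.h: RowSet stays implicit (just a
// count) while rows are pushed contiguously from 0, and only materialises
// rows_ once a gap appears.  The indices below are example values.
inline void ExampleRowSet() {
  RowSet rows;
  rows.PushBack(0);
  rows.PushBack(1);          // still contiguous: rows_ stays empty, Size() == 2
  rows.PushBack(5);          // gap: 0 and 1 are materialised, then 5 is appended
  bst_uint third = rows[2];  // == 5
  (void)third;
}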
}  // namespace xgboost

namespace dmlc {
DMLC_DECLARE_TRAITS(is_pod, xgboost::Entry, true);
DMLC_DECLARE_TRAITS(has_saveload, xgboost::RowSet, true);
}  // namespace dmlc
#endif  // XGBOOST_DATA_H_