xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <dmlc/endian.h>
12 
13 #include <algorithm>
14 #include <limits>
15 #include <memory>
16 #include <vector>
17 
18 #include "../data/gradient_index.h"
19 #include "hist_util.h"
20 
21 namespace xgboost {
22 namespace common {
23 
24 class ColumnMatrix;
27 
32 template <typename BinIdxType>
33 class Column {
34  public:
35  static constexpr int32_t kMissingId = -1;
36 
37  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
38  : type_(type), index_(index), index_base_(index_base) {}
39 
40  virtual ~Column() = default;
41 
42  uint32_t GetGlobalBinIdx(size_t idx) const {
43  return index_base_ + static_cast<uint32_t>(index_[idx]);
44  }
45 
46  BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
47 
48  uint32_t GetBaseIdx() const { return index_base_; }
49 
51 
52  ColumnType GetType() const { return type_; }
53 
54  /* returns number of elements in column */
55  size_t Size() const { return index_.size(); }
56 
57  private:
58  /* type of column */
59  ColumnType type_;
60  /* bin indexes in range [0, max_bins - 1] */
62  /* bin index offset for specific feature */
63  const uint32_t index_base_;
64 };
65 
66 template <typename BinIdxType>
67 class SparseColumn : public Column<BinIdxType> {
68  public:
69  SparseColumn(ColumnType type, common::Span<const BinIdxType> index, uint32_t index_base,
71  : Column<BinIdxType>(type, index, index_base), row_ind_(row_ind) {}
72 
73  const size_t* GetRowData() const { return row_ind_.data(); }
74 
75  int32_t GetBinIdx(size_t rid, size_t* state) const {
76  const size_t column_size = this->Size();
77  if (!((*state) < column_size)) {
78  return this->kMissingId;
79  }
80  while ((*state) < column_size && GetRowIdx(*state) < rid) {
81  ++(*state);
82  }
83  if (((*state) < column_size) && GetRowIdx(*state) == rid) {
84  return this->GetGlobalBinIdx(*state);
85  } else {
86  return this->kMissingId;
87  }
88  }
89 
90  size_t GetInitialState(const size_t first_row_id) const {
91  const size_t* row_data = GetRowData();
92  const size_t column_size = this->Size();
93  // search first nonzero row with index >= rid_span.front()
94  const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
95  // column_size if all messing
96  return p - row_data;
97  }
98 
99  size_t GetRowIdx(size_t idx) const { return row_ind_.data()[idx]; }
100 
101  private:
102  /* indexes of rows */
104 };
105 
106 template <typename BinIdxType, bool any_missing>
107 class DenseColumn : public Column<BinIdxType> {
108  public:
109  DenseColumn(ColumnType type, common::Span<const BinIdxType> index, uint32_t index_base,
110  const std::vector<bool>& missing_flags, size_t feature_offset)
111  : Column<BinIdxType>(type, index, index_base),
112  missing_flags_(missing_flags),
113  feature_offset_(feature_offset) {}
114  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
115 
116  int32_t GetBinIdx(size_t idx, size_t* state) const {
117  if (any_missing) {
118  return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx);
119  } else {
120  return this->GetGlobalBinIdx(idx);
121  }
122  }
123 
124  size_t GetInitialState(const size_t first_row_id) const { return 0; }
125 
126  private:
127  /* flags for missing values in dense columns */
128  const std::vector<bool>& missing_flags_;
129  size_t feature_offset_;
130 };
131 
135  public:
136  // get number of features
137  bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
138 
139  // construct column matrix from GHistIndexMatrix
140  inline void Init(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
141  int32_t n_threads) {
142  auto const nfeature = static_cast<bst_feature_t>(gmat.cut.Ptrs().size() - 1);
143  const size_t nrow = gmat.row_ptr.size() - 1;
144  // identify type of each column
145  feature_counts_.resize(nfeature);
146  type_.resize(nfeature);
147  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
148  uint32_t max_val = std::numeric_limits<uint32_t>::max();
149  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
150  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
151  }
152  bool all_dense = gmat.IsDense();
153  gmat.GetFeatureCounts(&feature_counts_[0]);
154  // classify features
155  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
156  if (static_cast<double>(feature_counts_[fid]) < sparse_threshold * nrow) {
157  type_[fid] = kSparseColumn;
158  all_dense = false;
159  } else {
160  type_[fid] = kDenseColumn;
161  }
162  }
163 
164  // want to compute storage boundary for each feature
165  // using variants of prefix sum scan
166  feature_offsets_.resize(nfeature + 1);
167  size_t accum_index_ = 0;
168  feature_offsets_[0] = accum_index_;
169  for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
170  if (type_[fid - 1] == kDenseColumn) {
171  accum_index_ += static_cast<size_t>(nrow);
172  } else {
173  accum_index_ += feature_counts_[fid - 1];
174  }
175  feature_offsets_[fid] = accum_index_;
176  }
177 
178  SetTypeSize(gmat.max_num_bins);
179 
180  index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
181  if (!all_dense) {
182  row_ind_.resize(feature_offsets_[nfeature]);
183  }
184 
185  // store least bin id for each feature
186  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
187 
188  const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
189  any_missing_ = !noMissingValues;
190 
191  missing_flags_.clear();
192  if (noMissingValues) {
193  missing_flags_.resize(feature_offsets_[nfeature], false);
194  } else {
195  missing_flags_.resize(feature_offsets_[nfeature], true);
196  }
197 
198  // pre-fill index_ for dense columns
199  if (all_dense) {
200  BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
201  if (gmat_bin_size == kUint8BinsTypeSize) {
202  SetIndexAllDense(page, gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues,
203  n_threads);
204  } else if (gmat_bin_size == kUint16BinsTypeSize) {
205  SetIndexAllDense(page, gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues,
206  n_threads);
207  } else {
208  CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
209  SetIndexAllDense(page, gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues,
210  n_threads);
211  }
212  /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
213  but for ColumnMatrix we still have a chance to reduce the memory consumption */
214  } else {
215  if (bins_type_size_ == kUint8BinsTypeSize) {
216  SetIndex<uint8_t>(page, gmat.index.data<uint32_t>(), gmat, nfeature);
217  } else if (bins_type_size_ == kUint16BinsTypeSize) {
218  SetIndex<uint16_t>(page, gmat.index.data<uint32_t>(), gmat, nfeature);
219  } else {
220  CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
221  SetIndex<uint32_t>(page, gmat.index.data<uint32_t>(), gmat, nfeature);
222  }
223  }
224  }
225 
226  /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
227  void SetTypeSize(size_t max_num_bins) {
228  if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max())) {
229  bins_type_size_ = kUint8BinsTypeSize;
230  } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
231  bins_type_size_ = kUint16BinsTypeSize;
232  } else {
233  bins_type_size_ = kUint32BinsTypeSize;
234  }
235  }
236 
237  /* Fetch an individual column. This code should be used with type swith
238  to determine type of bin id's */
239  template <typename BinIdxType, bool any_missing>
240  std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
241  CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
242 
243  const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
244  const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
245  common::Span<const BinIdxType> bin_index = {
246  reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
247  column_size};
248  std::unique_ptr<const Column<BinIdxType> > res;
249  if (type_[fid] == ColumnType::kDenseColumn) {
250  CHECK_EQ(any_missing, any_missing_);
251  res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid],
252  missing_flags_, feature_offset));
253  } else {
254  res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
255  {&row_ind_[feature_offset], column_size}));
256  }
257  return res;
258  }
259 
260  template <typename T>
261  inline void SetIndexAllDense(SparsePage const& page, T const* index, const GHistIndexMatrix& gmat,
262  const size_t nrow, const size_t nfeature, const bool noMissingValues,
263  int32_t n_threads) {
264  T* local_index = reinterpret_cast<T*>(&index_[0]);
265 
266  /* missing values make sense only for column with type kDenseColumn,
267  and if no missing values were observed it could be handled much faster. */
268  if (noMissingValues) {
269  ParallelFor(nrow, n_threads, [&](auto rid) {
270  const size_t ibegin = rid * nfeature;
271  const size_t iend = (rid + 1) * nfeature;
272  size_t j = 0;
273  for (size_t i = ibegin; i < iend; ++i, ++j) {
274  const size_t idx = feature_offsets_[j];
275  local_index[idx + rid] = index[i];
276  }
277  });
278  } else {
279  /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
280  auto get_bin_idx = [&](auto bin_id, auto rid, bst_feature_t fid) {
281  // T* begin = &local_index[feature_offsets_[fid]];
282  const size_t idx = feature_offsets_[fid];
283  /* rbegin allows to store indexes from specific SparsePage batch */
284  local_index[idx + rid] = bin_id;
285 
286  missing_flags_[idx + rid] = false;
287  };
288  this->SetIndexSparse(page, index, gmat, nfeature, get_bin_idx);
289  }
290  }
291 
292  // FIXME(jiamingy): In the future we might want to simply use binary search to simplify
293  // this and remove the dependency on SparsePage. This way we can have quantilized
294  // matrix for host similar to `DeviceQuantileDMatrix`.
295  template <typename T, typename BinFn>
296  void SetIndexSparse(SparsePage const& batch, T* index, const GHistIndexMatrix& gmat,
297  const size_t nfeature, BinFn&& assign_bin) {
298  std::vector<size_t> num_nonzeros(nfeature, 0ul);
299  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
300  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
301  auto rbegin = 0;
302  const size_t batch_size = gmat.Size();
303  CHECK_LT(batch_size, offset_vec.size());
304 
305  for (size_t rid = 0; rid < batch_size; ++rid) {
306  const size_t ibegin = gmat.row_ptr[rbegin + rid];
307  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
308  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
309  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
310 
311  CHECK_EQ(ibegin + inst.size(), iend);
312  size_t j = 0;
313  for (size_t i = ibegin; i < iend; ++i, ++j) {
314  const uint32_t bin_id = index[i];
315  auto fid = inst[j].index;
316  assign_bin(bin_id, rid, fid);
317  }
318  }
319  }
320 
321  template <typename T>
322  inline void SetIndex(SparsePage const& page, uint32_t const* index, const GHistIndexMatrix& gmat,
323  const size_t nfeature) {
324  T* local_index = reinterpret_cast<T*>(&index_[0]);
325  std::vector<size_t> num_nonzeros;
326  num_nonzeros.resize(nfeature);
327  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
328 
329  auto get_bin_idx = [&](auto bin_id, auto rid, bst_feature_t fid) {
330  if (type_[fid] == kDenseColumn) {
331  T* begin = &local_index[feature_offsets_[fid]];
332  begin[rid] = bin_id - index_base_[fid];
333  missing_flags_[feature_offsets_[fid] + rid] = false;
334  } else {
335  T* begin = &local_index[feature_offsets_[fid]];
336  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
337  row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid;
338  ++num_nonzeros[fid];
339  }
340  };
341  this->SetIndexSparse(page, index, gmat, nfeature, get_bin_idx);
342  }
343 
344  BinTypeSize GetTypeSize() const { return bins_type_size_; }
345 
346  // This is just an utility function
347  bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features) {
348  return n_elements == n_features * n_row;
349  }
350 
351  // And this returns part of state
352  bool AnyMissing() const { return any_missing_; }
353 
354  // IO procedures for external memory.
355  bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
356  fi->Read(&index_);
357  fi->Read(&feature_counts_);
358 #if !DMLC_LITTLE_ENDIAN
359  // s390x
360  std::vector<std::underlying_type<ColumnType>::type> int_types;
361  fi->Read(&int_types);
362  type_.resize(int_types.size());
363  std::transform(
364  int_types.begin(), int_types.end(), type_.begin(),
365  [](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
366 #else
367  fi->Read(&type_);
368 #endif // !DMLC_LITTLE_ENDIAN
369 
370  fi->Read(&row_ind_);
371  fi->Read(&feature_offsets_);
372  index_base_ = index_base;
373 #if !DMLC_LITTLE_ENDIAN
374  std::underlying_type<BinTypeSize>::type v;
375  fi->Read(&v);
376  bins_type_size_ = static_cast<BinTypeSize>(v);
377 #else
378  fi->Read(&bins_type_size_);
379 #endif
380 
381  fi->Read(&any_missing_);
382  return true;
383  }
384 
385  size_t Write(dmlc::Stream* fo) const {
386  size_t bytes{0};
387 
388  auto write_vec = [&](auto const& vec) {
389  fo->Write(vec);
390  bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
391  sizeof(uint64_t);
392  };
393  write_vec(index_);
394  write_vec(feature_counts_);
395 #if !DMLC_LITTLE_ENDIAN
396  // s390x
397  std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
398  std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
399  return static_cast<std::underlying_type<ColumnType>::type>(t);
400  });
401  write_vec(int_types);
402 #else
403  write_vec(type_);
404 #endif // !DMLC_LITTLE_ENDIAN
405  write_vec(row_ind_);
406  write_vec(feature_offsets_);
407 
408 #if !DMLC_LITTLE_ENDIAN
409  auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
410  fo->Write(v);
411 #else
412  fo->Write(bins_type_size_);
413 #endif // DMLC_LITTLE_ENDIAN
414  bytes += sizeof(bins_type_size_);
415  fo->Write(any_missing_);
416  bytes += sizeof(any_missing_);
417 
418  return bytes;
419  }
420 
421  private:
422  std::vector<uint8_t> index_;
423 
424  std::vector<size_t> feature_counts_;
425  std::vector<ColumnType> type_;
426  std::vector<size_t> row_ind_;
427  /* indicate where each column's index and row_ind is stored. */
428  std::vector<size_t> feature_offsets_;
429 
430  // index_base_[fid]: least bin id for feature fid
431  uint32_t const* index_base_;
432  std::vector<bool> missing_flags_;
433  BinTypeSize bins_type_size_;
434  bool any_missing_;
435 };
436 } // namespace common
437 } // namespace xgboost
438 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
xgboost::common::Column::kMissingId
static constexpr int32_t kMissingId
Definition: column_matrix.h:35
xgboost::common::ColumnMatrix::SetIndexAllDense
void SetIndexAllDense(SparsePage const &page, T const *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues, int32_t n_threads)
Definition: column_matrix.h:261
xgboost::common::SparseColumn::SparseColumn
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:69
xgboost::common::DenseColumn::IsMissing
bool IsMissing(size_t idx) const
Definition: column_matrix.h:114
hist_util.h
Utility for fast histogram aggregation.
xgboost::common::kUint32BinsTypeSize
@ kUint32BinsTypeSize
Definition: hist_util.h:197
xgboost::common::Column
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:33
xgboost::SparsePage
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:271
xgboost::common::kSparseColumn
@ kSparseColumn
Definition: column_matrix.h:26
xgboost::Entry
Element from a sparse vector.
Definition: data.h:187
xgboost::common::Column::GetFeatureBinIdx
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:46
xgboost::common::SparseColumn::GetBinIdx
int32_t GetBinIdx(size_t rid, size_t *state) const
Definition: column_matrix.h:75
xgboost::common::ColumnMatrix::Init
void Init(SparsePage const &page, const GHistIndexMatrix &gmat, double sparse_threshold, int32_t n_threads)
Definition: column_matrix.h:140
xgboost::common::Column::GetFeatureBinIdxPtr
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:50
xgboost::common::ColumnMatrix::GetTypeSize
BinTypeSize GetTypeSize() const
Definition: column_matrix.h:344
xgboost::common::SparseColumn::GetInitialState
size_t GetInitialState(const size_t first_row_id) const
Definition: column_matrix.h:90
xgboost::common::ColumnMatrix::Read
bool Read(dmlc::SeekStream *fi, uint32_t const *index_base)
Definition: column_matrix.h:355
xgboost::common::ParallelFor
void ParallelFor(Index size, int32_t n_threads, Sched sched, Func fn)
Definition: threading_utils.h:170
xgboost::common::ColumnMatrix::Write
size_t Write(dmlc::Stream *fo) const
Definition: column_matrix.h:385
xgboost::common::ColumnMatrix
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:134
xgboost::common::ColumnMatrix::SetTypeSize
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:227
xgboost::common::kDenseColumn
@ kDenseColumn
Definition: column_matrix.h:26
xgboost::bst_feature_t
uint32_t bst_feature_t
Type for data column (feature) index.
Definition: base.h:123
xgboost::common::SparseColumn::GetRowIdx
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:99
xgboost::HostDeviceVector::HostVector
std::vector< T > & HostVector()
xgboost::common::ColumnMatrix::AnyMissing
bool AnyMissing() const
Definition: column_matrix.h:352
xgboost::common::DenseColumn::GetInitialState
size_t GetInitialState(const size_t first_row_id) const
Definition: column_matrix.h:124
xgboost::common::kUint8BinsTypeSize
@ kUint8BinsTypeSize
Definition: hist_util.h:195
xgboost::common::ColumnMatrix::NoMissingValues
bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:347
xgboost::common::DenseColumn
Definition: column_matrix.h:107
xgboost::common::SparseColumn
Definition: column_matrix.h:67
xgboost::common::Column::GetBaseIdx
uint32_t GetBaseIdx() const
Definition: column_matrix.h:48
xgboost::common::DenseColumn::DenseColumn
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:109
xgboost::common::DenseColumn::GetBinIdx
int32_t GetBinIdx(size_t idx, size_t *state) const
Definition: column_matrix.h:116
xgboost::common::Column::GetGlobalBinIdx
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:42
xgboost::common::ColumnMatrix::GetNumFeature
bst_feature_t GetNumFeature() const
Definition: column_matrix.h:137
xgboost::common::BinTypeSize
BinTypeSize
Definition: hist_util.h:194
xgboost::common::Span< const BinIdxType >
xgboost::common::ColumnType
ColumnType
column type
Definition: column_matrix.h:26
xgboost::SparsePage::offset
HostDeviceVector< bst_row_t > offset
Definition: data.h:274
xgboost::common::Column::Column
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:37
xgboost::common::Span::size
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:553
xgboost::common::ColumnMatrix::GetColumn
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:240
xgboost::common::Span::data
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:548
xgboost::common::ColumnMatrix::SetIndexSparse
void SetIndexSparse(SparsePage const &batch, T *index, const GHistIndexMatrix &gmat, const size_t nfeature, BinFn &&assign_bin)
Definition: column_matrix.h:296
xgboost::common::SparseColumn::GetRowData
const size_t * GetRowData() const
Definition: column_matrix.h:73
xgboost::common::Column::Size
size_t Size() const
Definition: column_matrix.h:55
xgboost::common::ColumnMatrix::SetIndex
void SetIndex(SparsePage const &page, uint32_t const *index, const GHistIndexMatrix &gmat, const size_t nfeature)
Definition: column_matrix.h:322
xgboost::common::kUint16BinsTypeSize
@ kUint16BinsTypeSize
Definition: hist_util.h:196
xgboost::common::Column::~Column
virtual ~Column()=default
xgboost::common::Column::GetType
ColumnType GetType() const
Definition: column_matrix.h:52
xgboost::SparsePage::data
HostDeviceVector< Entry > data
the data of the segments
Definition: data.h:276
xgboost
namespace of xgboost
Definition: base.h:110