xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <limits>
12 #include <vector>
13 #include <memory>
14 #include "hist_util.h"
15 #include "../data/gradient_index.h"
16 
17 namespace xgboost {
18 namespace common {
19 
20 class ColumnMatrix;
22 enum ColumnType {
25 };
26 
31 template <typename BinIdxType>
32 class Column {
33  public:
34  static constexpr int32_t kMissingId = -1;
35 
36  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
37  : type_(type),
38  index_(index),
39  index_base_(index_base) {}
40 
41  virtual ~Column() = default;
42 
43  uint32_t GetGlobalBinIdx(size_t idx) const {
44  return index_base_ + static_cast<uint32_t>(index_[idx]);
45  }
46 
47  BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
48 
49  uint32_t GetBaseIdx() const { return index_base_; }
50 
52 
53  ColumnType GetType() const { return type_; }
54 
55  /* returns number of elements in column */
56  size_t Size() const { return index_.size(); }
57 
58  private:
59  /* type of column */
60  ColumnType type_;
61  /* bin indexes in range [0, max_bins - 1] */
63  /* bin index offset for specific feature */
64  const uint32_t index_base_;
65 };
66 
67 template <typename BinIdxType>
68 class SparseColumn: public Column<BinIdxType> {
69  public:
71  uint32_t index_base, common::Span<const size_t> row_ind)
72  : Column<BinIdxType>(type, index, index_base),
73  row_ind_(row_ind) {}
74 
75  const size_t* GetRowData() const { return row_ind_.data(); }
76 
77  int32_t GetBinIdx(size_t rid, size_t* state) const {
78  const size_t column_size = this->Size();
79  if (!((*state) < column_size)) {
80  return this->kMissingId;
81  }
82  while ((*state) < column_size && GetRowIdx(*state) < rid) {
83  ++(*state);
84  }
85  if (((*state) < column_size) && GetRowIdx(*state) == rid) {
86  return this->GetGlobalBinIdx(*state);
87  } else {
88  return this->kMissingId;
89  }
90  }
91 
92  size_t GetInitialState(const size_t first_row_id) const {
93  const size_t* row_data = GetRowData();
94  const size_t column_size = this->Size();
95  // search first nonzero row with index >= rid_span.front()
96  const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
97  // column_size if all messing
98  return p - row_data;
99  }
100 
101  size_t GetRowIdx(size_t idx) const {
102  return row_ind_.data()[idx];
103  }
104 
105  private:
106  /* indexes of rows */
108 };
109 
110 template <typename BinIdxType, bool any_missing>
111 class DenseColumn: public Column<BinIdxType> {
112  public:
114  uint32_t index_base, const std::vector<bool>& missing_flags,
115  size_t feature_offset)
116  : Column<BinIdxType>(type, index, index_base),
117  missing_flags_(missing_flags),
118  feature_offset_(feature_offset) {}
119  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
120 
121  int32_t GetBinIdx(size_t idx, size_t* state) const {
122  if (any_missing) {
123  return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx);
124  } else {
125  return this->GetGlobalBinIdx(idx);
126  }
127  }
128 
129  size_t GetInitialState(const size_t first_row_id) const {
130  return 0;
131  }
132 
133  private:
134  /* flags for missing values in dense columns */
135  const std::vector<bool>& missing_flags_;
136  size_t feature_offset_;
137 };
138 
142  public:
143  // get number of features
144  inline bst_uint GetNumFeature() const {
145  return static_cast<bst_uint>(type_.size());
146  }
147 
148  // construct column matrix from GHistIndexMatrix
149  inline void Init(const GHistIndexMatrix& gmat,
150  double sparse_threshold) {
151  const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
152  const size_t nrow = gmat.row_ptr.size() - 1;
153  // identify type of each column
154  feature_counts_.resize(nfeature);
155  type_.resize(nfeature);
156  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
157  uint32_t max_val = std::numeric_limits<uint32_t>::max();
158  for (int32_t fid = 0; fid < nfeature; ++fid) {
159  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
160  }
161  bool all_dense = gmat.IsDense();
162  gmat.GetFeatureCounts(&feature_counts_[0]);
163  // classify features
164  for (int32_t fid = 0; fid < nfeature; ++fid) {
165  if (static_cast<double>(feature_counts_[fid])
166  < sparse_threshold * nrow) {
167  type_[fid] = kSparseColumn;
168  all_dense = false;
169  } else {
170  type_[fid] = kDenseColumn;
171  }
172  }
173 
174  // want to compute storage boundary for each feature
175  // using variants of prefix sum scan
176  feature_offsets_.resize(nfeature + 1);
177  size_t accum_index_ = 0;
178  feature_offsets_[0] = accum_index_;
179  for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
180  if (type_[fid - 1] == kDenseColumn) {
181  accum_index_ += static_cast<size_t>(nrow);
182  } else {
183  accum_index_ += feature_counts_[fid - 1];
184  }
185  feature_offsets_[fid] = accum_index_;
186  }
187 
188  SetTypeSize(gmat.max_num_bins);
189 
190  index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
191  if (!all_dense) {
192  row_ind_.resize(feature_offsets_[nfeature]);
193  }
194 
195  // store least bin id for each feature
196  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
197 
198  const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
199  any_missing_ = !noMissingValues;
200 
201  if (noMissingValues) {
202  missing_flags_.resize(feature_offsets_[nfeature], false);
203  } else {
204  missing_flags_.resize(feature_offsets_[nfeature], true);
205  }
206 
207  // pre-fill index_ for dense columns
208  if (all_dense) {
209  BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
210  if (gmat_bin_size == kUint8BinsTypeSize) {
211  SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
212  } else if (gmat_bin_size == kUint16BinsTypeSize) {
213  SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
214  } else {
215  CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
216  SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
217  }
218  /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
219  but for ColumnMatrix we still have a chance to reduce the memory consumption */
220  } else {
221  if (bins_type_size_ == kUint8BinsTypeSize) {
222  SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
223  } else if (bins_type_size_ == kUint16BinsTypeSize) {
224  SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
225  } else {
226  CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
227  SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
228  }
229  }
230  }
231 
232  /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
233  void SetTypeSize(size_t max_num_bins) {
234  if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
235  bins_type_size_ = kUint8BinsTypeSize;
236  } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
237  bins_type_size_ = kUint16BinsTypeSize;
238  } else {
239  bins_type_size_ = kUint32BinsTypeSize;
240  }
241  }
242 
243  /* Fetch an individual column. This code should be used with type swith
244  to determine type of bin id's */
245  template <typename BinIdxType, bool any_missing>
246  std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
247  CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
248 
249  const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
250  const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
251  common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
252  &index_[feature_offset * bins_type_size_]),
253  column_size };
254  std::unique_ptr<const Column<BinIdxType> > res;
255  if (type_[fid] == ColumnType::kDenseColumn) {
256  CHECK_EQ(any_missing, any_missing_);
257  res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid],
258  missing_flags_, feature_offset));
259  } else {
260  res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
261  {&row_ind_[feature_offset], column_size}));
262  }
263  return res;
264  }
265 
266  template <typename T>
267  inline void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat,
268  const size_t nrow, const size_t nfeature,
269  const bool noMissingValues) {
270  T* local_index = reinterpret_cast<T*>(&index_[0]);
271 
272  /* missing values make sense only for column with type kDenseColumn,
273  and if no missing values were observed it could be handled much faster. */
274  if (noMissingValues) {
275  ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
276  const size_t ibegin = rid*nfeature;
277  const size_t iend = (rid+1)*nfeature;
278  size_t j = 0;
279  for (size_t i = ibegin; i < iend; ++i, ++j) {
280  const size_t idx = feature_offsets_[j];
281  local_index[idx + rid] = index[i];
282  }
283  });
284  } else {
285  /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
286  size_t rbegin = 0;
287  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
288  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
289  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
290  const size_t batch_size = batch.Size();
291  CHECK_LT(batch_size, offset_vec.size());
292  for (size_t rid = 0; rid < batch_size; ++rid) {
293  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
294  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
295  const size_t ibegin = gmat.row_ptr[rbegin + rid];
296  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
297  CHECK_EQ(ibegin + inst.size(), iend);
298  size_t j = 0;
299  size_t fid = 0;
300  for (size_t i = ibegin; i < iend; ++i, ++j) {
301  fid = inst[j].index;
302  const size_t idx = feature_offsets_[fid];
303  /* rbegin allows to store indexes from specific SparsePage batch */
304  local_index[idx + rbegin + rid] = index[i];
305  missing_flags_[idx + rbegin + rid] = false;
306  }
307  }
308  rbegin += batch.Size();
309  }
310  }
311  }
312 
313  template<typename T>
314  inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
315  const size_t nfeature) {
316  std::vector<size_t> num_nonzeros;
317  num_nonzeros.resize(nfeature);
318  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
319 
320  T* local_index = reinterpret_cast<T*>(&index_[0]);
321  size_t rbegin = 0;
322  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
323  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
324  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
325  const size_t batch_size = batch.Size();
326  CHECK_LT(batch_size, offset_vec.size());
327  for (size_t rid = 0; rid < batch_size; ++rid) {
328  const size_t ibegin = gmat.row_ptr[rbegin + rid];
329  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
330  size_t fid = 0;
331  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
332  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
333 
334  CHECK_EQ(ibegin + inst.size(), iend);
335  size_t j = 0;
336  for (size_t i = ibegin; i < iend; ++i, ++j) {
337  const uint32_t bin_id = index[i];
338 
339  fid = inst[j].index;
340  if (type_[fid] == kDenseColumn) {
341  T* begin = &local_index[feature_offsets_[fid]];
342  begin[rid + rbegin] = bin_id - index_base_[fid];
343  missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
344  } else {
345  T* begin = &local_index[feature_offsets_[fid]];
346  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
347  row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
348  ++num_nonzeros[fid];
349  }
350  }
351  }
352  rbegin += batch.Size();
353  }
354  }
356  return bins_type_size_;
357  }
358 
359  // This is just an utility function
360  bool NoMissingValues(const size_t n_elements,
361  const size_t n_row, const size_t n_features) {
362  return n_elements == n_features * n_row;
363  }
364 
365  // And this returns part of state
366  bool AnyMissing() const {
367  return any_missing_;
368  }
369 
370  private:
371  std::vector<uint8_t> index_;
372 
373  std::vector<size_t> feature_counts_;
374  std::vector<ColumnType> type_;
375  std::vector<size_t> row_ind_;
376  /* indicate where each column's index and row_ind is stored. */
377  std::vector<size_t> feature_offsets_;
378 
379  // index_base_[fid]: least bin id for feature fid
380  uint32_t* index_base_;
381  std::vector<bool> missing_flags_;
382  BinTypeSize bins_type_size_;
383  bool any_missing_;
384 };
385 
386 } // namespace common
387 } // namespace xgboost
388 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
xgboost::common::Column::kMissingId
static constexpr int32_t kMissingId
Definition: column_matrix.h:34
xgboost::common::SparseColumn::SparseColumn
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:70
xgboost::common::DenseColumn::IsMissing
bool IsMissing(size_t idx) const
Definition: column_matrix.h:119
hist_util.h
Utility for fast histogram aggregation.
xgboost::common::kUint32BinsTypeSize
@ kUint32BinsTypeSize
Definition: hist_util.h:143
xgboost::common::Column
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:32
xgboost::SparsePage
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:259
xgboost::Entry
Element from a sparse vector.
Definition: data.h:187
xgboost::common::Column::GetFeatureBinIdx
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:47
xgboost::common::SparseColumn::GetBinIdx
int32_t GetBinIdx(size_t rid, size_t *state) const
Definition: column_matrix.h:77
xgboost::common::Column::GetFeatureBinIdxPtr
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:51
xgboost::common::ColumnMatrix::GetTypeSize
BinTypeSize GetTypeSize() const
Definition: column_matrix.h:355
xgboost::common::SparseColumn::GetInitialState
size_t GetInitialState(const size_t first_row_id) const
Definition: column_matrix.h:92
xgboost::common::ParallelFor
void ParallelFor(Index size, int32_t n_threads, Sched sched, Func fn)
Definition: threading_utils.h:173
xgboost::common::ColumnMatrix
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:141
xgboost::omp_ulong
dmlc::omp_ulong omp_ulong
define unsigned long for openmp loop
Definition: base.h:271
xgboost::common::ColumnMatrix::SetTypeSize
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:233
xgboost::common::SparseColumn::GetRowIdx
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:101
xgboost::common::ColumnMatrix::SetIndexAllDense
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:267
xgboost::common::ColumnMatrix::GetNumFeature
bst_uint GetNumFeature() const
Definition: column_matrix.h:144
xgboost::common::ColumnMatrix::AnyMissing
bool AnyMissing() const
Definition: column_matrix.h:366
xgboost::common::DenseColumn::GetInitialState
size_t GetInitialState(const size_t first_row_id) const
Definition: column_matrix.h:129
xgboost::common::kUint8BinsTypeSize
@ kUint8BinsTypeSize
Definition: hist_util.h:141
xgboost::common::ColumnMatrix::NoMissingValues
bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:360
xgboost::common::kDenseColumn
@ kDenseColumn
Definition: column_matrix.h:23
xgboost::common::DenseColumn
Definition: column_matrix.h:111
xgboost::common::SparseColumn
Definition: column_matrix.h:68
xgboost::common::Column::GetBaseIdx
uint32_t GetBaseIdx() const
Definition: column_matrix.h:49
xgboost::common::DenseColumn::DenseColumn
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:113
xgboost::common::DenseColumn::GetBinIdx
int32_t GetBinIdx(size_t idx, size_t *state) const
Definition: column_matrix.h:121
xgboost::common::Column::GetGlobalBinIdx
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:43
xgboost::common::BinTypeSize
BinTypeSize
Definition: hist_util.h:140
xgboost::common::Span< const BinIdxType >
xgboost::common::ColumnType
ColumnType
column type
Definition: column_matrix.h:22
xgboost::common::Column::Column
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:36
xgboost::common::Span::size
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:547
xgboost::common::ColumnMatrix::GetColumn
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:246
xgboost::common::Span::data
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:542
xgboost::common::SparseColumn::GetRowData
const size_t * GetRowData() const
Definition: column_matrix.h:75
xgboost::common::ColumnMatrix::Init
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:149
xgboost::bst_uint
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:113
xgboost::common::Column::Size
size_t Size() const
Definition: column_matrix.h:56
xgboost::common::kSparseColumn
@ kSparseColumn
Definition: column_matrix.h:24
xgboost::common::kUint16BinsTypeSize
@ kUint16BinsTypeSize
Definition: hist_util.h:142
xgboost::common::Column::~Column
virtual ~Column()=default
xgboost::common::Column::GetType
ColumnType GetType() const
Definition: column_matrix.h:53
xgboost
namespace of xgboost
Definition: base.h:110
xgboost::common::ColumnMatrix::SetIndex
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nfeature)
Definition: column_matrix.h:314