xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <limits>
12 #include <vector>
13 #include <memory>
14 #include "hist_util.h"
15 
16 namespace xgboost {
17 namespace common {
18 
19 class ColumnMatrix;
21 enum ColumnType {
24 };
25 
30 template <typename BinIdxType>
31 class Column {
32  public:
33  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
34  : type_(type),
35  index_(index),
36  index_base_(index_base) {}
37 
38  virtual ~Column() = default;
39 
40  uint32_t GetGlobalBinIdx(size_t idx) const {
41  return index_base_ + static_cast<uint32_t>(index_[idx]);
42  }
43 
44  BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
45 
46  uint32_t GetBaseIdx() const { return index_base_; }
47 
49 
50  ColumnType GetType() const { return type_; }
51 
52  /* returns number of elements in column */
53  size_t Size() const { return index_.size(); }
54 
55  private:
56  /* type of column */
57  ColumnType type_;
58  /* bin indexes in range [0, max_bins - 1] */
60  /* bin index offset for specific feature */
61  const uint32_t index_base_;
62 };
63 
64 template <typename BinIdxType>
65 class SparseColumn: public Column<BinIdxType> {
66  public:
68  uint32_t index_base, common::Span<const size_t> row_ind)
69  : Column<BinIdxType>(type, index, index_base),
70  row_ind_(row_ind) {}
71 
72  const size_t* GetRowData() const { return row_ind_.data(); }
73 
74  size_t GetRowIdx(size_t idx) const {
75  return row_ind_.data()[idx];
76  }
77 
78  private:
79  /* indexes of rows */
81 };
82 
83 template <typename BinIdxType>
84 class DenseColumn: public Column<BinIdxType> {
85  public:
87  uint32_t index_base, const std::vector<bool>& missing_flags,
88  size_t feature_offset)
89  : Column<BinIdxType>(type, index, index_base),
90  missing_flags_(missing_flags),
91  feature_offset_(feature_offset) {}
92  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
93  private:
94  /* flags for missing values in dense columns */
95  const std::vector<bool>& missing_flags_;
96  size_t feature_offset_;
97 };
98 
102  public:
103  // get number of features
104  inline bst_uint GetNumFeature() const {
105  return static_cast<bst_uint>(type_.size());
106  }
107 
108  // construct column matrix from GHistIndexMatrix
109  inline void Init(const GHistIndexMatrix& gmat,
110  double sparse_threshold) {
111  const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
112  const size_t nrow = gmat.row_ptr.size() - 1;
113  // identify type of each column
114  feature_counts_.resize(nfeature);
115  type_.resize(nfeature);
116  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
117  uint32_t max_val = std::numeric_limits<uint32_t>::max();
118  for (int32_t fid = 0; fid < nfeature; ++fid) {
119  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
120  }
121  bool all_dense = gmat.IsDense();
122  gmat.GetFeatureCounts(&feature_counts_[0]);
123  // classify features
124  for (int32_t fid = 0; fid < nfeature; ++fid) {
125  if (static_cast<double>(feature_counts_[fid])
126  < sparse_threshold * nrow) {
127  type_[fid] = kSparseColumn;
128  all_dense = false;
129  } else {
130  type_[fid] = kDenseColumn;
131  }
132  }
133 
134  // want to compute storage boundary for each feature
135  // using variants of prefix sum scan
136  feature_offsets_.resize(nfeature + 1);
137  size_t accum_index_ = 0;
138  feature_offsets_[0] = accum_index_;
139  for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
140  if (type_[fid - 1] == kDenseColumn) {
141  accum_index_ += static_cast<size_t>(nrow);
142  } else {
143  accum_index_ += feature_counts_[fid - 1];
144  }
145  feature_offsets_[fid] = accum_index_;
146  }
147 
149 
150  index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
151  if (!all_dense) {
152  row_ind_.resize(feature_offsets_[nfeature]);
153  }
154 
155  // store least bin id for each feature
156  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
157 
158  const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
159  any_missing_ = !noMissingValues;
160 
161  if (noMissingValues) {
162  missing_flags_.resize(feature_offsets_[nfeature], false);
163  } else {
164  missing_flags_.resize(feature_offsets_[nfeature], true);
165  }
166 
167  // pre-fill index_ for dense columns
168  if (all_dense) {
169  BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
170  if (gmat_bin_size == kUint8BinsTypeSize) {
171  SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
172  } else if (gmat_bin_size == kUint16BinsTypeSize) {
173  SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
174  } else {
175  CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
176  SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
177  }
178  /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
179  but for ColumnMatrix we still have a chance to reduce the memory consumption */
180  } else {
181  if (bins_type_size_ == kUint8BinsTypeSize) {
182  SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
183  } else if (bins_type_size_ == kUint16BinsTypeSize) {
184  SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
185  } else {
186  CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
187  SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
188  }
189  }
190  }
191 
192  /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
193  void SetTypeSize(size_t max_num_bins) {
194  if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
195  bins_type_size_ = kUint8BinsTypeSize;
196  } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
197  bins_type_size_ = kUint16BinsTypeSize;
198  } else {
199  bins_type_size_ = kUint32BinsTypeSize;
200  }
201  }
202 
203  /* Fetch an individual column. This code should be used with type swith
204  to determine type of bin id's */
205  template <typename BinIdxType>
206  std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
207  CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
208 
209  const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
210  const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
211  common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
212  &index_[feature_offset * bins_type_size_]),
213  column_size };
214  std::unique_ptr<const Column<BinIdxType> > res;
215  if (type_[fid] == ColumnType::kDenseColumn) {
216  res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
217  missing_flags_, feature_offset));
218  } else {
219  res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
220  {&row_ind_[feature_offset], column_size}));
221  }
222  return res;
223  }
224 
225  template<typename T>
226  inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow,
227  const size_t nfeature, const bool noMissingValues) {
228  T* local_index = reinterpret_cast<T*>(&index_[0]);
229 
230  /* missing values make sense only for column with type kDenseColumn,
231  and if no missing values were observed it could be handled much faster. */
232  if (noMissingValues) {
233  ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
234  const size_t ibegin = rid*nfeature;
235  const size_t iend = (rid+1)*nfeature;
236  size_t j = 0;
237  for (size_t i = ibegin; i < iend; ++i, ++j) {
238  const size_t idx = feature_offsets_[j];
239  local_index[idx + rid] = index[i];
240  }
241  });
242  } else {
243  /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
244  size_t rbegin = 0;
245  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
246  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
247  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
248  const size_t batch_size = batch.Size();
249  CHECK_LT(batch_size, offset_vec.size());
250  for (size_t rid = 0; rid < batch_size; ++rid) {
251  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
252  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
253  const size_t ibegin = gmat.row_ptr[rbegin + rid];
254  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
255  CHECK_EQ(ibegin + inst.size(), iend);
256  size_t j = 0;
257  size_t fid = 0;
258  for (size_t i = ibegin; i < iend; ++i, ++j) {
259  fid = inst[j].index;
260  const size_t idx = feature_offsets_[fid];
261  /* rbegin allows to store indexes from specific SparsePage batch */
262  local_index[idx + rbegin + rid] = index[i];
263  missing_flags_[idx + rbegin + rid] = false;
264  }
265  }
266  rbegin += batch.Size();
267  }
268  }
269  }
270 
271  template<typename T>
272  inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
273  const size_t nfeature) {
274  std::vector<size_t> num_nonzeros;
275  num_nonzeros.resize(nfeature);
276  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
277 
278  T* local_index = reinterpret_cast<T*>(&index_[0]);
279  size_t rbegin = 0;
280  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
281  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
282  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
283  const size_t batch_size = batch.Size();
284  CHECK_LT(batch_size, offset_vec.size());
285  for (size_t rid = 0; rid < batch_size; ++rid) {
286  const size_t ibegin = gmat.row_ptr[rbegin + rid];
287  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
288  size_t fid = 0;
289  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
290  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
291 
292  CHECK_EQ(ibegin + inst.size(), iend);
293  size_t j = 0;
294  for (size_t i = ibegin; i < iend; ++i, ++j) {
295  const uint32_t bin_id = index[i];
296 
297  fid = inst[j].index;
298  if (type_[fid] == kDenseColumn) {
299  T* begin = &local_index[feature_offsets_[fid]];
300  begin[rid + rbegin] = bin_id - index_base_[fid];
301  missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
302  } else {
303  T* begin = &local_index[feature_offsets_[fid]];
304  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
305  row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
306  ++num_nonzeros[fid];
307  }
308  }
309  }
310  rbegin += batch.Size();
311  }
312  }
314  return bins_type_size_;
315  }
316 
317  // This is just an utility function
318  bool NoMissingValues(const size_t n_elements,
319  const size_t n_row, const size_t n_features) {
320  return n_elements == n_features * n_row;
321  }
322 
323  // And this returns part of state
324  bool AnyMissing() const {
325  return any_missing_;
326  }
327 
328  private:
329  std::vector<uint8_t> index_;
330 
331  std::vector<size_t> feature_counts_;
332  std::vector<ColumnType> type_;
333  std::vector<size_t> row_ind_;
334  /* indicate where each column's index and row_ind is stored. */
335  std::vector<size_t> feature_offsets_;
336 
337  // index_base_[fid]: least bin id for feature fid
338  uint32_t* index_base_;
339  std::vector<bool> missing_flags_;
340  BinTypeSize bins_type_size_;
341  bool any_missing_;
342 };
343 
344 } // namespace common
345 } // namespace xgboost
346 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
xgboost::common::GHistIndexMatrix::row_ptr
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:238
xgboost::common::BinTypeSize
BinTypeSize
Definition: hist_util.h:136
xgboost::common::ColumnMatrix::GetColumn
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:206
xgboost::common::SparseColumn::SparseColumn
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:67
hist_util.h
Utility for fast histogram aggregation.
xgboost::common::Column
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:31
xgboost::SparsePage
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:243
xgboost::common::kUint16BinsTypeSize
@ kUint16BinsTypeSize
Definition: hist_util.h:138
xgboost::Entry
Element from a sparse vector.
Definition: data.h:184
xgboost::common::GHistIndexMatrix::p_fmat
DMatrix * p_fmat
Definition: hist_util.h:245
xgboost::common::Column::GetFeatureBinIdx
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:44
xgboost::common::Column::GetFeatureBinIdxPtr
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:48
xgboost::common::ColumnMatrix::GetTypeSize
BinTypeSize GetTypeSize() const
Definition: column_matrix.h:313
xgboost::common::GHistIndexMatrix::index
Index index
The index data.
Definition: hist_util.h:240
xgboost::common::GHistIndexMatrix::cut
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:244
xgboost::common::GHistIndexMatrix::GetFeatureCounts
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:278
xgboost::common::ColumnMatrix
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:101
xgboost::omp_ulong
dmlc::omp_ulong omp_ulong
define unsigned long for openmp loop
Definition: base.h:268
xgboost::common::ColumnMatrix::SetTypeSize
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:193
xgboost::common::SparseColumn::GetRowIdx
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:74
xgboost::common::Index::data
T * data() const
Definition: hist_util.h:179
xgboost::common::GHistIndexMatrix
preprocessed global index matrix, in CSR format
Definition: hist_util.h:236
xgboost::common::HistogramCuts::Ptrs
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:86
xgboost::common::ColumnMatrix::SetIndexAllDense
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:226
xgboost::common::ColumnMatrix::GetNumFeature
bst_uint GetNumFeature() const
Definition: column_matrix.h:104
xgboost::common::ColumnMatrix::AnyMissing
bool AnyMissing() const
Definition: column_matrix.h:324
xgboost::common::ColumnMatrix::NoMissingValues
bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:318
xgboost::common::kDenseColumn
@ kDenseColumn
Definition: column_matrix.h:22
xgboost::common::DenseColumn
Definition: column_matrix.h:84
xgboost::common::SparseColumn
Definition: column_matrix.h:65
xgboost::common::Column::GetBaseIdx
uint32_t GetBaseIdx() const
Definition: column_matrix.h:46
xgboost::common::ParallelFor
void ParallelFor(Index size, size_t nthreads, Func fn)
Definition: threading_utils.h:137
xgboost::common::DenseColumn::DenseColumn
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:86
xgboost::common::GHistIndexMatrix::IsDense
bool IsDense() const
Definition: hist_util.h:288
xgboost::common::Column::GetGlobalBinIdx
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:40
xgboost::common::Index::GetBinTypeSize
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:175
xgboost::common::GHistIndexMatrix::max_num_bins
size_t max_num_bins
Definition: hist_util.h:246
xgboost::common::Span< const BinIdxType >
xgboost::common::kUint32BinsTypeSize
@ kUint32BinsTypeSize
Definition: hist_util.h:139
xgboost::common::ColumnType
ColumnType
column type
Definition: column_matrix.h:21
xgboost::common::kUint8BinsTypeSize
@ kUint8BinsTypeSize
Definition: hist_util.h:137
xgboost::common::Column::Column
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:33
xgboost::common::Span::size
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:547
xgboost::common::Span::data
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:542
xgboost::common::SparseColumn::GetRowData
const size_t * GetRowData() const
Definition: column_matrix.h:72
xgboost::common::ColumnMatrix::Init
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:109
xgboost::bst_uint
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:113
xgboost::common::Column::Size
size_t Size() const
Definition: column_matrix.h:53
xgboost::common::kSparseColumn
@ kSparseColumn
Definition: column_matrix.h:23
xgboost::DMatrix::GetBatches
BatchSet< T > GetBatches(const BatchParam &param={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
xgboost::common::Column::~Column
virtual ~Column()=default
xgboost::common::Column::GetType
ColumnType GetType() const
Definition: column_matrix.h:50
xgboost
namespace of xgboost
Definition: base.h:110
xgboost::common::DenseColumn::IsMissing
bool IsMissing(size_t idx) const
Definition: column_matrix.h:92
xgboost::common::ColumnMatrix::SetIndex
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nfeature)
Definition: column_matrix.h:272