xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <limits>
12 #include <vector>
13 #include <memory>
14 #include "hist_util.h"
15 
16 namespace xgboost {
17 namespace common {
18 
19 class ColumnMatrix;
21 enum ColumnType {
24 };
25 
30 template <typename BinIdxType>
31 class Column {
32  public:
33  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
34  : type_(type),
35  index_(index),
36  index_base_(index_base) {}
37 
38  uint32_t GetGlobalBinIdx(size_t idx) const {
39  return index_base_ + static_cast<uint32_t>(index_[idx]);
40  }
41 
42  BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
43 
44  const uint32_t GetBaseIdx() const { return index_base_; }
45 
47 
48  ColumnType GetType() const { return type_; }
49 
50  /* returns number of elements in column */
51  size_t Size() const { return index_.size(); }
52 
53  private:
54  /* type of column */
55  ColumnType type_;
56  /* bin indexes in range [0, max_bins - 1] */
58  /* bin index offset for specific feature */
59  const uint32_t index_base_;
60 };
61 
62 template <typename BinIdxType>
63 class SparseColumn: public Column<BinIdxType> {
64  public:
66  uint32_t index_base, common::Span<const size_t> row_ind)
67  : Column<BinIdxType>(type, index, index_base),
68  row_ind_(row_ind) {}
69 
70  const size_t* GetRowData() const { return row_ind_.data(); }
71 
72  size_t GetRowIdx(size_t idx) const {
73  return row_ind_.data()[idx];
74  }
75 
76  private:
77  /* indexes of rows */
79 };
80 
81 template <typename BinIdxType>
82 class DenseColumn: public Column<BinIdxType> {
83  public:
85  uint32_t index_base, const std::vector<bool>& missing_flags,
86  size_t feature_offset)
87  : Column<BinIdxType>(type, index, index_base),
88  missing_flags_(missing_flags),
89  feature_offset_(feature_offset) {}
90  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
91  private:
92  /* flags for missing values in dense columns */
93  const std::vector<bool>& missing_flags_;
94  size_t feature_offset_;
95 };
96 
99 class ColumnMatrix {
100  public:
101  // get number of features
102  inline bst_uint GetNumFeature() const {
103  return static_cast<bst_uint>(type_.size());
104  }
105 
106  // construct column matrix from GHistIndexMatrix
107  inline void Init(const GHistIndexMatrix& gmat,
108  double sparse_threshold) {
109  const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
110  const size_t nrow = gmat.row_ptr.size() - 1;
111  // identify type of each column
112  feature_counts_.resize(nfeature);
113  type_.resize(nfeature);
114  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
115  uint32_t max_val = std::numeric_limits<uint32_t>::max();
116  for (int32_t fid = 0; fid < nfeature; ++fid) {
117  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
118  }
119  bool all_dense = gmat.IsDense();
120  gmat.GetFeatureCounts(&feature_counts_[0]);
121  // classify features
122  for (int32_t fid = 0; fid < nfeature; ++fid) {
123  if (static_cast<double>(feature_counts_[fid])
124  < sparse_threshold * nrow) {
125  type_[fid] = kSparseColumn;
126  all_dense = false;
127  } else {
128  type_[fid] = kDenseColumn;
129  }
130  }
131 
132  // want to compute storage boundary for each feature
133  // using variants of prefix sum scan
134  feature_offsets_.resize(nfeature + 1);
135  size_t accum_index_ = 0;
136  feature_offsets_[0] = accum_index_;
137  for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
138  if (type_[fid - 1] == kDenseColumn) {
139  accum_index_ += static_cast<size_t>(nrow);
140  } else {
141  accum_index_ += feature_counts_[fid - 1];
142  }
143  feature_offsets_[fid] = accum_index_;
144  }
145 
146  SetTypeSize(gmat.max_num_bins);
147 
148  index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
149  if (!all_dense) {
150  row_ind_.resize(feature_offsets_[nfeature]);
151  }
152 
153  // store least bin id for each feature
154  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
155 
156  const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
157 
158  if (noMissingValues) {
159  missing_flags_.resize(feature_offsets_[nfeature], false);
160  } else {
161  missing_flags_.resize(feature_offsets_[nfeature], true);
162  }
163 
164  // pre-fill index_ for dense columns
165  if (all_dense) {
166  BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
167  if (gmat_bin_size == kUint8BinsTypeSize) {
168  SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
169  } else if (gmat_bin_size == kUint16BinsTypeSize) {
170  SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
171  } else {
172  CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
173  SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
174  }
175  /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
176  but for ColumnMatrix we still have a chance to reduce the memory consumption */
177  } else {
178  if (bins_type_size_ == kUint8BinsTypeSize) {
179  SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
180  } else if (bins_type_size_ == kUint16BinsTypeSize) {
181  SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
182  } else {
183  CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
184  SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
185  }
186  }
187  }
188 
189  /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
190  void SetTypeSize(size_t max_num_bins) {
191  if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
192  bins_type_size_ = kUint8BinsTypeSize;
193  } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
194  bins_type_size_ = kUint16BinsTypeSize;
195  } else {
196  bins_type_size_ = kUint32BinsTypeSize;
197  }
198  }
199 
200  /* Fetch an individual column. This code should be used with type swith
201  to determine type of bin id's */
202  template <typename BinIdxType>
203  std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
204  CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
205 
206  const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
207  const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
208  common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
209  &index_[feature_offset * bins_type_size_]),
210  column_size };
211  std::unique_ptr<const Column<BinIdxType> > res;
212  if (type_[fid] == ColumnType::kDenseColumn) {
213  res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
214  missing_flags_, feature_offset));
215  } else {
216  res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
217  {&row_ind_[feature_offset], column_size}));
218  }
219  return res;
220  }
221 
222  template<typename T>
223  inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow,
224  const size_t nfeature, const bool noMissingValues) {
225  T* local_index = reinterpret_cast<T*>(&index_[0]);
226 
227  /* missing values make sense only for column with type kDenseColumn,
228  and if no missing values were observed it could be handled much faster. */
229  if (noMissingValues) {
230 #pragma omp parallel for num_threads(omp_get_max_threads())
231  for (omp_ulong rid = 0; rid < nrow; ++rid) {
232  const size_t ibegin = rid*nfeature;
233  const size_t iend = (rid+1)*nfeature;
234  size_t j = 0;
235  for (size_t i = ibegin; i < iend; ++i, ++j) {
236  const size_t idx = feature_offsets_[j];
237  local_index[idx + rid] = index[i];
238  }
239  }
240  } else {
241  /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
242  size_t rbegin = 0;
243  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
244  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
245  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
246  const size_t batch_size = batch.Size();
247  CHECK_LT(batch_size, offset_vec.size());
248  for (size_t rid = 0; rid < batch_size; ++rid) {
249  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
250  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
251  const size_t ibegin = gmat.row_ptr[rbegin + rid];
252  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
253  CHECK_EQ(ibegin + inst.size(), iend);
254  size_t j = 0;
255  size_t fid = 0;
256  for (size_t i = ibegin; i < iend; ++i, ++j) {
257  fid = inst[j].index;
258  const size_t idx = feature_offsets_[fid];
259  /* rbegin allows to store indexes from specific SparsePage batch */
260  local_index[idx + rbegin + rid] = index[i];
261  missing_flags_[idx + rbegin + rid] = false;
262  }
263  }
264  rbegin += batch.Size();
265  }
266  }
267  }
268 
269  template<typename T>
270  inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
271  const size_t nrow, const size_t nfeature) {
272  std::vector<size_t> num_nonzeros;
273  num_nonzeros.resize(nfeature);
274  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
275 
276  T* local_index = reinterpret_cast<T*>(&index_[0]);
277  size_t rbegin = 0;
278  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
279  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
280  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
281  const size_t batch_size = batch.Size();
282  CHECK_LT(batch_size, offset_vec.size());
283  for (size_t rid = 0; rid < batch_size; ++rid) {
284  const size_t ibegin = gmat.row_ptr[rbegin + rid];
285  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
286  size_t fid = 0;
287  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
288  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
289 
290  CHECK_EQ(ibegin + inst.size(), iend);
291  size_t j = 0;
292  for (size_t i = ibegin; i < iend; ++i, ++j) {
293  const uint32_t bin_id = index[i];
294 
295  fid = inst[j].index;
296  if (type_[fid] == kDenseColumn) {
297  T* begin = &local_index[feature_offsets_[fid]];
298  begin[rid + rbegin] = bin_id - index_base_[fid];
299  missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
300  } else {
301  T* begin = &local_index[feature_offsets_[fid]];
302  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
303  row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
304  ++num_nonzeros[fid];
305  }
306  }
307  }
308  rbegin += batch.Size();
309  }
310  }
311  const BinTypeSize GetTypeSize() const {
312  return bins_type_size_;
313  }
314  const bool NoMissingValues(const size_t n_elements,
315  const size_t n_row, const size_t n_features) {
316  return n_elements == n_features * n_row;
317  }
318 
319  private:
320  std::vector<uint8_t> index_;
321 
322  std::vector<size_t> feature_counts_;
323  std::vector<ColumnType> type_;
324  std::vector<size_t> row_ind_;
325  /* indicate where each column's index and row_ind is stored. */
326  std::vector<size_t> feature_offsets_;
327 
328  // index_base_[fid]: least bin id for feature fid
329  uint32_t* index_base_;
330  std::vector<bool> missing_flags_;
331  BinTypeSize bins_type_size_;
332 };
333 
334 } // namespace common
335 } // namespace xgboost
336 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
Index index
The index data.
Definition: hist_util.h:318
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:65
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
Definition: column_matrix.h:82
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
Definition: hist_util.h:215
const size_t * GetRowData() const
Definition: column_matrix.h:70
const uint32_t GetBaseIdx() const
Definition: column_matrix.h:44
T * data() const
Definition: hist_util.h:257
bst_uint GetNumFeature() const
Definition: column_matrix.h:102
dmlc::omp_ulong omp_ulong
define unsigned long for openmp loop
Definition: base.h:251
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:322
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:33
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:211
ColumnType GetType() const
Definition: column_matrix.h:48
bool IsDense() const
Definition: hist_util.h:353
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:31
Definition: hist_util.h:216
BatchSet< T > GetBatches(const BatchParam &param={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
const BinTypeSize GetTypeSize() const
Definition: column_matrix.h:311
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:253
Definition: column_matrix.h:63
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:190
Utility for fast histogram aggregation.
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:99
BinTypeSize
Definition: hist_util.h:214
const bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:314
namespace of xgboost
Definition: base.h:102
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:203
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:46
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:84
size_t max_num_bins
Definition: hist_util.h:324
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:107
size_t Size() const
Definition: column_matrix.h:51
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:316
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:38
Element from a sparse vector.
Definition: data.h:167
ColumnType
column type
Definition: column_matrix.h:21
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Definition: column_matrix.h:22
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:72
preprocessed global index matrix, in CSR format
Definition: hist_util.h:314
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:42
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature)
Definition: column_matrix.h:270
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:343
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:223
Definition: column_matrix.h:23
Definition: hist_util.h:217
bool IsMissing(size_t idx) const
Definition: column_matrix.h:90
DMatrix * p_fmat
Definition: hist_util.h:323