xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <limits>
12 #include <vector>
13 #include <memory>
14 #include "hist_util.h"
15 
16 namespace xgboost {
17 namespace common {
18 
19 class ColumnMatrix;
21 enum ColumnType {
24 };
25 
30 template <typename BinIdxType>
31 class Column {
32  public:
33  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
34  : type_(type),
35  index_(index),
36  index_base_(index_base) {}
37 
38  virtual ~Column() = default;
39 
40  uint32_t GetGlobalBinIdx(size_t idx) const {
41  return index_base_ + static_cast<uint32_t>(index_[idx]);
42  }
43 
44  BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
45 
46  const uint32_t GetBaseIdx() const { return index_base_; }
47 
49 
50  ColumnType GetType() const { return type_; }
51 
52  /* returns number of elements in column */
53  size_t Size() const { return index_.size(); }
54 
55  private:
56  /* type of column */
57  ColumnType type_;
58  /* bin indexes in range [0, max_bins - 1] */
60  /* bin index offset for specific feature */
61  const uint32_t index_base_;
62 };
63 
64 template <typename BinIdxType>
65 class SparseColumn: public Column<BinIdxType> {
66  public:
68  uint32_t index_base, common::Span<const size_t> row_ind)
69  : Column<BinIdxType>(type, index, index_base),
70  row_ind_(row_ind) {}
71 
72  const size_t* GetRowData() const { return row_ind_.data(); }
73 
74  size_t GetRowIdx(size_t idx) const {
75  return row_ind_.data()[idx];
76  }
77 
78  private:
79  /* indexes of rows */
81 };
82 
83 template <typename BinIdxType>
84 class DenseColumn: public Column<BinIdxType> {
85  public:
87  uint32_t index_base, const std::vector<bool>& missing_flags,
88  size_t feature_offset)
89  : Column<BinIdxType>(type, index, index_base),
90  missing_flags_(missing_flags),
91  feature_offset_(feature_offset) {}
92  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
93  private:
94  /* flags for missing values in dense columns */
95  const std::vector<bool>& missing_flags_;
96  size_t feature_offset_;
97 };
98 
102  public:
103  // get number of features
104  inline bst_uint GetNumFeature() const {
105  return static_cast<bst_uint>(type_.size());
106  }
107 
108  // construct column matrix from GHistIndexMatrix
109  inline void Init(const GHistIndexMatrix& gmat,
110  double sparse_threshold) {
111  const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
112  const size_t nrow = gmat.row_ptr.size() - 1;
113  // identify type of each column
114  feature_counts_.resize(nfeature);
115  type_.resize(nfeature);
116  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
117  uint32_t max_val = std::numeric_limits<uint32_t>::max();
118  for (int32_t fid = 0; fid < nfeature; ++fid) {
119  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
120  }
121  bool all_dense = gmat.IsDense();
122  gmat.GetFeatureCounts(&feature_counts_[0]);
123  // classify features
124  for (int32_t fid = 0; fid < nfeature; ++fid) {
125  if (static_cast<double>(feature_counts_[fid])
126  < sparse_threshold * nrow) {
127  type_[fid] = kSparseColumn;
128  all_dense = false;
129  } else {
130  type_[fid] = kDenseColumn;
131  }
132  }
133 
134  // want to compute storage boundary for each feature
135  // using variants of prefix sum scan
136  feature_offsets_.resize(nfeature + 1);
137  size_t accum_index_ = 0;
138  feature_offsets_[0] = accum_index_;
139  for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
140  if (type_[fid - 1] == kDenseColumn) {
141  accum_index_ += static_cast<size_t>(nrow);
142  } else {
143  accum_index_ += feature_counts_[fid - 1];
144  }
145  feature_offsets_[fid] = accum_index_;
146  }
147 
148  SetTypeSize(gmat.max_num_bins);
149 
150  index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
151  if (!all_dense) {
152  row_ind_.resize(feature_offsets_[nfeature]);
153  }
154 
155  // store least bin id for each feature
156  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
157 
158  const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
159  any_missing_ = !noMissingValues;
160 
161  if (noMissingValues) {
162  missing_flags_.resize(feature_offsets_[nfeature], false);
163  } else {
164  missing_flags_.resize(feature_offsets_[nfeature], true);
165  }
166 
167  // pre-fill index_ for dense columns
168  if (all_dense) {
169  BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
170  if (gmat_bin_size == kUint8BinsTypeSize) {
171  SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
172  } else if (gmat_bin_size == kUint16BinsTypeSize) {
173  SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
174  } else {
175  CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
176  SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
177  }
178  /* For sparse DMatrix gmat.index.getBinTypeSize() returns always kUint32BinsTypeSize
179  but for ColumnMatrix we still have a chance to reduce the memory consumption */
180  } else {
181  if (bins_type_size_ == kUint8BinsTypeSize) {
182  SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
183  } else if (bins_type_size_ == kUint16BinsTypeSize) {
184  SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
185  } else {
186  CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
187  SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
188  }
189  }
190  }
191 
192  /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
193  void SetTypeSize(size_t max_num_bins) {
194  if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
195  bins_type_size_ = kUint8BinsTypeSize;
196  } else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
197  bins_type_size_ = kUint16BinsTypeSize;
198  } else {
199  bins_type_size_ = kUint32BinsTypeSize;
200  }
201  }
202 
203  /* Fetch an individual column. This code should be used with type swith
204  to determine type of bin id's */
205  template <typename BinIdxType>
206  std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
207  CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
208 
209  const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
210  const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
211  common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
212  &index_[feature_offset * bins_type_size_]),
213  column_size };
214  std::unique_ptr<const Column<BinIdxType> > res;
215  if (type_[fid] == ColumnType::kDenseColumn) {
216  res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
217  missing_flags_, feature_offset));
218  } else {
219  res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
220  {&row_ind_[feature_offset], column_size}));
221  }
222  return res;
223  }
224 
225  template<typename T>
226  inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow,
227  const size_t nfeature, const bool noMissingValues) {
228  T* local_index = reinterpret_cast<T*>(&index_[0]);
229 
230  /* missing values make sense only for column with type kDenseColumn,
231  and if no missing values were observed it could be handled much faster. */
232  if (noMissingValues) {
233 #pragma omp parallel for num_threads(omp_get_max_threads())
234  for (omp_ulong rid = 0; rid < nrow; ++rid) {
235  const size_t ibegin = rid*nfeature;
236  const size_t iend = (rid+1)*nfeature;
237  size_t j = 0;
238  for (size_t i = ibegin; i < iend; ++i, ++j) {
239  const size_t idx = feature_offsets_[j];
240  local_index[idx + rid] = index[i];
241  }
242  }
243  } else {
244  /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
245  size_t rbegin = 0;
246  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
247  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
248  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
249  const size_t batch_size = batch.Size();
250  CHECK_LT(batch_size, offset_vec.size());
251  for (size_t rid = 0; rid < batch_size; ++rid) {
252  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
253  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
254  const size_t ibegin = gmat.row_ptr[rbegin + rid];
255  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
256  CHECK_EQ(ibegin + inst.size(), iend);
257  size_t j = 0;
258  size_t fid = 0;
259  for (size_t i = ibegin; i < iend; ++i, ++j) {
260  fid = inst[j].index;
261  const size_t idx = feature_offsets_[fid];
262  /* rbegin allows to store indexes from specific SparsePage batch */
263  local_index[idx + rbegin + rid] = index[i];
264  missing_flags_[idx + rbegin + rid] = false;
265  }
266  }
267  rbegin += batch.Size();
268  }
269  }
270  }
271 
272  template<typename T>
273  inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
274  const size_t nrow, const size_t nfeature) {
275  std::vector<size_t> num_nonzeros;
276  num_nonzeros.resize(nfeature);
277  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
278 
279  T* local_index = reinterpret_cast<T*>(&index_[0]);
280  size_t rbegin = 0;
281  for (const auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
282  const xgboost::Entry* data_ptr = batch.data.HostVector().data();
283  const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
284  const size_t batch_size = batch.Size();
285  CHECK_LT(batch_size, offset_vec.size());
286  for (size_t rid = 0; rid < batch_size; ++rid) {
287  const size_t ibegin = gmat.row_ptr[rbegin + rid];
288  const size_t iend = gmat.row_ptr[rbegin + rid + 1];
289  size_t fid = 0;
290  const size_t size = offset_vec[rid + 1] - offset_vec[rid];
291  SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
292 
293  CHECK_EQ(ibegin + inst.size(), iend);
294  size_t j = 0;
295  for (size_t i = ibegin; i < iend; ++i, ++j) {
296  const uint32_t bin_id = index[i];
297 
298  fid = inst[j].index;
299  if (type_[fid] == kDenseColumn) {
300  T* begin = &local_index[feature_offsets_[fid]];
301  begin[rid + rbegin] = bin_id - index_base_[fid];
302  missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
303  } else {
304  T* begin = &local_index[feature_offsets_[fid]];
305  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
306  row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
307  ++num_nonzeros[fid];
308  }
309  }
310  }
311  rbegin += batch.Size();
312  }
313  }
314  const BinTypeSize GetTypeSize() const {
315  return bins_type_size_;
316  }
317 
318  // This is just an utility function
319  const bool NoMissingValues(const size_t n_elements,
320  const size_t n_row, const size_t n_features) {
321  return n_elements == n_features * n_row;
322  }
323 
324  // And this returns part of state
325  const bool AnyMissing() const {
326  return any_missing_;
327  }
328 
329  private:
330  std::vector<uint8_t> index_;
331 
332  std::vector<size_t> feature_counts_;
333  std::vector<ColumnType> type_;
334  std::vector<size_t> row_ind_;
335  /* indicate where each column's index and row_ind is stored. */
336  std::vector<size_t> feature_offsets_;
337 
338  // index_base_[fid]: least bin id for feature fid
339  uint32_t* index_base_;
340  std::vector<bool> missing_flags_;
341  BinTypeSize bins_type_size_;
342  bool any_missing_;
343 };
344 
345 } // namespace common
346 } // namespace xgboost
347 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
virtual ~Column()=default
Index index
The index data.
Definition: hist_util.h:306
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:67
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
Definition: column_matrix.h:84
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
Definition: hist_util.h:203
const size_t * GetRowData() const
Definition: column_matrix.h:72
const uint32_t GetBaseIdx() const
Definition: column_matrix.h:46
T * data() const
Definition: hist_util.h:245
bst_uint GetNumFeature() const
Definition: column_matrix.h:104
dmlc::omp_ulong omp_ulong
define unsigned long for openmp loop
Definition: base.h:259
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:310
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:33
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:245
ColumnType GetType() const
Definition: column_matrix.h:50
bool IsDense() const
Definition: hist_util.h:341
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:31
Definition: hist_util.h:204
BatchSet< T > GetBatches(const BatchParam &param={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
const BinTypeSize GetTypeSize() const
Definition: column_matrix.h:314
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:241
Definition: column_matrix.h:65
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:193
Utility for fast histogram aggregation.
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:101
BinTypeSize
Definition: hist_util.h:202
const bool AnyMissing() const
Definition: column_matrix.h:325
const bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:319
namespace of xgboost
Definition: base.h:102
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:206
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:48
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:86
size_t max_num_bins
Definition: hist_util.h:312
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:109
size_t Size() const
Definition: column_matrix.h:53
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:304
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:40
Element from a sparse vector.
Definition: data.h:201
ColumnType
column type
Definition: column_matrix.h:21
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Definition: column_matrix.h:22
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:74
preprocessed global index matrix, in CSR format
Definition: hist_util.h:302
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:44
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature)
Definition: column_matrix.h:273
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:331
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:226
Definition: column_matrix.h:23
Definition: hist_util.h:205
bool IsMissing(size_t idx) const
Definition: column_matrix.h:92
DMatrix * p_fmat
Definition: hist_util.h:311