xgboost
column_matrix.h
Go to the documentation of this file.
1 
8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
10 
11 #include <limits>
12 #include <vector>
13 #include "hist_util.h"
14 
15 
16 namespace xgboost {
17 namespace common {
18 
19 
21 enum ColumnType {
24 };
25 
28 class Column {
29  public:
30  Column(ColumnType type, const uint32_t* index, uint32_t index_base,
31  const size_t* row_ind, size_t len)
32  : type_(type),
33  index_(index),
34  index_base_(index_base),
35  row_ind_(row_ind),
36  len_(len) {}
37  size_t Size() const { return len_; }
38  uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
39  uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
40  // column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
41  // column.GetGlobalBinIdx(idx)
42  uint32_t GetBaseIdx() const { return index_base_; }
43  ColumnType GetType() const { return type_; }
44  size_t GetRowIdx(size_t idx) const {
45  // clang-tidy worries that row_ind_ might be a nullptr, which is possible,
46  // but low level structure is not safe anyway.
47  return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx]; // NOLINT
48  }
49  bool IsMissing(size_t idx) const {
50  return index_[idx] == std::numeric_limits<uint32_t>::max();
51  }
52  const size_t* GetRowData() const { return row_ind_; }
53 
54  private:
55  ColumnType type_;
56  const uint32_t* index_;
57  uint32_t index_base_;
58  const size_t* row_ind_;
59  const size_t len_;
60 };
61 
64 class ColumnMatrix {
65  public:
66  // get number of features
67  inline bst_uint GetNumFeature() const {
68  return static_cast<bst_uint>(type_.size());
69  }
70 
71  // construct column matrix from GHistIndexMatrix
72  inline void Init(const GHistIndexMatrix& gmat,
73  double sparse_threshold) {
74  const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
75  const size_t nrow = gmat.row_ptr.size() - 1;
76 
77  // identify type of each column
78  feature_counts_.resize(nfeature);
79  type_.resize(nfeature);
80  std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
81 
82  uint32_t max_val = std::numeric_limits<uint32_t>::max();
83  for (int32_t fid = 0; fid < nfeature; ++fid) {
84  CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
85  }
86 
87  gmat.GetFeatureCounts(&feature_counts_[0]);
88  // classify features
89  for (int32_t fid = 0; fid < nfeature; ++fid) {
90  if (static_cast<double>(feature_counts_[fid])
91  < sparse_threshold * nrow) {
92  type_[fid] = kSparseColumn;
93  } else {
94  type_[fid] = kDenseColumn;
95  }
96  }
97 
98  // want to compute storage boundary for each feature
99  // using variants of prefix sum scan
100  boundary_.resize(nfeature);
101  size_t accum_index_ = 0;
102  size_t accum_row_ind_ = 0;
103  for (int32_t fid = 0; fid < nfeature; ++fid) {
104  boundary_[fid].index_begin = accum_index_;
105  boundary_[fid].row_ind_begin = accum_row_ind_;
106  if (type_[fid] == kDenseColumn) {
107  accum_index_ += static_cast<size_t>(nrow);
108  accum_row_ind_ += static_cast<size_t>(nrow);
109  } else {
110  accum_index_ += feature_counts_[fid];
111  accum_row_ind_ += feature_counts_[fid];
112  }
113  boundary_[fid].index_end = accum_index_;
114  boundary_[fid].row_ind_end = accum_row_ind_;
115  }
116 
117  index_.resize(boundary_[nfeature - 1].index_end);
118  row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
119 
120  // store least bin id for each feature
121  index_base_.resize(nfeature);
122  for (int32_t fid = 0; fid < nfeature; ++fid) {
123  index_base_[fid] = gmat.cut.Ptrs()[fid];
124  }
125 
126  // pre-fill index_ for dense columns
127 
128  #pragma omp parallel for
129  for (int32_t fid = 0; fid < nfeature; ++fid) {
130  if (type_[fid] == kDenseColumn) {
131  const size_t ibegin = boundary_[fid].index_begin;
132  uint32_t* begin = &index_[ibegin];
133  uint32_t* end = begin + nrow;
134  std::fill(begin, end, std::numeric_limits<uint32_t>::max());
135  // max() indicates missing values
136  }
137  }
138 
139  // loop over all rows and fill column entries
140  // num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
141  std::vector<size_t> num_nonzeros;
142  num_nonzeros.resize(nfeature);
143  std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
144  for (size_t rid = 0; rid < nrow; ++rid) {
145  const size_t ibegin = gmat.row_ptr[rid];
146  const size_t iend = gmat.row_ptr[rid + 1];
147  size_t fid = 0;
148  for (size_t i = ibegin; i < iend; ++i) {
149  const uint32_t bin_id = gmat.index[i];
150  auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
151  gmat.cut.Ptrs().cend(), bin_id);
152  fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
153  if (type_[fid] == kDenseColumn) {
154  uint32_t* begin = &index_[boundary_[fid].index_begin];
155  begin[rid] = bin_id - index_base_[fid];
156  } else {
157  uint32_t* begin = &index_[boundary_[fid].index_begin];
158  begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
159  row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
160  ++num_nonzeros[fid];
161  }
162  }
163  }
164  }
165 
166  /* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
167  to determine type of bin id's */
168  inline Column GetColumn(unsigned fid) const {
169  Column c(type_[fid], &index_[boundary_[fid].index_begin], index_base_[fid],
170  (type_[fid] == ColumnType::kSparseColumn ?
171  &row_ind_[boundary_[fid].row_ind_begin] : nullptr),
172  boundary_[fid].index_end - boundary_[fid].index_begin);
173  return c;
174  }
175 
176  private:
177  struct ColumnBoundary {
178  // indicate where each column's index and row_ind is stored.
179  // index_begin and index_end are logical offsets, so they should be converted to
180  // actual offsets by scaling with packing_factor_
181  size_t index_begin;
182  size_t index_end;
183  size_t row_ind_begin;
184  size_t row_ind_end;
185  };
186 
187  std::vector<size_t> feature_counts_;
188  std::vector<ColumnType> type_;
189  SimpleArray<uint32_t> index_; // index_: may store smaller integers; needs padding
190  SimpleArray<size_t> row_ind_;
191  std::vector<ColumnBoundary> boundary_;
192 
193  // index_base_[fid]: least bin id for feature fid
194  std::vector<uint32_t> index_base_;
195 };
196 
197 } // namespace common
198 } // namespace xgboost
199 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:144
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:38
uint32_t GetBaseIdx() const
Definition: column_matrix.h:42
bst_uint GetNumFeature() const
Definition: column_matrix.h:67
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:281
std::vector< uint32_t > index
The index data.
Definition: hist_util.h:277
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_bas...
Definition: column_matrix.h:28
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:44
bool IsMissing(size_t idx) const
Definition: column_matrix.h:49
const size_t * GetRowData() const
Definition: column_matrix.h:52
size_t Size() const
Definition: column_matrix.h:37
Utility for fast histogram aggregation.
uint32_t GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:39
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:64
namespace of xgboost
Definition: base.h:102
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:72
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:275
ColumnType GetType() const
Definition: column_matrix.h:43
ColumnType
column type
Definition: column_matrix.h:21
Column(ColumnType type, const uint32_t *index, uint32_t index_base, const size_t *row_ind, size_t len)
Definition: column_matrix.h:30
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Definition: column_matrix.h:22
preprocessed global index matrix, in CSR format Transform floating values to integer index in histogr...
Definition: hist_util.h:273
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:290
Definition: column_matrix.h:23
Column GetColumn(unsigned fid) const
Definition: column_matrix.h:168