8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
15 #include "../data/gradient_index.h"
31 template <
typename BinIdxType>
39 index_base_(index_base) {}
44 return index_base_ +
static_cast<uint32_t
>(index_[idx]);
64 const uint32_t index_base_;
67 template <
typename BinIdxType>
72 :
Column<BinIdxType>(type, index, index_base),
// Returns an int32_t bin index for row `rid`. `*state` is a caller-held
// cursor into this column's entries that is read (and compared against the
// column size) on every call.
77 int32_t
GetBinIdx(
size_t rid,
size_t* state)
const {
78 const size_t column_size = this->
Size();
// Cursor is already at or past the end of the column.
79 if (!((*state) < column_size)) {
// Keep going while the cursor is in range and the row id found at the
// cursor is still smaller than the requested row.
82 while ((*state) < column_size &&
GetRowIdx(*state) < rid) {
// Cursor is in range and sits exactly on the requested row.
85 if (((*state) < column_size) &&
GetRowIdx(*state) == rid) {
94 const size_t column_size = this->
Size();
96 const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
102 return row_ind_.
data()[idx];
110 template <
typename BinIdxType,
bool any_missing>
114 uint32_t index_base,
const std::vector<bool>& missing_flags,
115 size_t feature_offset)
116 :
Column<BinIdxType>(type, index, index_base),
117 missing_flags_(missing_flags),
118 feature_offset_(feature_offset) {}
119 bool IsMissing(
size_t idx)
const {
return missing_flags_[feature_offset_ + idx]; }
// Reference member, not a copy: the referenced vector must outlive this
// object. Indexed as missing_flags_[feature_offset_ + idx].
135 const std::vector<bool>& missing_flags_;
// Added to a column-local index before looking up missing_flags_.
136 size_t feature_offset_;
145 return static_cast<bst_uint>(type_.size());
// Builds the column-wise storage from `gmat`: sizes the per-feature
// bookkeeping, computes cumulative feature offsets, allocates the index,
// row-index and missing-flag buffers, and fills the index through
// SetIndexAllDense / SetIndex.
//   gmat             source gradient index matrix
//   sparse_threshold fraction of nrow that a feature's entry count is
//                    compared against
149 inline void Init(
const GHistIndexMatrix& gmat,
150 double sparse_threshold) {
// One fewer feature than there are cut pointers.
151 const int32_t nfeature =
static_cast<int32_t
>(gmat.cut.Ptrs().size() - 1);
// One fewer row than there are row_ptr entries.
152 const size_t nrow = gmat.row_ptr.size() - 1;
154 feature_counts_.resize(nfeature);
155 type_.resize(nfeature);
156 std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
157 uint32_t max_val = std::numeric_limits<uint32_t>::max();
158 for (int32_t fid = 0; fid < nfeature; ++fid) {
// The span between consecutive cut pointers must not exceed the uint32_t
// maximum.
159 CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
161 bool all_dense = gmat.IsDense();
// Have gmat write the per-feature counts into feature_counts_.
162 gmat.GetFeatureCounts(&feature_counts_[0]);
164 for (int32_t fid = 0; fid < nfeature; ++fid) {
// Compare this feature's count against sparse_threshold * nrow.
165 if (
static_cast<double>(feature_counts_[fid])
166 < sparse_threshold * nrow) {
// feature_offsets_ is a running sum with nfeature + 1 entries; each step adds
// either nrow or the previous feature's count.
176 feature_offsets_.resize(nfeature + 1);
177 size_t accum_index_ = 0;
178 feature_offsets_[0] = accum_index_;
179 for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
181 accum_index_ +=
static_cast<size_t>(nrow);
183 accum_index_ += feature_counts_[fid - 1];
185 feature_offsets_[fid] = accum_index_;
// index_ is a byte buffer: bins_type_size_ bytes per slot, zero-filled.
190 index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
192 row_ind_.resize(feature_offsets_[nfeature]);
// Keeps a pointer into gmat's cut pointer array. The const_cast only changes
// the pointer type; index_base_ is read (never written through) in SetIndex.
196 index_base_ =
const_cast<uint32_t*
>(gmat.cut.Ptrs().data());
// The last row_ptr entry is passed as the element count.
198 const bool noMissingValues =
NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
199 any_missing_ = !noMissingValues;
// One flag per slot: initialised to false when nothing is missing; the other
// resize initialises every flag to true.
201 if (noMissingValues) {
202 missing_flags_.resize(feature_offsets_[nfeature],
false);
204 missing_flags_.resize(feature_offsets_[nfeature],
true);
209 BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
// SetIndexAllDense receives gmat's index data typed as uint8_t, uint16_t or
// uint32_t.
211 SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
213 SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
216 SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
// SetIndex always reads gmat's index as uint32_t; the template argument is
// the width used for the stored values.
222 SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
224 SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
227 SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
// Picks a storage width from the largest bin index (max_num_bins - 1):
// first test whether it fits in uint8_t, then whether it fits in uint16_t.
// NOTE(review): if max_num_bins is an unsigned type, `max_num_bins - 1` wraps
// when it is 0 and the comparison against an int is mixed-sign - confirm the
// declared type.
234 if ( (max_num_bins - 1) <=
static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
236 }
else if ((max_num_bins - 1) <=
static_cast<int>(std::numeric_limits<uint16_t>::max())) {
// Returns a uniquely-owned, read-only Column<BinIdxType> for feature `fid`.
// Both template arguments are validated against the matrix state:
// sizeof(BinIdxType) must equal bins_type_size_ and any_missing must equal
// any_missing_.
245 template <
typename BinIdxType,
bool any_missing>
246 std::unique_ptr<const Column<BinIdxType> >
GetColumn(
unsigned fid)
const {
247 CHECK_EQ(
sizeof(BinIdxType), bins_type_size_);
// The feature's slots are [feature_offsets_[fid], feature_offsets_[fid + 1]).
249 const size_t feature_offset = feature_offsets_[fid];
250 const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
// index_ is addressed in bytes, so the slot offset is scaled by the bin width.
252 &index_[feature_offset * bins_type_size_]),
254 std::unique_ptr<const Column<BinIdxType> > res;
256 CHECK_EQ(any_missing, any_missing_);
// The shared missing-flag vector is passed along with this feature's offset.
258 missing_flags_, feature_offset));
// Pointer/length pair over this feature's part of row_ind_.
261 {&row_ind_[feature_offset], column_size}));
// Copies bin indices from `index` into the column storage, writing values of
// type T. Each value lands at its feature's offset plus the row number.
266 template <
typename T>
268 const size_t nrow,
const size_t nfeature,
269 const bool noMissingValues) {
// View the byte buffer index_ as an array of T.
270 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
274 if (noMissingValues) {
// Row rid occupies entries [rid * nfeature, (rid + 1) * nfeature) of `index`.
276 const size_t ibegin = rid*nfeature;
277 const size_t iend = (rid+1)*nfeature;
// j advances together with i and selects the feature offset.
279 for (
size_t i = ibegin; i < iend; ++i, ++j) {
280 const size_t idx = feature_offsets_[j];
281 local_index[idx + rid] = index[i];
// Iterate over the SparsePage batches obtained through gmat.p_fmat.
287 for (
const auto &batch : gmat.p_fmat->GetBatches<
SparsePage>()) {
289 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
290 const size_t batch_size = batch.Size();
// Guarantees offset_vec[rid + 1] is in range for every rid < batch_size.
291 CHECK_LT(batch_size, offset_vec.size());
292 for (
size_t rid = 0; rid < batch_size; ++rid) {
// Entry count of row rid according to the batch offsets.
293 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
// rbegin + rid is the row number across batches; row_ptr gives that row's
// range inside `index`.
295 const size_t ibegin = gmat.row_ptr[rbegin + rid];
296 const size_t iend = gmat.row_ptr[rbegin + rid + 1];
// inst.size() must agree with the row's extent in gmat.row_ptr.
297 CHECK_EQ(ibegin + inst.
size(), iend);
300 for (
size_t i = ibegin; i < iend; ++i, ++j) {
302 const size_t idx = feature_offsets_[fid];
// Store the value and clear the missing flag of the same slot.
304 local_index[idx + rbegin + rid] = index[i];
305 missing_flags_[idx + rbegin + rid] =
false;
// Move the running row offset past this batch.
308 rbegin += batch.Size();
// Fills the column storage from the uint32_t bin ids in `index`, storing each
// value as T after subtracting index_base_[fid].
//   index    bin ids, addressed through gmat.row_ptr
//   gmat     source matrix (row_ptr and SparsePage batches are read)
//   nfeature number of features; sizes the per-feature counters
314 inline void SetIndex(uint32_t* index,
const GHistIndexMatrix& gmat,
315 const size_t nfeature) {
// Per-feature counter, zero-initialised; used below as the write position
// for the packed store.
316 std::vector<size_t> num_nonzeros;
317 num_nonzeros.resize(nfeature);
318 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
// View the byte buffer index_ as an array of T.
320 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
// Iterate over the SparsePage batches obtained through gmat.p_fmat.
322 for (
const auto &batch : gmat.p_fmat->GetBatches<
SparsePage>()) {
324 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
325 const size_t batch_size = batch.Size();
// Guarantees offset_vec[rid + 1] is in range for every rid < batch_size.
326 CHECK_LT(batch_size, offset_vec.size());
327 for (
size_t rid = 0; rid < batch_size; ++rid) {
// rbegin + rid is the row number across batches; row_ptr gives that row's
// range inside `index`.
328 const size_t ibegin = gmat.row_ptr[rbegin + rid];
329 const size_t iend = gmat.row_ptr[rbegin + rid + 1];
// Entry count of row rid according to the batch offsets.
331 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
// inst.size() must agree with the row's extent in gmat.row_ptr.
334 CHECK_EQ(ibegin + inst.
size(), iend);
336 for (
size_t i = ibegin; i < iend; ++i, ++j) {
337 const uint32_t bin_id = index[i];
// Row-addressed store: slot (rid + rbegin) within the feature's range, value
// kept relative to index_base_[fid]; the slot's missing flag is cleared.
341 T* begin = &local_index[feature_offsets_[fid]];
342 begin[rid + rbegin] = bin_id - index_base_[fid];
343 missing_flags_[feature_offsets_[fid] + rid + rbegin] =
false;
// Packed store: slot num_nonzeros[fid] within the feature's range, with the
// row number recorded at the same position in row_ind_.
345 T* begin = &local_index[feature_offsets_[fid]];
346 begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
347 row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
// Move the running row offset past this batch.
352 rbegin += batch.Size();
356 return bins_type_size_;
361 const size_t n_row,
const size_t n_features) {
// True exactly when n_elements equals the size of a full
// n_row-by-n_features grid.
362 return n_elements == n_features * n_row;
// Raw byte buffer for the stored bin values; reinterpreted as T* by the
// SetIndex* routines and sized in units of bins_type_size_ bytes.
371 std::vector<uint8_t> index_;
// One count per feature, filled by gmat.GetFeatureCounts in Init.
373 std::vector<size_t> feature_counts_;
// One ColumnType per feature.
374 std::vector<ColumnType> type_;
// Row numbers written by SetIndex for the packed store.
375 std::vector<size_t> row_ind_;
// nfeature + 1 cumulative slot offsets; feature fid owns
// [feature_offsets_[fid], feature_offsets_[fid + 1]).
377 std::vector<size_t> feature_offsets_;
// Non-owning pointer set in Init to gmat.cut.Ptrs().data(); subtracted from
// bin ids in SetIndex.
380 uint32_t* index_base_;
// One flag per slot; set to false wherever a value is written.
381 std::vector<bool> missing_flags_;
388 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_