8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_
11 #include <dmlc/endian.h>
18 #include "../data/gradient_index.h"
// Column template; BinIdxType is the integer type of the stored per-entry bin values.
32 template <
typename BinIdxType>
// Constructor initialiser list: records the column type, the bin-value storage and the base offset.
38 : type_(type), index_(index), index_base_(index_base) {}
// Widen the stored value at position idx to uint32_t and add the base offset to it.
43 return index_base_ +
static_cast<uint32_t
>(index_[idx]);
// Base offset added in the return expression above; declared const, so it is fixed at construction.
63 const uint32_t index_base_;
// Column variant that additionally keeps a row-index array (row_ind_).
66 template <
typename BinIdxType>
// Forwards type, index and index_base to the Column<BinIdxType> base, then stores row_ind.
71 :
Column<BinIdxType>(type, index, index_base), row_ind_(row_ind) {}
// Bin lookup for row `rid`; `*state` is a caller-held cursor into this column's entries.
75 int32_t
GetBinIdx(
size_t rid,
size_t* state)
const {
76 const size_t column_size = this->
Size();
// Cursor is already at or past the end of the column.
77 if (!((*state) < column_size)) {
// Loop condition: cursor still in range and the row id at the cursor is below rid.
80 while ((*state) < column_size &&
GetRowIdx(*state) < rid) {
// Cursor is in range and sits exactly on the requested row.
83 if (((*state) < column_size) &&
GetRowIdx(*state) == rid) {
// Number of entries held by this column.
92 const size_t column_size = this->
Size();
// Find the first of the column_size entries in row_data that is not less than first_row_id.
// std::lower_bound requires the range to be ordered - presumably row_data is ascending; confirm at the producer.
94 const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
// Column variant backed by a shared missing-flag vector; any_missing is a compile-time switch.
106 template <
typename BinIdxType,
bool any_missing>
// The flag vector is taken by const reference; feature_offset locates this column's flags inside it.
110 const std::vector<bool>& missing_flags,
size_t feature_offset)
111 :
Column<BinIdxType>(type, index, index_base),
112 missing_flags_(missing_flags),
113 feature_offset_(feature_offset) {}
// Returns the missing flag for entry idx of this column.
// idx is column-local: feature_offset_ is added to address the shared flag vector.
114 bool IsMissing(
size_t idx)
const {
return missing_flags_[feature_offset_ + idx]; }
// Reference to a flag vector owned elsewhere; the referenced vector must outlive this object.
128 const std::vector<bool>& missing_flags_;
// Position of this column's first flag inside missing_flags_.
129 size_t feature_offset_;
// Sets up the per-feature layout and storage from a sparse page and its gradient index matrix.
// sparse_threshold is a fraction of the row count (see the comparison against sparse_threshold * nrow).
140 inline void Init(
SparsePage const& page,
const GHistIndexMatrix& gmat,
double sparse_threshold,
// Feature count is one less than the number of cut pointers.
142 auto const nfeature =
static_cast<bst_feature_t>(gmat.cut.Ptrs().size() - 1);
// Row count is one less than the number of row_ptr entries.
143 const size_t nrow = gmat.row_ptr.size() - 1;
// Per-feature bookkeeping, sized to nfeature; counts start at zero.
145 feature_counts_.resize(nfeature);
146 type_.resize(nfeature);
147 std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
148 uint32_t max_val = std::numeric_limits<uint32_t>::max();
// The bin count of feature fid (difference of consecutive cut pointers) must not exceed max_val.
150 CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
152 bool all_dense = gmat.IsDense();
// gmat fills in the per-feature counts through the raw pointer.
153 gmat.GetFeatureCounts(&feature_counts_[0]);
// True when the feature has fewer entries than the sparse_threshold fraction of all rows.
156 if (
static_cast<double>(feature_counts_[fid]) < sparse_threshold * nrow) {
// feature_offsets_ has nfeature + 1 entries built from a running total that starts at 0;
// feature_offsets_[nfeature] is used below as the overall element count.
166 feature_offsets_.resize(nfeature + 1);
167 size_t accum_index_ = 0;
168 feature_offsets_[0] = accum_index_;
// Running total grows by nrow here ...
171 accum_index_ +=
static_cast<size_t>(nrow);
// ... and by the previous feature's entry count here.
173 accum_index_ += feature_counts_[fid - 1];
175 feature_offsets_[fid] = accum_index_;
// Byte buffer: total element count times bins_type_size_, zero-filled.
180 index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
// One row-index slot per element.
182 row_ind_.resize(feature_offsets_[nfeature]);
// Points at gmat's cut-pointer array; the data is not copied.
186 index_base_ =
const_cast<uint32_t*
>(gmat.cut.Ptrs().data());
// gmat.row_ptr[nrow] is the last row_ptr entry; it is compared against nrow * nfeature.
188 const bool noMissingValues =
NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
189 any_missing_ = !noMissingValues;
// Rebuild the flags from scratch: all false when nothing is missing, all true otherwise.
191 missing_flags_.clear();
192 if (noMissingValues) {
193 missing_flags_.resize(feature_offsets_[nfeature],
false);
195 missing_flags_.resize(feature_offsets_[nfeature],
true);
200 BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
// Same SetIndex call instantiated for uint8_t, uint16_t and uint32_t local bin storage.
216 SetIndex<uint8_t>(page, gmat.index.
data<uint32_t>(), gmat, nfeature);
218 SetIndex<uint16_t>(page, gmat.index.
data<uint32_t>(), gmat, nfeature);
221 SetIndex<uint32_t>(page, gmat.index.
data<uint32_t>(), gmat, nfeature);
// Width selection on max_num_bins - 1: first test whether it fits in uint8_t,
// otherwise whether it fits in uint16_t.
228 if ((max_num_bins - 1) <=
static_cast<int>(std::numeric_limits<uint8_t>::max())) {
230 }
else if ((max_num_bins - 1) <=
static_cast<int>(std::numeric_limits<uint16_t>::max())) {
// Builds a column object for feature fid and returns it through a unique_ptr to const Column.
// BinIdxType must match the stored element width and any_missing must match the runtime flag.
239 template <
typename BinIdxType,
bool any_missing>
240 std::unique_ptr<const Column<BinIdxType> >
GetColumn(
unsigned fid)
const {
// Guard: the requested element type must have exactly the stored byte width.
241 CHECK_EQ(
sizeof(BinIdxType), bins_type_size_);
// This feature's elements occupy [feature_offsets_[fid], feature_offsets_[fid + 1]).
243 const size_t feature_offset = feature_offsets_[fid];
244 const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
// index_ is a byte buffer, so the element offset is scaled by bins_type_size_ before reinterpreting.
246 reinterpret_cast<const BinIdxType*
>(&index_[feature_offset * bins_type_size_]),
248 std::unique_ptr<const Column<BinIdxType> > res;
// Guard: the template flag must agree with the runtime any_missing_ value.
250 CHECK_EQ(any_missing, any_missing_);
// Trailing arguments when the column is built from the shared missing flags.
252 missing_flags_, feature_offset));
// Trailing arguments when the column is built from this feature's slice of row_ind_.
255 {&row_ind_[feature_offset], column_size}));
// Fill routine templated on T, the element type used for local bin storage.
260 template <
typename T>
262 const size_t nrow,
const size_t nfeature,
const bool noMissingValues,
// View the byte buffer index_ as an array of T.
264 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
268 if (noMissingValues) {
// Row rid occupies positions [rid * nfeature, (rid + 1) * nfeature) of the input index.
270 const size_t ibegin = rid * nfeature;
271 const size_t iend = (rid + 1) * nfeature;
// i walks the row's entries while j advances in step with it.
273 for (
size_t i = ibegin; i < iend; ++i, ++j) {
// Copy the value into slot rid of the region that starts at feature_offsets_[j].
274 const size_t idx = feature_offsets_[j];
275 local_index[idx + rid] = index[i];
// Callback: store bin_id in slot rid of feature fid's region and clear that slot's missing flag.
280 auto get_bin_idx = [&](
auto bin_id,
auto rid,
bst_feature_t fid) {
282 const size_t idx = feature_offsets_[fid];
284 local_index[idx + rid] = bin_id;
286 missing_flags_[idx + rid] =
false;
// Walks every stored entry row by row and hands (bin id, row id, feature id) to assign_bin.
295 template <
typename T,
typename BinFn>
297 const size_t nfeature, BinFn&& assign_bin) {
// One zero-initialised counter per feature.
298 std::vector<size_t> num_nonzeros(nfeature, 0ul);
302 const size_t batch_size = gmat.Size();
// offset_vec[rid + 1] is read below, so offset_vec must hold more than batch_size entries.
303 CHECK_LT(batch_size, offset_vec.size());
305 for (
size_t rid = 0; rid < batch_size; ++rid) {
// Entry range of this row in gmat.row_ptr, with the row id shifted by rbegin.
306 const size_t ibegin = gmat.row_ptr[rbegin + rid];
307 const size_t iend = gmat.row_ptr[rbegin + rid + 1];
// Row length according to offset_vec.
308 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
// The row_ptr range must have exactly as many entries as inst.
311 CHECK_EQ(ibegin + inst.
size(), iend);
// i indexes the bin array; j advances in step with it and indexes inst.
313 for (
size_t i = ibegin; i < iend; ++i, ++j) {
314 const uint32_t bin_id = index[i];
315 auto fid = inst[j].index;
316 assign_bin(bin_id, rid, fid);
// Fill routine templated on T, the element type used for local bin storage.
321 template <
typename T>
323 const size_t nfeature) {
// View the byte buffer index_ as an array of T.
324 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
// Per-feature counters, one per feature, all starting at zero.
325 std::vector<size_t> num_nonzeros;
326 num_nonzeros.resize(nfeature);
327 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
// Callback receiving one (bin id, row id, feature id) triple at a time.
329 auto get_bin_idx = [&](
auto bin_id,
auto rid,
bst_feature_t fid) {
// Slot addressed by row id: store the bin id relative to index_base_[fid] and clear the missing flag.
331 T* begin = &local_index[feature_offsets_[fid]];
332 begin[rid] = bin_id - index_base_[fid];
333 missing_flags_[feature_offsets_[fid] + rid] =
false;
// Slot addressed by the feature's counter: store the relative bin id and record rid at the same position in row_ind_.
335 T* begin = &local_index[feature_offsets_[fid]];
336 begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
337 row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid;
// Returns true when n_elements equals n_features * n_row, i.e. the element count
// matches a fully populated n_row x n_features table.
347 bool NoMissingValues(
const size_t n_elements,
const size_t n_row,
const size_t n_features) {
348 return n_elements == n_features * n_row;
// Loads members from the stream fi. index_base is supplied by the caller and is stored
// as-is rather than read from the stream.
355 bool Read(dmlc::SeekStream* fi, uint32_t
const* index_base) {
357 fi->Read(&feature_counts_);
358 #if !DMLC_LITTLE_ENDIAN
// Non-little-endian builds: read the column types as their underlying integer type,
// then cast each value back to ColumnType.
360 std::vector<std::underlying_type<ColumnType>::type> int_types;
361 fi->Read(&int_types);
362 type_.resize(int_types.size());
364 int_types.begin(), int_types.end(), type_.begin(),
365 [](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
368 #endif // !DMLC_LITTLE_ENDIAN
371 fi->Read(&feature_offsets_);
372 index_base_ = index_base;
373 #if !DMLC_LITTLE_ENDIAN
// Temporary of BinTypeSize's underlying integer type for the non-little-endian path.
374 std::underlying_type<BinTypeSize>::type v;
378 fi->Read(&bins_type_size_);
381 fi->Read(&any_missing_);
// Serialises members to the stream fo. `bytes` accumulates the sizes of the fields written.
// The fields visible here are written in the same order in which Read loads them.
385 size_t Write(dmlc::Stream* fo)
const {
// Helper for vectors: adds element count times element size, plus a further term, to bytes.
388 auto write_vec = [&](
auto const& vec) {
390 bytes += vec.size() *
sizeof(
typename std::remove_reference_t<decltype(vec)>::value_type) +
394 write_vec(feature_counts_);
395 #if !DMLC_LITTLE_ENDIAN
// Non-little-endian builds: convert the column types to their underlying integer type before writing.
397 std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
398 std::transform(type_.begin(), type_.end(), int_types.begin(), [](
ColumnType t) {
399 return static_cast<std::underlying_type<ColumnType>::type>(t);
401 write_vec(int_types);
404 #endif // !DMLC_LITTLE_ENDIAN
406 write_vec(feature_offsets_);
408 #if !DMLC_LITTLE_ENDIAN
// Non-little-endian builds: convert bins_type_size_ to its underlying integer type.
409 auto v =
static_cast<std::underlying_type<BinTypeSize>::type
>(bins_type_size_);
412 fo->Write(bins_type_size_);
413 #endif // DMLC_LITTLE_ENDIAN
// Scalar fields are accounted for by their sizeof.
414 bytes +=
sizeof(bins_type_size_);
415 fo->Write(any_missing_);
416 bytes +=
sizeof(any_missing_);
// Raw byte storage for bin values; reinterpreted elsewhere as the selected bin type,
// bins_type_size_ bytes per element.
422 std::vector<uint8_t> index_;
// Number of entries per feature.
424 std::vector<size_t> feature_counts_;
// Column type per feature.
425 std::vector<ColumnType> type_;
// Row ids, addressed with the same element offsets as the bin storage.
426 std::vector<size_t> row_ind_;
// Running element offsets, nfeature + 1 entries; the last entry is the total element count.
428 std::vector<size_t> feature_offsets_;
// Per-feature base values subtracted from bin ids before storing; points at data held
// elsewhere (assigned from gmat's cut pointers or from a caller-supplied pointer).
431 uint32_t
const* index_base_;
// One flag per element slot; a set flag marks the slot as missing.
432 std::vector<bool> missing_flags_;
438 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_