#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_COMMON_COLUMN_MATRIX_H_

#include <algorithm>
#include <limits>
#include <memory>
#include <vector>

#include "hist_util.h"

namespace xgboost {
namespace common {

/*! \brief column type */
enum ColumnType {
  kDenseColumn,
  kSparseColumn
};
template <typename BinIdxType>
class Column {
 public:
  Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
      : type_(type), index_(index), index_base_(index_base) {}
  // map a column-local bin index back to its global bin id
  uint32_t GetGlobalBinIdx(size_t idx) const {
    return index_base_ + static_cast<uint32_t>(index_[idx]);
  }
 private:
  ColumnType type_;
  common::Span<const BinIdxType> index_;
  const uint32_t index_base_;  // least bin id of this feature
};
template <typename BinIdxType>
class SparseColumn : public Column<BinIdxType> {
 public:
  SparseColumn(ColumnType type, common::Span<const BinIdxType> index,
               uint32_t index_base, common::Span<const size_t> row_ind)
      : Column<BinIdxType>(type, index, index_base), row_ind_(row_ind) {}
  // row index of the idx-th stored entry
  size_t GetRowIdx(size_t idx) const { return row_ind_.data()[idx]; }
 private:
  common::Span<const size_t> row_ind_;
};
template <typename BinIdxType>
class DenseColumn : public Column<BinIdxType> {
 public:
  DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
              uint32_t index_base, const std::vector<bool>& missing_flags,
              size_t feature_offset)
      : Column<BinIdxType>(type, index, index_base),
        missing_flags_(missing_flags),
        feature_offset_(feature_offset) {}
  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
 private:
  /* missing-value flags shared with the owning ColumnMatrix */
  const std::vector<bool>& missing_flags_;
  size_t feature_offset_;  // offset of this feature inside missing_flags_
};
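/* Illustrative sketch, not part of the original header: reading the two
   column flavours. Numbers are invented for the example. Suppose feature 3
   owns global bins [16, 24), so its index_base_ is 16, and a DenseColumn
   stores local bin index 5 for row 42:

     col->IsMissing(42)        // false if row 42 has a value for feature 3
     col->GetGlobalBinIdx(42)  // 16 + 5 == 21, the global bin id

   A SparseColumn stores only rows that have an entry, so entry k pairs
   GetRowIdx(k) with GetGlobalBinIdx(k). */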
/*! \brief a collection of columns, with support for construction from
    GHistIndexMatrix. */
class ColumnMatrix {
 public:
  bst_uint GetNumFeature() const { return static_cast<bst_uint>(type_.size()); }
  // construct the column-major layout from a GHistIndexMatrix
  inline void Init(const GHistIndexMatrix& gmat, double sparse_threshold) {
    const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
    const size_t nrow = gmat.row_ptr.size() - 1;
    // identify the type of each column
    feature_counts_.resize(nfeature);
    type_.resize(nfeature);
    std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
    const uint32_t max_val = std::numeric_limits<uint32_t>::max();
    for (int32_t fid = 0; fid < nfeature; ++fid) {
      CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
    }
    gmat.GetFeatureCounts(&feature_counts_[0]);
    bool all_dense = gmat.IsDense();
    // a feature whose entries appear in too few rows is stored sparsely
    for (int32_t fid = 0; fid < nfeature; ++fid) {
      if (static_cast<double>(feature_counts_[fid]) < sparse_threshold * nrow) {
        type_[fid] = kSparseColumn;
        all_dense = false;
      } else {
        type_[fid] = kDenseColumn;
      }
    }
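    /* Worked example with invented numbers: nrow == 100 and
       sparse_threshold == 0.2 give a cutoff of 20 rows; a feature present in
       15 rows becomes kSparseColumn, one present in 60 rows kDenseColumn. */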
    // compute each feature's storage boundary with a prefix sum: dense
    // columns reserve nrow slots, sparse ones only their entry count
    feature_offsets_.resize(nfeature + 1);
    size_t accum_index = 0;
    feature_offsets_[0] = accum_index;
    for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
      if (type_[fid - 1] == kDenseColumn) {
        accum_index += static_cast<size_t>(nrow);
      } else {
        accum_index += feature_counts_[fid - 1];
      }
      feature_offsets_[fid] = accum_index;
    }
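    /* Worked example with invented numbers: nrow == 100 and column types
       [dense, sparse with 15 entries, dense] give
       feature_offsets_ == {0, 100, 115, 215}; column fid occupies the slots
       [feature_offsets_[fid], feature_offsets_[fid + 1]) of index_. */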
    SetTypeSize(gmat.max_num_bins);  // must precede sizing index_ in bytes
    index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
    if (!all_dense) {  // row indices are needed only for sparse columns
      row_ind_.resize(feature_offsets_[nfeature]);
    }
    // store the least bin id of each feature: cut.Ptrs()[fid] is the first
    // global bin that belongs to feature fid
    index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());

    const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
    any_missing_ = !noMissingValues;
    if (noMissingValues) {
      missing_flags_.resize(feature_offsets_[nfeature], false);
    } else {
      missing_flags_.resize(feature_offsets_[nfeature], true);
    }
    if (all_dense) {
      // a dense gmat already stores the narrowest type, so copy it through
      const BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
      if (gmat_bin_size == kUint8BinsTypeSize) {
        SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
      } else if (gmat_bin_size == kUint16BinsTypeSize) {
        SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
      } else {
        CHECK_EQ(gmat_bin_size, kUint32BinsTypeSize);
        SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
      }
    } else if (bins_type_size_ == kUint8BinsTypeSize) {
      // a sparse gmat always holds uint32_t bin ids, but the column matrix
      // can still shrink them to the narrowest type that fits
      SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
    } else if (bins_type_size_ == kUint16BinsTypeSize) {
      SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
    } else {
      CHECK_EQ(bins_type_size_, kUint32BinsTypeSize);
      SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nfeature);
    }
  }
  // pick the narrowest type whose range covers every column-local bin index
  void SetTypeSize(size_t max_num_bins) {
    if ((max_num_bins - 1) <= static_cast<size_t>(std::numeric_limits<uint8_t>::max())) {
      bins_type_size_ = kUint8BinsTypeSize;
    } else if ((max_num_bins - 1) <= static_cast<size_t>(std::numeric_limits<uint16_t>::max())) {
      bins_type_size_ = kUint16BinsTypeSize;
    } else {
      bins_type_size_ = kUint32BinsTypeSize;
    }
  }
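  /* Worked example: with max_num_bins == 256 the largest column-local bin
     index is 255, which fits uint8_t; 257 bins would force uint16_t.
     (Invented numbers, for illustration only.) */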
  /* fetch a single column; BinIdxType must match the stored width, so
     callers dispatch on GetTypeSize() first */
  template <typename BinIdxType>
  std::unique_ptr<const Column<BinIdxType>> GetColumn(unsigned fid) const {
    CHECK_EQ(sizeof(BinIdxType), bins_type_size_);

    const size_t feature_offset = feature_offsets_[fid];
    const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
    common::Span<const BinIdxType> bin_index = {
        reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
        column_size};
    std::unique_ptr<const Column<BinIdxType>> res;
    if (type_[fid] == ColumnType::kDenseColumn) {
      res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
                                            missing_flags_, feature_offset));
    } else {
      res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
                                             {&row_ind_[feature_offset], column_size}));
    }
    return res;
  }
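  /* Usage sketch, not from the original header: since the stored width is a
     runtime property, callers dispatch on GetTypeSize() before picking the
     template argument:

       if (matrix.GetTypeSize() == kUint8BinsTypeSize) {
         auto col = matrix.GetColumn<uint8_t>(fid);
         const uint32_t bin = col->GetGlobalBinIdx(0);
       }  // likewise for uint16_t and uint32_t */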
  template <typename T>
  inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow,
                               const size_t nfeature, const bool noMissingValues) {
    T* local_index = reinterpret_cast<T*>(&index_[0]);

    /* with no missing values every row holds exactly nfeature entries, so
       filling the columns is a plain row-major to column-major transposition */
    if (noMissingValues) {
      for (size_t rid = 0; rid < nrow; ++rid) {
        const size_t ibegin = rid * nfeature;
        const size_t iend = (rid + 1) * nfeature;
        size_t j = 0;
        for (size_t i = ibegin; i < iend; ++i, ++j) {
          const size_t idx = feature_offsets_[j];
          local_index[idx + rid] = index[i];
        }
      }
    } else {
      /* some values are missing: walk the original SparsePage batches to
         learn which (row, feature) pairs actually exist */
      size_t rbegin = 0;
      for (const auto& batch : gmat.p_fmat->GetBatches<SparsePage>()) {
        const xgboost::Entry* data_ptr = batch.data.HostVector().data();
        const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
        const size_t batch_size = batch.Size();
        CHECK_LT(batch_size, offset_vec.size());
        for (size_t rid = 0; rid < batch_size; ++rid) {
          const size_t size = offset_vec[rid + 1] - offset_vec[rid];
          SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
          const size_t ibegin = gmat.row_ptr[rbegin + rid];
          const size_t iend = gmat.row_ptr[rbegin + rid + 1];
          CHECK_EQ(ibegin + inst.size(), iend);
          size_t j = 0;
          for (size_t i = ibegin; i < iend; ++i, ++j) {
            const size_t fid = inst[j].index;
            const size_t idx = feature_offsets_[fid];
            // rbegin offsets the row ids of earlier batches
            local_index[idx + rbegin + rid] = index[i];
            missing_flags_[idx + rbegin + rid] = false;
          }
        }
        rbegin += batch.Size();
      }
    }
  }
  template <typename T>
  inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
                       const size_t nfeature) {
    // running count of stored entries per sparse column
    std::vector<size_t> num_nonzeros;
    num_nonzeros.resize(nfeature);
    std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
    T* local_index = reinterpret_cast<T*>(&index_[0]);
    size_t rbegin = 0;
    for (const auto& batch : gmat.p_fmat->GetBatches<SparsePage>()) {
      const xgboost::Entry* data_ptr = batch.data.HostVector().data();
      const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
      const size_t batch_size = batch.Size();
      CHECK_LT(batch_size, offset_vec.size());
      for (size_t rid = 0; rid < batch_size; ++rid) {
        const size_t ibegin = gmat.row_ptr[rbegin + rid];
        const size_t iend = gmat.row_ptr[rbegin + rid + 1];
        const size_t size = offset_vec[rid + 1] - offset_vec[rid];
        SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
        CHECK_EQ(ibegin + inst.size(), iend);
        size_t j = 0;
        for (size_t i = ibegin; i < iend; ++i, ++j) {
          const uint32_t bin_id = index[i];
          const size_t fid = inst[j].index;
          if (type_[fid] == kDenseColumn) {
            // dense column: the slot is addressed directly by row id
            T* begin = &local_index[feature_offsets_[fid]];
            begin[rid + rbegin] = bin_id - index_base_[fid];
            missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
          } else {
            // sparse column: append the entry and remember its row id
            T* begin = &local_index[feature_offsets_[fid]];
            begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
            row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
            ++num_nonzeros[fid];
          }
        }
      }
      rbegin += batch.Size();
    }
  }
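  /* Worked example, the inverse of Column::GetGlobalBinIdx above: if feature
     3 owns global bins [16, 24), then index_base_[3] == 16 and global bin_id
     21 is stored as 21 - 16 == 5, which is why a narrow type often suffices
     even when the total number of bins is large. (Invented numbers.) */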
  BinTypeSize GetTypeSize() const { return bins_type_size_; }
  // a matrix with no missing values has exactly n_features * n_row entries
  bool NoMissingValues(const size_t n_elements, const size_t n_row,
                       const size_t n_features) const {
    return n_elements == n_features * n_row;
  }
  bool AnyMissing() const { return any_missing_; }

 private:
  std::vector<uint8_t> index_;  // type-erased bin indices, stored column-major

  std::vector<size_t> feature_counts_;   // non-missing entries per feature
  std::vector<ColumnType> type_;         // kDenseColumn or kSparseColumn
  std::vector<size_t> row_ind_;          // row ids backing the sparse columns
  std::vector<size_t> feature_offsets_;  // start of each column inside index_

  uint32_t* index_base_;  // least bin id per feature, borrowed from gmat.cut
  std::vector<bool> missing_flags_;
  BinTypeSize bins_type_size_;
  bool any_missing_;
};
}  // namespace common
}  // namespace xgboost
#endif  // XGBOOST_COMMON_COLUMN_MATRIX_H_