8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ 9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_ 30 template <
typename BinIdxType>
36 index_base_(index_base) {}
41 return index_base_ +
static_cast<uint32_t
>(index_[idx]);
// Returns index_base_ by value. This is the same offset that the global
// bin-index expression earlier in this listing adds to each stored bin index
// (index_base_ + index_[idx]).
// NOTE(review): the top-level `const` on a by-value uint32_t return has no
// effect for callers.
46 const uint32_t
GetBaseIdx()
const {
return index_base_; }
61 const uint32_t index_base_;
64 template <
typename BinIdxType>
69 :
Column<BinIdxType>(type, index, index_base),
// Returns row_ind_.data(): a raw, non-owning pointer to the first entry of
// this column's row-index storage. Entry idx of that storage is what the
// indexed accessor that follows reads via row_ind_.data()[idx].
// NOTE(review): presumably one row id per stored bin entry of the sparse
// column - confirm against the full class definition.
72 const size_t*
GetRowData()
const {
return row_ind_.data(); }
75 return row_ind_.data()[idx];
83 template <
typename BinIdxType>
87 uint32_t index_base,
const std::vector<bool>& missing_flags,
88 size_t feature_offset)
89 :
Column<BinIdxType>(type, index, index_base),
90 missing_flags_(missing_flags),
91 feature_offset_(feature_offset) {}
// Reports whether entry idx of this column is flagged as missing.
// missing_flags_ is held by const reference (this column does not own it) and
// is indexed with feature_offset_ + idx, so idx is relative to the start of
// this column inside the flag vector.
// The lookup uses plain operator[], so no bounds checking is performed.
92 bool IsMissing(
size_t idx)
const {
return missing_flags_[feature_offset_ + idx]; }
95 const std::vector<bool>& missing_flags_;
96 size_t feature_offset_;
105 return static_cast<bst_uint>(type_.size());
110 double sparse_threshold) {
111 const int32_t nfeature =
static_cast<int32_t
>(gmat.
cut.
Ptrs().size() - 1);
112 const size_t nrow = gmat.
row_ptr.size() - 1;
114 feature_counts_.resize(nfeature);
115 type_.resize(nfeature);
116 std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
117 uint32_t max_val = std::numeric_limits<uint32_t>::max();
118 for (int32_t fid = 0; fid < nfeature; ++fid) {
119 CHECK_LE(gmat.
cut.
Ptrs()[fid + 1] - gmat.
cut.
Ptrs()[fid], max_val);
121 bool all_dense = gmat.
IsDense();
124 for (int32_t fid = 0; fid < nfeature; ++fid) {
125 if (static_cast<double>(feature_counts_[fid])
126 < sparse_threshold * nrow) {
136 feature_offsets_.resize(nfeature + 1);
137 size_t accum_index_ = 0;
138 feature_offsets_[0] = accum_index_;
139 for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
141 accum_index_ +=
static_cast<size_t>(nrow);
143 accum_index_ += feature_counts_[fid - 1];
145 feature_offsets_[fid] = accum_index_;
150 index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
152 row_ind_.resize(feature_offsets_[nfeature]);
156 index_base_ =
const_cast<uint32_t*
>(gmat.
cut.
Ptrs().data());
158 const bool noMissingValues = NoMissingValues(gmat.
row_ptr[nrow], nrow, nfeature);
159 any_missing_ = !noMissingValues;
161 if (noMissingValues) {
162 missing_flags_.resize(feature_offsets_[nfeature],
false);
164 missing_flags_.resize(feature_offsets_[nfeature],
true);
171 SetIndexAllDense(gmat.
index.
data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
173 SetIndexAllDense(gmat.
index.
data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
176 SetIndexAllDense(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
182 SetIndex<uint8_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
184 SetIndex<uint16_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
187 SetIndex<uint32_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
194 if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
196 }
else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
205 template <
typename BinIdxType>
206 std::unique_ptr<const Column<BinIdxType> >
GetColumn(
unsigned fid)
const {
207 CHECK_EQ(
sizeof(BinIdxType), bins_type_size_);
209 const size_t feature_offset = feature_offsets_[fid];
210 const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
212 &index_[feature_offset * bins_type_size_]),
214 std::unique_ptr<const Column<BinIdxType> > res;
217 missing_flags_, feature_offset));
220 {&row_ind_[feature_offset], column_size}));
227 const size_t nfeature,
const bool noMissingValues) {
228 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
232 if (noMissingValues) {
233 #pragma omp parallel for num_threads(omp_get_max_threads()) 234 for (
omp_ulong rid = 0; rid < nrow; ++rid) {
235 const size_t ibegin = rid*nfeature;
236 const size_t iend = (rid+1)*nfeature;
238 for (
size_t i = ibegin; i < iend; ++i, ++j) {
239 const size_t idx = feature_offsets_[j];
240 local_index[idx + rid] = index[i];
248 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
249 const size_t batch_size = batch.Size();
250 CHECK_LT(batch_size, offset_vec.size());
251 for (
size_t rid = 0; rid < batch_size; ++rid) {
252 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
254 const size_t ibegin = gmat.
row_ptr[rbegin + rid];
255 const size_t iend = gmat.
row_ptr[rbegin + rid + 1];
256 CHECK_EQ(ibegin + inst.
size(), iend);
259 for (
size_t i = ibegin; i < iend; ++i, ++j) {
261 const size_t idx = feature_offsets_[fid];
263 local_index[idx + rbegin + rid] = index[i];
264 missing_flags_[idx + rbegin + rid] =
false;
267 rbegin += batch.Size();
274 const size_t nrow,
const size_t nfeature) {
275 std::vector<size_t> num_nonzeros;
276 num_nonzeros.resize(nfeature);
277 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
279 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
283 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
284 const size_t batch_size = batch.Size();
285 CHECK_LT(batch_size, offset_vec.size());
286 for (
size_t rid = 0; rid < batch_size; ++rid) {
287 const size_t ibegin = gmat.
row_ptr[rbegin + rid];
288 const size_t iend = gmat.
row_ptr[rbegin + rid + 1];
290 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
293 CHECK_EQ(ibegin + inst.
size(), iend);
295 for (
size_t i = ibegin; i < iend; ++i, ++j) {
296 const uint32_t bin_id = index[i];
300 T* begin = &local_index[feature_offsets_[fid]];
301 begin[rid + rbegin] = bin_id - index_base_[fid];
302 missing_flags_[feature_offsets_[fid] + rid + rbegin] =
false;
304 T* begin = &local_index[feature_offsets_[fid]];
305 begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
306 row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
311 rbegin += batch.Size();
315 return bins_type_size_;
320 const size_t n_row,
const size_t n_features) {
321 return n_elements == n_features * n_row;
330 std::vector<uint8_t> index_;
332 std::vector<size_t> feature_counts_;
333 std::vector<ColumnType> type_;
334 std::vector<size_t> row_ind_;
336 std::vector<size_t> feature_offsets_;
339 uint32_t* index_base_;
340 std::vector<bool> missing_flags_;
347 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_ virtual ~Column()=default
Index index
The index data.
Definition: hist_util.h:306
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:67
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
Definition: column_matrix.h:84
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
Definition: hist_util.h:203
const size_t * GetRowData() const
Definition: column_matrix.h:72
const uint32_t GetBaseIdx() const
Definition: column_matrix.h:46
T * data() const
Definition: hist_util.h:245
bst_uint GetNumFeature() const
Definition: column_matrix.h:104
dmlc::omp_ulong omp_ulong
define unsigned long for openmp loop
Definition: base.h:259
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:310
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:33
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:245
ColumnType GetType() const
Definition: column_matrix.h:50
bool IsDense() const
Definition: hist_util.h:341
a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_base.
Definition: column_matrix.h:31
Definition: hist_util.h:204
BatchSet< T > GetBatches(const BatchParam &param={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
const BinTypeSize GetTypeSize() const
Definition: column_matrix.h:314
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:241
Definition: column_matrix.h:65
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:193
Utility for fast histogram aggregation.
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:101
BinTypeSize
Definition: hist_util.h:202
const bool AnyMissing() const
Definition: column_matrix.h:325
const bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:319
namespace of xgboost
Definition: base.h:102
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:206
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:48
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:86
size_t max_num_bins
Definition: hist_util.h:312
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:109
size_t Size() const
Definition: column_matrix.h:53
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:304
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:40
Element from a sparse vector.
Definition: data.h:201
ColumnType
column type
Definition: column_matrix.h:21
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Definition: column_matrix.h:22
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:74
preprocessed global index matrix, in CSR format
Definition: hist_util.h:302
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:44
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature)
Definition: column_matrix.h:273
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:331
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:226
Definition: column_matrix.h:23
Definition: hist_util.h:205
bool IsMissing(size_t idx) const
Definition: column_matrix.h:92
DMatrix * p_fmat
Definition: hist_util.h:311