8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ 9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_ 30 template <
typename BinIdxType>
36 index_base_(index_base) {}
39 return index_base_ +
static_cast<uint32_t
>(index_[idx]);
44 const uint32_t
GetBaseIdx()
const {
return index_base_; }
59 const uint32_t index_base_;
62 template <
typename BinIdxType>
67 :
Column<BinIdxType>(type, index, index_base),
70 const size_t*
GetRowData()
const {
return row_ind_.data(); }
73 return row_ind_.data()[idx];
81 template <
typename BinIdxType>
85 uint32_t index_base,
const std::vector<bool>& missing_flags,
86 size_t feature_offset)
87 :
Column<BinIdxType>(type, index, index_base),
88 missing_flags_(missing_flags),
89 feature_offset_(feature_offset) {}
90 bool IsMissing(
size_t idx)
const {
return missing_flags_[feature_offset_ + idx]; }
93 const std::vector<bool>& missing_flags_;
94 size_t feature_offset_;
103 return static_cast<bst_uint>(type_.size());
108 double sparse_threshold) {
109 const int32_t nfeature =
static_cast<int32_t
>(gmat.
cut.
Ptrs().size() - 1);
110 const size_t nrow = gmat.
row_ptr.size() - 1;
112 feature_counts_.resize(nfeature);
113 type_.resize(nfeature);
114 std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
115 uint32_t max_val = std::numeric_limits<uint32_t>::max();
116 for (int32_t fid = 0; fid < nfeature; ++fid) {
117 CHECK_LE(gmat.
cut.
Ptrs()[fid + 1] - gmat.
cut.
Ptrs()[fid], max_val);
119 bool all_dense = gmat.
IsDense();
122 for (int32_t fid = 0; fid < nfeature; ++fid) {
123 if (static_cast<double>(feature_counts_[fid])
124 < sparse_threshold * nrow) {
134 feature_offsets_.resize(nfeature + 1);
135 size_t accum_index_ = 0;
136 feature_offsets_[0] = accum_index_;
137 for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
139 accum_index_ +=
static_cast<size_t>(nrow);
141 accum_index_ += feature_counts_[fid - 1];
143 feature_offsets_[fid] = accum_index_;
148 index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
150 row_ind_.resize(feature_offsets_[nfeature]);
154 index_base_ =
const_cast<uint32_t*
>(gmat.
cut.
Ptrs().data());
156 const bool noMissingValues = NoMissingValues(gmat.
row_ptr[nrow], nrow, nfeature);
158 if (noMissingValues) {
159 missing_flags_.resize(feature_offsets_[nfeature],
false);
161 missing_flags_.resize(feature_offsets_[nfeature],
true);
168 SetIndexAllDense(gmat.
index.
data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
170 SetIndexAllDense(gmat.
index.
data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
173 SetIndexAllDense(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
179 SetIndex<uint8_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
181 SetIndex<uint16_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
184 SetIndex<uint32_t>(gmat.
index.
data<uint32_t>(), gmat, nrow, nfeature);
191 if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
193 }
else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
202 template <
typename BinIdxType>
203 std::unique_ptr<const Column<BinIdxType> >
GetColumn(
unsigned fid)
const {
204 CHECK_EQ(
sizeof(BinIdxType), bins_type_size_);
206 const size_t feature_offset = feature_offsets_[fid];
207 const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
209 &index_[feature_offset * bins_type_size_]),
211 std::unique_ptr<const Column<BinIdxType> > res;
214 missing_flags_, feature_offset));
217 {&row_ind_[feature_offset], column_size}));
224 const size_t nfeature,
const bool noMissingValues) {
225 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
229 if (noMissingValues) {
230 #pragma omp parallel for num_threads(omp_get_max_threads()) 231 for (
omp_ulong rid = 0; rid < nrow; ++rid) {
232 const size_t ibegin = rid*nfeature;
233 const size_t iend = (rid+1)*nfeature;
235 for (
size_t i = ibegin; i < iend; ++i, ++j) {
236 const size_t idx = feature_offsets_[j];
237 local_index[idx + rid] = index[i];
245 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
246 const size_t batch_size = batch.Size();
247 CHECK_LT(batch_size, offset_vec.size());
248 for (
size_t rid = 0; rid < batch_size; ++rid) {
249 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
251 const size_t ibegin = gmat.
row_ptr[rbegin + rid];
252 const size_t iend = gmat.
row_ptr[rbegin + rid + 1];
253 CHECK_EQ(ibegin + inst.
size(), iend);
256 for (
size_t i = ibegin; i < iend; ++i, ++j) {
258 const size_t idx = feature_offsets_[fid];
260 local_index[idx + rbegin + rid] = index[i];
261 missing_flags_[idx + rbegin + rid] =
false;
264 rbegin += batch.Size();
271 const size_t nrow,
const size_t nfeature) {
272 std::vector<size_t> num_nonzeros;
273 num_nonzeros.resize(nfeature);
274 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
276 T* local_index =
reinterpret_cast<T*
>(&index_[0]);
280 const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
281 const size_t batch_size = batch.Size();
282 CHECK_LT(batch_size, offset_vec.size());
283 for (
size_t rid = 0; rid < batch_size; ++rid) {
284 const size_t ibegin = gmat.
row_ptr[rbegin + rid];
285 const size_t iend = gmat.
row_ptr[rbegin + rid + 1];
287 const size_t size = offset_vec[rid + 1] - offset_vec[rid];
290 CHECK_EQ(ibegin + inst.
size(), iend);
292 for (
size_t i = ibegin; i < iend; ++i, ++j) {
293 const uint32_t bin_id = index[i];
297 T* begin = &local_index[feature_offsets_[fid]];
298 begin[rid + rbegin] = bin_id - index_base_[fid];
299 missing_flags_[feature_offsets_[fid] + rid + rbegin] =
false;
301 T* begin = &local_index[feature_offsets_[fid]];
302 begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
303 row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
308 rbegin += batch.Size();
312 return bins_type_size_;
315 const size_t n_row,
const size_t n_features) {
316 return n_elements == n_features * n_row;
320 std::vector<uint8_t> index_;
322 std::vector<size_t> feature_counts_;
323 std::vector<ColumnType> type_;
324 std::vector<size_t> row_ind_;
326 std::vector<size_t> feature_offsets_;
329 uint32_t* index_base_;
330 std::vector<bool> missing_flags_;
336 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_ Index index
The index data.
Definition: hist_util.h:318
SparseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, common::Span< const size_t > row_ind)
Definition: column_matrix.h:65
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
Definition: column_matrix.h:82
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
Definition: hist_util.h:215
const size_t * GetRowData() const
Definition: column_matrix.h:70
const uint32_t GetBaseIdx() const
Definition: column_matrix.h:44
T * data() const
Definition: hist_util.h:257
bst_uint GetNumFeature() const
Definition: column_matrix.h:102
dmlc::omp_ulong omp_ulong
Defines an unsigned long type for OpenMP loops.
Definition: base.h:251
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:322
Column(ColumnType type, common::Span< const BinIdxType > index, const uint32_t index_base)
Definition: column_matrix.h:33
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:211
ColumnType GetType() const
Definition: column_matrix.h:48
bool IsDense() const
Definition: hist_util.h:353
A column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_base.
Definition: column_matrix.h:31
Definition: hist_util.h:216
BatchSet< T > GetBatches(const BatchParam &param={})
Gets batches. Use range based for loop over BatchSet to access individual batches.
const BinTypeSize GetTypeSize() const
Definition: column_matrix.h:311
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:253
Definition: column_matrix.h:63
void SetTypeSize(size_t max_num_bins)
Definition: column_matrix.h:190
Utility for fast histogram aggregation.
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:99
BinTypeSize
Definition: hist_util.h:214
const bool NoMissingValues(const size_t n_elements, const size_t n_row, const size_t n_features)
Definition: column_matrix.h:314
namespace of xgboost
Definition: base.h:102
std::unique_ptr< const Column< BinIdxType > > GetColumn(unsigned fid) const
Definition: column_matrix.h:203
common::Span< const BinIdxType > GetFeatureBinIdxPtr() const
Definition: column_matrix.h:46
DenseColumn(ColumnType type, common::Span< const BinIdxType > index, uint32_t index_base, const std::vector< bool > &missing_flags, size_t feature_offset)
Definition: column_matrix.h:84
size_t max_num_bins
Definition: hist_util.h:324
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:107
size_t Size() const
Definition: column_matrix.h:51
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:316
uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:38
Element from a sparse vector.
Definition: data.h:167
ColumnType
column type
Definition: column_matrix.h:21
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Definition: column_matrix.h:22
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:72
preprocessed global index matrix, in CSR format
Definition: hist_util.h:314
BinIdxType GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:42
void SetIndex(uint32_t *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature)
Definition: column_matrix.h:270
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:343
void SetIndexAllDense(T *index, const GHistIndexMatrix &gmat, const size_t nrow, const size_t nfeature, const bool noMissingValues)
Definition: column_matrix.h:223
Definition: column_matrix.h:23
Definition: hist_util.h:217
bool IsMissing(size_t idx) const
Definition: column_matrix.h:90
DMatrix * p_fmat
Definition: hist_util.h:323