8 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ 9 #define XGBOOST_COMMON_COLUMN_MATRIX_H_ 31 const size_t* row_ind,
size_t len)
34 index_base_(index_base),
/*! \brief return the stored length len_ of this column (does not modify the object) */
37 size_t Size()
const {
return len_; }
50 return index_[idx] == std::numeric_limits<uint32_t>::max();
56 const uint32_t* index_;
58 const size_t* row_ind_;
68 return static_cast<bst_uint>(type_.size());
73 double sparse_threshold) {
74 const int32_t nfeature =
static_cast<int32_t
>(gmat.
cut.
row_ptr.size() - 1);
75 const size_t nrow = gmat.
row_ptr.size() - 1;
78 feature_counts_.resize(nfeature);
79 type_.resize(nfeature);
80 std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
82 uint32_t max_val = std::numeric_limits<uint32_t>::max();
83 for (
bst_uint fid = 0; fid < nfeature; ++fid) {
89 for (int32_t fid = 0; fid < nfeature; ++fid) {
90 if (static_cast<double>(feature_counts_[fid])
91 < sparse_threshold * nrow) {
100 boundary_.resize(nfeature);
101 size_t accum_index_ = 0;
102 size_t accum_row_ind_ = 0;
103 for (int32_t fid = 0; fid < nfeature; ++fid) {
104 boundary_[fid].index_begin = accum_index_;
105 boundary_[fid].row_ind_begin = accum_row_ind_;
107 accum_index_ +=
static_cast<size_t>(nrow);
108 accum_row_ind_ +=
static_cast<size_t>(nrow);
110 accum_index_ += feature_counts_[fid];
111 accum_row_ind_ += feature_counts_[fid];
113 boundary_[fid].index_end = accum_index_;
114 boundary_[fid].row_ind_end = accum_row_ind_;
117 index_.resize(boundary_[nfeature - 1].index_end);
118 row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
121 index_base_.resize(nfeature);
122 for (
bst_uint fid = 0; fid < nfeature; ++fid) {
128 #pragma omp parallel for 129 for (int32_t fid = 0; fid < nfeature; ++fid) {
131 const size_t ibegin = boundary_[fid].index_begin;
132 uint32_t* begin = &index_[ibegin];
133 uint32_t* end = begin + nrow;
134 std::fill(begin, end, std::numeric_limits<uint32_t>::max());
141 std::vector<size_t> num_nonzeros;
142 num_nonzeros.resize(nfeature);
143 std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
144 for (
size_t rid = 0; rid < nrow; ++rid) {
145 const size_t ibegin = gmat.
row_ptr[rid];
146 const size_t iend = gmat.
row_ptr[rid + 1];
148 for (
size_t i = ibegin; i < iend; ++i) {
149 const uint32_t bin_id = gmat.
index[i];
154 uint32_t* begin = &index_[boundary_[fid].index_begin];
155 begin[rid] = bin_id - index_base_[fid];
157 uint32_t* begin = &index_[boundary_[fid].index_begin];
158 begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
159 row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
169 Column c(type_[fid], &index_[boundary_[fid].index_begin], index_base_[fid],
171 &row_ind_[boundary_[fid].row_ind_begin] :
nullptr),
172 boundary_[fid].index_end - boundary_[fid].index_begin);
177 struct ColumnBoundary {
183 size_t row_ind_begin;
187 std::vector<size_t> feature_counts_;
188 std::vector<ColumnType> type_;
191 std::vector<ColumnBoundary> boundary_;
194 std::vector<uint32_t> index_base_;
199 #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_ uint32_t GetGlobalBinIdx(size_t idx) const
Definition: column_matrix.h:38
uint32_t GetBaseIdx() const
Definition: column_matrix.h:42
bst_uint GetNumFeature() const
Definition: column_matrix.h:67
std::vector< uint32_t > index
The index data.
Definition: hist_util.h:138
std::vector< uint32_t > row_ptr
Unit pointer to rows by element position.
Definition: hist_util.h:93
A column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_base.
Definition: column_matrix.h:28
size_t GetRowIdx(size_t idx) const
Definition: column_matrix.h:44
bool IsMissing(size_t idx) const
Definition: column_matrix.h:49
const size_t * GetRowData() const
Definition: column_matrix.h:52
HistCutMatrix cut
The corresponding cuts.
Definition: hist_util.h:142
size_t Size() const
Definition: column_matrix.h:37
Utility for fast histogram aggregation.
uint32_t GetFeatureBinIdx(size_t idx) const
Definition: column_matrix.h:39
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:64
namespace of xgboost
Definition: base.h:79
void Init(const GHistIndexMatrix &gmat, double sparse_threshold)
Definition: column_matrix.h:72
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:136
ColumnType GetType() const
Definition: column_matrix.h:43
ColumnType
column type
Definition: column_matrix.h:21
Column(ColumnType type, const uint32_t *index, uint32_t index_base, const size_t *row_ind, size_t len)
Definition: column_matrix.h:30
uint32_t bst_uint
unsigned integer type used in boost, used for feature index and row index.
Definition: base.h:84
Definition: column_matrix.h:22
Preprocessed global index matrix, in CSR format. Transforms floating values to integer indices in the histogram.
Definition: hist_util.h:134
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:151
Definition: column_matrix.h:23
Column GetColumn(unsigned fid) const
Definition: column_matrix.h:168