7 #ifndef XGBOOST_DATA_H_ 8 #define XGBOOST_DATA_H_ 10 #include <dmlc/base.h> 11 #include <dmlc/data.h> 12 #include <rabit/rabit.h> 20 #include "../../src/common/span.h" 21 #include "../../src/common/group_data.h" 23 #include "../../src/common/host_device_vector.h" 90 return root_index_.size() != 0 ? root_index_[i] : 0U;
94 if (label_order_cache_.size() == labels_.
Size()) {
95 return label_order_cache_;
97 label_order_cache_.resize(labels_.
Size());
98 std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
101 [&l](
size_t i1,
size_t i2) {
return std::abs(l[i1]) < std::abs(l[i2]);});
103 return label_order_cache_;
124 void SetInfo(
const char* key,
const void* dptr,
DataType dtype,
size_t num);
128 mutable std::vector<size_t> label_order_cache_;
150 return (this->index == other.
index && this->fvalue == other.
fvalue);
176 if (rabit::IsDistributed() && i + 1 >= offset_vec.size()) {
179 size = offset_vec[i + 1] - offset_vec[i];
181 return {data_vec.data() + offset_vec[i],
191 return offset.
Size() - 1;
195 return offset.
Size() *
sizeof(size_t) + data.
Size() *
sizeof(
Entry);
202 offset_vec.push_back(0);
209 &transpose.
data.HostVector());
210 const int nthread = omp_get_max_threads();
211 builder.InitBudget(num_columns, nthread);
212 long batch_size =
static_cast<long>(this->Size());
213 #pragma omp parallel for schedule(static) 214 for (
long i = 0; i < batch_size; ++i) {
215 int tid = omp_get_thread_num();
216 auto inst = (*this)[i];
217 for (
bst_uint j = 0; j < inst.size(); ++j) {
218 builder.AddBudget(inst[j].index, tid);
221 builder.InitStorage();
222 #pragma omp parallel for schedule(static) 223 for (
long i = 0; i < batch_size; ++i) {
224 int tid = omp_get_thread_num();
225 auto inst = (*this)[i];
226 for (
bst_uint j = 0; j < inst.size(); ++j) {
229 Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue),
238 #pragma omp parallel for schedule(dynamic, 1) 253 void Push(
const dmlc::RowBlock<uint32_t>& batch);
271 offset_vec.push_back(offset_vec.back() + inst.
size());
272 size_t begin = data_vec.size();
273 data_vec.resize(begin + inst.
size());
274 if (inst.
size() != 0) {
275 std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.
data(),
288 virtual const SparsePage& operator*()
const = 0;
289 virtual void operator++() = 0;
290 virtual bool AtEnd()
const = 0;
300 impl_.reset(other.impl_->Clone());
307 CHECK(impl_ !=
nullptr);
312 CHECK(impl_ !=
nullptr);
317 CHECK(impl_ !=
nullptr);
322 CHECK(impl_ !=
nullptr);
323 return !impl_->AtEnd();
327 CHECK(impl_ !=
nullptr);
328 return impl_->AtEnd();
332 std::unique_ptr<BatchIteratorImpl> impl_;
368 inline bst_uint operator[](
size_t i)
const;
370 inline size_t Size()
const;
379 inline void Save(dmlc::Stream* fo)
const;
385 inline bool Load(dmlc::Stream* fi);
393 std::vector<bst_uint> rows_;
413 virtual const MetaInfo& Info()
const = 0;
417 virtual BatchSet GetRowBatches() = 0;
418 virtual BatchSet GetSortedColumnBatches() = 0;
419 virtual BatchSet GetColumnBatches() = 0;
422 virtual bool SingleColBlock()
const = 0;
424 virtual float GetColDensity(
size_t cidx) = 0;
434 virtual void SaveToLocalFile(
const std::string& fname);
445 static DMatrix* Load(
const std::string& uri,
448 const std::string& file_format =
"auto",
449 const size_t page_size = kPageSize);
457 static DMatrix* Create(std::unique_ptr<DataSource>&& source,
458 const std::string& cache_prefix =
"");
472 static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
473 const std::string& cache_prefix =
"",
474 const size_t page_size = kPageSize);
477 static const size_t kPageSize = 32UL << 20UL;
482 return rows_.size() == 0 ?
static_cast<bst_uint>(i) : rows_[i];
490 rows_.clear(); size_ = 0;
494 if (rows_.size() == 0) {
499 for (
size_t i = 0; i < size_; ++i) {
500 rows_[i] =
static_cast<bst_uint>(i);
510 fo->Write(&size_,
sizeof(size_));
514 if (!fi->Read(&rows_))
return false;
515 if (rows_.size() != 0)
return true;
516 return fi->Read(&size_,
sizeof(size_)) ==
sizeof(size_);
524 #endif // XGBOOST_DATA_H_
float bst_float
float type, used for storing statistics
Definition: base.h:89
MetaInfo info
Meta information about the dataset. The subclass needs to be able to load this correctly from data...
Definition: data.h:358
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:502
bool Load(dmlc::Stream *fi)
Load rowset from file.
Definition: data.h:513
#define XGBOOST_PARALLEL_SORT(X, Y, Z)
Definition: base.h:65
void Save(dmlc::Stream *fo) const
Save rowset to file.
Definition: data.h:508
detail::ptrdiff_t index_type
Definition: span.h:387
BatchIterator end()
Definition: data.h:339
size_t Size() const
Definition: data.h:485
std::forward_iterator_tag iterator_category
Definition: data.h:295
void SortRows()
Definition: data.h:236
multi-thread version of group builder
Definition: group_data.h:27
dmlc::omp_uint bst_omp_uint
define unsigned int for openmp loop
Definition: base.h:208
size_t Size()
Definition: data.h:280
Internal data structure used by XGBoost during training. There are two ways to create a customized D...
Definition: data.h:406
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:157
DataType
data type accepted by xgboost interface
Definition: data.h:30
A vector-like structure to represent a set of rows, but one that saves memory when all rows are in the set (...
Definition: data.h:365
span class implementation, based on ISO C++20 span<T>. The interface should be the same.
Definition: span.h:109
DMLC_DECLARE_TRAITS(has_saveload, xgboost::RowSet, true)
virtual ~BatchIteratorImpl()
Definition: data.h:285
void PushBack(bst_uint i)
push the index back to the set
Definition: data.h:493
SparsePage GetTranspose(int num_columns) const
Definition: data.h:206
SparsePage()
constructor
Definition: data.h:186
void Push(const Inst &inst)
Push one instance into page.
Definition: data.h:268
bool operator!=(const BatchIterator &rhs) const
Definition: data.h:321
void operator++()
Definition: data.h:306
XGBOOST_DEVICE constexpr pointer data() const __span_noexcept
Definition: span.h:497
SparsePage & operator*()
Definition: data.h:311
std::vector< T > & HostVector()
bool operator==(const Entry &other) const
Definition: data.h:149
BatchIterator(BatchIteratorImpl *impl)
Definition: data.h:296
Inst operator[](size_t i) const
get i-th row from the batch
Definition: data.h:170
BatchIterator begin()
Definition: data.h:338
This is the data structure that a user can pass to DMatrix::Create to create a DMatrix for training...
Definition: data.h:352
HostDeviceVector< size_t > offset
Definition: data.h:160
bst_uint operator[](size_t i) const
Definition: data.h:481
size_t MemCostBytes() const
Definition: data.h:194
namespace of xgboost
Definition: base.h:79
BatchIterator(const BatchIterator &other)
Definition: data.h:298
void Clear()
clear the set
Definition: data.h:489
const SparsePage & operator*() const
Definition: data.h:316
defines configuration macros of xgboost.
size_t Size() const
Definition: data.h:190
size_t base_rowid
Definition: data.h:164
BatchSet(BatchIterator begin_iter)
Definition: data.h:337
HostDeviceVector< Entry > data
the data of the segments
Definition: data.h:162
Element from a sparse vector.
Definition: data.h:132
Entry(bst_uint index, bst_float fvalue)
constructor with index and value
Definition: data.h:144
uint32_t bst_uint
unsigned integer type used in boost, used for feature index and row index.
Definition: base.h:84
bst_float fvalue
feature value
Definition: data.h:136
static bool CmpValue(const Entry &a, const Entry &b)
reversely compare feature values
Definition: data.h:146
void Clear()
clear the page
Definition: data.h:198
bool AtEnd() const
Definition: data.h:326
bst_uint index
feature index
Definition: data.h:134