xgboost
group_data.h
Go to the documentation of this file.
1 
14 #ifndef XGBOOST_COMMON_GROUP_DATA_H_
15 #define XGBOOST_COMMON_GROUP_DATA_H_
16 
17 #include <cstddef>
18 #include <vector>
19 #include <algorithm>
20 #include <utility>
21 
22 #include "xgboost/base.h"
23 
24 namespace xgboost {
25 namespace common {
32 template<typename ValueType, typename SizeType = bst_ulong, bool is_row_major = false>
34  public:
45  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
46  std::vector<ValueType> *p_data,
47  size_t base_row_offset = 0)
48  : rptr_(*p_rptr),
49  data_(*p_data),
50  base_row_offset_(base_row_offset) {}
51 
59  void InitBudget(std::size_t max_key, int nthread) {
60  thread_rptr_.resize(nthread);
61  const size_t full_size = is_row_major ? max_key : max_key - std::min(base_row_offset_, max_key);
62  thread_displacement_ = is_row_major ? full_size / nthread : 0;
63  for (std::size_t i = 0; i < thread_rptr_.size() - 1; ++i) {
64  const size_t thread_size = is_row_major ? thread_displacement_ : full_size;
65  thread_rptr_[i].resize(thread_size, 0);
66  }
67  const size_t last_thread_size = is_row_major ? (full_size - (nthread - 1)*thread_displacement_)
68  : full_size;
69  thread_rptr_[nthread - 1].resize(last_thread_size, 0);
70  }
71 
78  void AddBudget(std::size_t key, int threadid, SizeType nelem = 1) {
79  std::vector<SizeType> &trptr = thread_rptr_[threadid];
80  size_t offset_key = is_row_major ? (key - base_row_offset_ - threadid*thread_displacement_)
81  : (key - base_row_offset_);
82  if (trptr.size() < offset_key + 1) {
83  trptr.resize(offset_key + 1, 0);
84  }
85  trptr[offset_key] += nelem;
86  }
87 
89  inline void InitStorage() {
90  if (is_row_major) {
91  size_t expected_rows = 0;
92  for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
93  expected_rows += thread_rptr_[tid].size();
94  }
95  // initialize rptr to be beginning of each segment
96  SizeType rptr_fill_value = rptr_.empty() ? 0 : rptr_.back();
97  rptr_.resize(expected_rows + base_row_offset_ + 1, rptr_fill_value);
98 
99  std::size_t count = 0;
100  size_t offset_idx = base_row_offset_ + 1;
101  for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
102  std::vector<SizeType> &trptr = thread_rptr_[tid];
103  for (std::size_t i = 0; i < trptr.size(); ++i) {
104  std::size_t thread_count = trptr[i]; // how many entries in this row
105  trptr[i] = count + rptr_fill_value;
106  count += thread_count;
107  if (offset_idx < rptr_.size()) {
108  rptr_[offset_idx++] += count;
109  }
110  }
111  }
112  data_.resize(rptr_.back()); // usage of empty allocator can help to improve performance
113  } else {
114  // set rptr to correct size
115  SizeType rptr_fill_value = rptr_.empty() ? 0 : rptr_.back();
116  for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
117  if (rptr_.size() <= thread_rptr_[tid].size() + base_row_offset_) {
118  rptr_.resize(thread_rptr_[tid].size() + base_row_offset_ + 1,
119  rptr_fill_value); // key + 1
120  }
121  }
122  // initialize rptr to be beginning of each segment
123  std::size_t count = 0;
124  for (std::size_t i = base_row_offset_; i + 1 < rptr_.size(); ++i) {
125  for (std::size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
126  std::vector<SizeType> &trptr = thread_rptr_[tid];
127  if (i < trptr.size() +
128  base_row_offset_) { // i^th row is assigned for this thread
129  std::size_t thread_count =
130  trptr[i - base_row_offset_]; // how many entries in this row
131  trptr[i - base_row_offset_] = count + rptr_.back();
132  count += thread_count;
133  }
134  }
135  rptr_[i + 1] += count; // pointer accumulated from all thread
136  }
137  data_.resize(rptr_.back());
138  }
139  }
140 
149  void Push(std::size_t key, ValueType&& value, int threadid) {
150  size_t offset_key = is_row_major ? (key - base_row_offset_ - threadid * thread_displacement_)
151  : (key - base_row_offset_);
152  SizeType &rp = thread_rptr_[threadid][offset_key];
153  data_[rp++] = std::move(value);
154  }
155 
156  private:
// Row-pointer vector supplied to the constructor; resized and filled by InitStorage.
158  std::vector<SizeType> &rptr_;
// Data vector supplied to the constructor; resized by InitStorage, written by Push.
160  std::vector<ValueType> &data_;
// One counter array per thread: budgets after AddBudget, write cursors after InitStorage.
162  std::vector<std::vector<SizeType> > thread_rptr_;
// Offset subtracted from every key passed to AddBudget/Push.
164  size_t base_row_offset_;
// Per-thread key stride used in row-major mode; set by InitBudget (0 otherwise).
166  size_t thread_displacement_;
167 };
168 } // namespace common
169 } // namespace xgboost
170 #endif // XGBOOST_COMMON_GROUP_DATA_H_
base.h
defines configuration macros of xgboost.
xgboost::common::ParallelGroupBuilder
multi-thread version of group builder
Definition: group_data.h:33
xgboost::common::ParallelGroupBuilder::InitBudget
void InitBudget(std::size_t max_key, int nthread)
step 1: initialize the helper, with a hint of the number of keys and the number of threads used in the construction
Definition: group_data.h:59
xgboost::common::ParallelGroupBuilder::AddBudget
void AddBudget(std::size_t key, int threadid, SizeType nelem=1)
step 2: add budget to each key
Definition: group_data.h:78
xgboost::common::ParallelGroupBuilder::Push
void Push(std::size_t key, ValueType &&value, int threadid)
step 4: add data to the allocated space; the calls to this function should exactly match previous ...
Definition: group_data.h:149
xgboost::common::ParallelGroupBuilder::InitStorage
void InitStorage()
step 3: initialize the necessary storage
Definition: group_data.h:89
xgboost::common::ParallelGroupBuilder::ParallelGroupBuilder
ParallelGroupBuilder(std::vector< SizeType > *p_rptr, std::vector< ValueType > *p_data, size_t base_row_offset=0)
parallel group builder of data.
Definition: group_data.h:45
xgboost
namespace of xgboost
Definition: base.h:110