xgboost
row_set.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_ROW_SET_H_
8 #define XGBOOST_COMMON_ROW_SET_H_
9 
10 #include <xgboost/data.h>
11 #include <algorithm>
12 #include <vector>
13 #include <utility>
14 
15 namespace xgboost {
16 namespace common {
17 
20  public:
24  struct Elem {
25  const size_t* begin{nullptr};
26  const size_t* end{nullptr};
27  int node_id{-1};
28  // id of node associated with this instance set; -1 means uninitialized
29  Elem()
30  = default;
31  Elem(const size_t* begin,
32  const size_t* end,
33  int node_id = -1)
34  : begin(begin), end(end), node_id(node_id) {}
35 
36  inline size_t Size() const {
37  return end - begin;
38  }
39  };
40  /* \brief specifies how to split a rowset into two */
41  struct Split {
42  std::vector<size_t> left;
43  std::vector<size_t> right;
44  };
45 
46  inline std::vector<Elem>::const_iterator begin() const { // NOLINT
47  return elem_of_each_node_.begin();
48  }
49 
50  inline std::vector<Elem>::const_iterator end() const { // NOLINT
51  return elem_of_each_node_.end();
52  }
53 
55  inline const Elem& operator[](unsigned node_id) const {
56  const Elem& e = elem_of_each_node_[node_id];
57  CHECK(e.begin != nullptr)
58  << "access element that is not in the set";
59  return e;
60  }
61 
63  inline Elem& operator[](unsigned node_id) {
64  Elem& e = elem_of_each_node_[node_id];
65  return e;
66  }
67 
68  // clear up things
69  inline void Clear() {
70  elem_of_each_node_.clear();
71  }
72  // initialize node id 0->everything
73  inline void Init() {
74  CHECK_EQ(elem_of_each_node_.size(), 0U);
75 
76  if (row_indices_.empty()) { // edge case: empty instance set
77  // assign arbitrary address here, to bypass nullptr check
78  // (nullptr usually indicates a nonexistent rowset, but we want to
79  // indicate a valid rowset that happens to have zero length and occupies
80  // the whole instance set)
81  // this is okay, as BuildHist will compute (end-begin) as the set size
82  const size_t* begin = reinterpret_cast<size_t*>(20);
83  const size_t* end = begin;
84  elem_of_each_node_.emplace_back(Elem(begin, end, 0));
85  return;
86  }
87 
88  const size_t* begin = dmlc::BeginPtr(row_indices_);
89  const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
90  elem_of_each_node_.emplace_back(Elem(begin, end, 0));
91  }
92 
93  std::vector<size_t>* Data() { return &row_indices_; }
94  // split rowset into two
95  inline void AddSplit(unsigned node_id,
96  unsigned left_node_id,
97  unsigned right_node_id,
98  size_t n_left,
99  size_t n_right) {
100  const Elem e = elem_of_each_node_[node_id];
101  CHECK(e.begin != nullptr);
102  size_t* all_begin = dmlc::BeginPtr(row_indices_);
103  size_t* begin = all_begin + (e.begin - all_begin);
104 
105  CHECK_EQ(n_left + n_right, e.Size());
106  CHECK_LE(begin + n_left, e.end);
107  CHECK_EQ(begin + n_left + n_right, e.end);
108 
109  if (left_node_id >= elem_of_each_node_.size()) {
110  elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
111  }
112  if (right_node_id >= elem_of_each_node_.size()) {
113  elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
114  }
115 
116  elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
117  elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
118  elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
119  }
120 
121  private:
122  // stores the row indexes in the set
123  std::vector<size_t> row_indices_;
124  // vector: node_id -> elements
125  std::vector<Elem> elem_of_each_node_;
126 };
127 
128 
129 // The builder is required for samples partition to left and rights children for set of nodes
130 // Responsible for:
131 // 1) Effective memory allocation for intermediate results for multi-thread work
132 // 2) Merging partial results produced by threads into original row set (row_set_collection_)
133 // BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
134 template<size_t BlockSize>
136  public:
137  template<typename Func>
138  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) {
139  left_right_nodes_sizes_.resize(n_nodes);
140  blocks_offsets_.resize(n_nodes+1);
141 
142  blocks_offsets_[0] = 0;
143  for (size_t i = 1; i < n_nodes+1; ++i) {
144  blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1);
145  }
146 
147  if (n_tasks > max_n_tasks_) {
148  mem_blocks_.resize(n_tasks);
149  max_n_tasks_ = n_tasks;
150  }
151  }
152 
153  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
154  const size_t task_idx = GetTaskIdx(nid, begin);
155  return { mem_blocks_.at(task_idx).Left(), end - begin };
156  }
157 
158  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
159  const size_t task_idx = GetTaskIdx(nid, begin);
160  return { mem_blocks_.at(task_idx).Right(), end - begin };
161  }
162 
163  void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
164  size_t task_idx = GetTaskIdx(nid, begin);
165  mem_blocks_.at(task_idx).n_left = n_left;
166  }
167 
168  void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
169  size_t task_idx = GetTaskIdx(nid, begin);
170  mem_blocks_.at(task_idx).n_right = n_right;
171  }
172 
173 
174  size_t GetNLeftElems(int nid) const {
175  return left_right_nodes_sizes_[nid].first;
176  }
177 
178  size_t GetNRightElems(int nid) const {
179  return left_right_nodes_sizes_[nid].second;
180  }
181 
182  // Each thread has partial results for some set of tree-nodes
183  // The function decides order of merging partial results into final row set
185  for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
186  size_t n_left = 0;
187  for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
188  mem_blocks_[j].n_offset_left = n_left;
189  n_left += mem_blocks_[j].n_left;
190  }
191  size_t n_right = 0;
192  for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
193  mem_blocks_[j].n_offset_right = n_left + n_right;
194  n_right += mem_blocks_[j].n_right;
195  }
196  left_right_nodes_sizes_[i] = {n_left, n_right};
197  }
198  }
199 
200  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
201  size_t task_idx = GetTaskIdx(nid, begin);
202 
203  size_t* left_result = rows_indexes + mem_blocks_[task_idx].n_offset_left;
204  size_t* right_result = rows_indexes + mem_blocks_[task_idx].n_offset_right;
205 
206  const size_t* left = mem_blocks_[task_idx].Left();
207  const size_t* right = mem_blocks_[task_idx].Right();
208 
209  std::copy_n(left, mem_blocks_[task_idx].n_left, left_result);
210  std::copy_n(right, mem_blocks_[task_idx].n_right, right_result);
211  }
212 
213  protected:
214  size_t GetTaskIdx(int nid, size_t begin) {
215  return blocks_offsets_[nid] + begin / BlockSize;
216  }
217 
218  struct BlockInfo{
219  size_t n_left;
220  size_t n_right;
221 
224 
225  size_t* Left() {
226  return &left_data_[0];
227  }
228 
229  size_t* Right() {
230  return &right_data_[0];
231  }
232  private:
233  alignas(128) size_t left_data_[BlockSize];
234  alignas(128) size_t right_data_[BlockSize];
235  };
236  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
237  std::vector<size_t> blocks_offsets_;
238  std::vector<BlockInfo> mem_blocks_;
239  size_t max_n_tasks_ = 0;
240 };
241 
242 
243 } // namespace common
244 } // namespace xgboost
245 
246 #endif // XGBOOST_COMMON_ROW_SET_H_
size_t n_offset_left
Definition: row_set.h:222
Definition: row_set.h:135
void AddSplit(unsigned node_id, unsigned left_node_id, unsigned right_node_id, size_t n_left, size_t n_right)
Definition: row_set.h:95
collection of rowset
Definition: row_set.h:19
std::vector< Elem >::const_iterator begin() const
Definition: row_set.h:46
size_t n_left
Definition: row_set.h:219
The input data structure of xgboost.
std::vector< BlockInfo > mem_blocks_
Definition: row_set.h:238
void Init()
Definition: row_set.h:73
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:126
Elem & operator[](unsigned node_id)
return corresponding element set given the node_id
Definition: row_set.h:63
std::vector< Elem >::const_iterator end() const
Definition: row_set.h:50
common::Span< size_t > GetRightBuffer(int nid, size_t begin, size_t end)
Definition: row_set.h:158
size_t * Left()
Definition: row_set.h:225
int node_id
Definition: row_set.h:27
size_t n_offset_right
Definition: row_set.h:223
size_t GetNLeftElems(int nid) const
Definition: row_set.h:174
void CalculateRowOffsets()
Definition: row_set.h:184
namespace of xgboost
Definition: base.h:102
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:24
void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right)
Definition: row_set.h:168
void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left)
Definition: row_set.h:163
size_t * Right()
Definition: row_set.h:229
const size_t * begin
Definition: row_set.h:25
std::vector< size_t > blocks_offsets_
Definition: row_set.h:237
const Elem & operator[](unsigned node_id) const
return corresponding element set given the node_id
Definition: row_set.h:55
const size_t * end
Definition: row_set.h:26
void MergeToArray(int nid, size_t begin, size_t *rows_indexes)
Definition: row_set.h:200
std::vector< std::pair< size_t, size_t > > left_right_nodes_sizes_
Definition: row_set.h:236
std::vector< size_t > left
Definition: row_set.h:42
size_t GetNRightElems(int nid) const
Definition: row_set.h:178
std::vector< size_t > right
Definition: row_set.h:43
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks)
Definition: row_set.h:138
std::vector< size_t > * Data()
Definition: row_set.h:93
size_t GetTaskIdx(int nid, size_t begin)
Definition: row_set.h:214
void Clear()
Definition: row_set.h:69
Elem(const size_t *begin, const size_t *end, int node_id=-1)
Definition: row_set.h:31
common::Span< size_t > GetLeftBuffer(int nid, size_t begin, size_t end)
Definition: row_set.h:153
size_t Size() const
Definition: row_set.h:36
size_t n_right
Definition: row_set.h:220