xgboost
row_set.h
Go to the documentation of this file.
1 
7 #ifndef XGBOOST_COMMON_ROW_SET_H_
8 #define XGBOOST_COMMON_ROW_SET_H_
9 
10 #include <xgboost/data.h>
11 #include <algorithm>
12 #include <vector>
13 #include <utility>
14 #include <memory>
15 
16 namespace xgboost {
17 namespace common {
18 
21  public:
25  struct Elem {
26  const size_t* begin{nullptr};
27  const size_t* end{nullptr};
29  // id of node associated with this instance set; -1 means uninitialized
30  Elem()
31  = default;
32  Elem(const size_t* begin,
33  const size_t* end,
34  bst_node_t node_id = -1)
35  : begin(begin), end(end), node_id(node_id) {}
36 
37  inline size_t Size() const {
38  return end - begin;
39  }
40  };
41  /* \brief specifies how to split a rowset into two */
42  struct Split {
43  std::vector<size_t> left;
44  std::vector<size_t> right;
45  };
46 
47  inline std::vector<Elem>::const_iterator begin() const { // NOLINT
48  return elem_of_each_node_.begin();
49  }
50 
51  inline std::vector<Elem>::const_iterator end() const { // NOLINT
52  return elem_of_each_node_.end();
53  }
54 
56  inline const Elem& operator[](unsigned node_id) const {
57  const Elem& e = elem_of_each_node_[node_id];
58  CHECK(e.begin != nullptr)
59  << "access element that is not in the set";
60  return e;
61  }
62 
64  inline Elem& operator[](unsigned node_id) {
65  Elem& e = elem_of_each_node_[node_id];
66  return e;
67  }
68 
69  // clear up things
70  inline void Clear() {
71  elem_of_each_node_.clear();
72  }
73  // initialize node id 0->everything
74  inline void Init() {
75  CHECK_EQ(elem_of_each_node_.size(), 0U);
76 
77  if (row_indices_.empty()) { // edge case: empty instance set
78  // assign arbitrary address here, to bypass nullptr check
79  // (nullptr usually indicates a nonexistent rowset, but we want to
80  // indicate a valid rowset that happens to have zero length and occupies
81  // the whole instance set)
82  // this is okay, as BuildHist will compute (end-begin) as the set size
83  const size_t* begin = reinterpret_cast<size_t*>(20);
84  const size_t* end = begin;
85  elem_of_each_node_.emplace_back(Elem(begin, end, 0));
86  return;
87  }
88 
89  const size_t* begin = dmlc::BeginPtr(row_indices_);
90  const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
91  elem_of_each_node_.emplace_back(Elem(begin, end, 0));
92  }
93 
94  std::vector<size_t>* Data() { return &row_indices_; }
95  // split rowset into two
96  inline void AddSplit(unsigned node_id,
97  unsigned left_node_id,
98  unsigned right_node_id,
99  size_t n_left,
100  size_t n_right) {
101  const Elem e = elem_of_each_node_[node_id];
102  CHECK(e.begin != nullptr);
103  size_t* all_begin = dmlc::BeginPtr(row_indices_);
104  size_t* begin = all_begin + (e.begin - all_begin);
105 
106  CHECK_EQ(n_left + n_right, e.Size());
107  CHECK_LE(begin + n_left, e.end);
108  CHECK_EQ(begin + n_left + n_right, e.end);
109 
110  if (left_node_id >= elem_of_each_node_.size()) {
111  elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
112  }
113  if (right_node_id >= elem_of_each_node_.size()) {
114  elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
115  }
116 
117  elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
118  elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
119  elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
120  }
121 
122  private:
123  // stores the row indexes in the set
124  std::vector<size_t> row_indices_;
125  // vector: node_id -> elements
126  std::vector<Elem> elem_of_each_node_;
127 };
128 
129 
130 // The builder is required for samples partition to left and rights children for set of nodes
131 // Responsible for:
132 // 1) Effective memory allocation for intermediate results for multi-thread work
133 // 2) Merging partial results produced by threads into original row set (row_set_collection_)
134 // BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
135 template<size_t BlockSize>
137  public:
138  template<typename Func>
139  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) {
140  left_right_nodes_sizes_.resize(n_nodes);
141  blocks_offsets_.resize(n_nodes+1);
142 
143  blocks_offsets_[0] = 0;
144  for (size_t i = 1; i < n_nodes+1; ++i) {
145  blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1);
146  }
147 
148  if (n_tasks > max_n_tasks_) {
149  mem_blocks_.resize(n_tasks);
150  max_n_tasks_ = n_tasks;
151  }
152  }
153 
154  // allocate thread local memory, should be called for each specific task
155  void AllocateForTask(size_t id) {
156  if (mem_blocks_[id].get() == nullptr) {
157  BlockInfo* local_block_ptr = new BlockInfo;
158  CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
159  mem_blocks_[id].reset(local_block_ptr);
160  }
161  }
162 
163  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
164  const size_t task_idx = GetTaskIdx(nid, begin);
165  return { mem_blocks_.at(task_idx)->Left(), end - begin };
166  }
167 
168  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
169  const size_t task_idx = GetTaskIdx(nid, begin);
170  return { mem_blocks_.at(task_idx)->Right(), end - begin };
171  }
172 
173  void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
174  size_t task_idx = GetTaskIdx(nid, begin);
175  mem_blocks_.at(task_idx)->n_left = n_left;
176  }
177 
178  void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
179  size_t task_idx = GetTaskIdx(nid, begin);
180  mem_blocks_.at(task_idx)->n_right = n_right;
181  }
182 
183 
184  size_t GetNLeftElems(int nid) const {
185  return left_right_nodes_sizes_[nid].first;
186  }
187 
188  size_t GetNRightElems(int nid) const {
189  return left_right_nodes_sizes_[nid].second;
190  }
191 
192  // Each thread has partial results for some set of tree-nodes
193  // The function decides order of merging partial results into final row set
195  for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
196  size_t n_left = 0;
197  for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
198  mem_blocks_[j]->n_offset_left = n_left;
199  n_left += mem_blocks_[j]->n_left;
200  }
201  size_t n_right = 0;
202  for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
203  mem_blocks_[j]->n_offset_right = n_left + n_right;
204  n_right += mem_blocks_[j]->n_right;
205  }
206  left_right_nodes_sizes_[i] = {n_left, n_right};
207  }
208  }
209 
210  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
211  size_t task_idx = GetTaskIdx(nid, begin);
212 
213  size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
214  size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
215 
216  const size_t* left = mem_blocks_[task_idx]->Left();
217  const size_t* right = mem_blocks_[task_idx]->Right();
218 
219  std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
220  std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
221  }
222 
223  size_t GetTaskIdx(int nid, size_t begin) {
224  return blocks_offsets_[nid] + begin / BlockSize;
225  }
226 
227  protected:
228  struct BlockInfo{
229  size_t n_left;
230  size_t n_right;
231 
234 
235  size_t* Left() {
236  return &left_data_[0];
237  }
238 
239  size_t* Right() {
240  return &right_data_[0];
241  }
242  private:
243  size_t left_data_[BlockSize];
244  size_t right_data_[BlockSize];
245  };
246  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
247  std::vector<size_t> blocks_offsets_;
248  std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
249  size_t max_n_tasks_ = 0;
250 };
251 
252 
253 } // namespace common
254 } // namespace xgboost
255 
256 #endif // XGBOOST_COMMON_ROW_SET_H_
xgboost::common::PartitionBuilder::MergeToArray
void MergeToArray(int nid, size_t begin, size_t *rows_indexes)
Definition: row_set.h:210
xgboost::common::RowSetCollection::Split::right
std::vector< size_t > right
Definition: row_set.h:44
xgboost::common::RowSetCollection::end
std::vector< Elem >::const_iterator end() const
Definition: row_set.h:51
xgboost::common::PartitionBuilder::BlockInfo::Right
size_t * Right()
Definition: row_set.h:239
xgboost::common::PartitionBuilder
Definition: row_set.h:136
xgboost::common::PartitionBuilder::GetRightBuffer
common::Span< size_t > GetRightBuffer(int nid, size_t begin, size_t end)
Definition: row_set.h:168
xgboost::common::RowSetCollection::operator[]
const Elem & operator[](unsigned node_id) const
return corresponding element set given the node_id
Definition: row_set.h:56
xgboost::common::RowSetCollection::Elem::Size
size_t Size() const
Definition: row_set.h:37
xgboost::common::RowSetCollection::Split
Definition: row_set.h:42
xgboost::common::PartitionBuilder::max_n_tasks_
size_t max_n_tasks_
Definition: row_set.h:249
xgboost::common::PartitionBuilder::Init
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks)
Definition: row_set.h:139
xgboost::common::PartitionBuilder::mem_blocks_
std::vector< std::shared_ptr< BlockInfo > > mem_blocks_
Definition: row_set.h:248
xgboost::common::RowSetCollection::Elem::Elem
Elem()=default
xgboost::common::PartitionBuilder::BlockInfo::n_offset_right
size_t n_offset_right
Definition: row_set.h:233
xgboost::common::RowSetCollection
collection of rowset
Definition: row_set.h:20
xgboost::common::PartitionBuilder::BlockInfo::n_offset_left
size_t n_offset_left
Definition: row_set.h:232
xgboost::common::RowSetCollection::begin
std::vector< Elem >::const_iterator begin() const
Definition: row_set.h:47
xgboost::common::PartitionBuilder::BlockInfo::n_left
size_t n_left
Definition: row_set.h:229
xgboost::common::RowSetCollection::Split::left
std::vector< size_t > left
Definition: row_set.h:43
xgboost::common::PartitionBuilder::left_right_nodes_sizes_
std::vector< std::pair< size_t, size_t > > left_right_nodes_sizes_
Definition: row_set.h:246
xgboost::bst_node_t
int32_t bst_node_t
Type for tree node index.
Definition: base.h:132
xgboost::common::RowSetCollection::operator[]
Elem & operator[](unsigned node_id)
return corresponding element set given the node_id
Definition: row_set.h:64
xgboost::common::RowSetCollection::Elem::node_id
bst_node_t node_id
Definition: row_set.h:28
xgboost::common::PartitionBuilder::GetLeftBuffer
common::Span< size_t > GetLeftBuffer(int nid, size_t begin, size_t end)
Definition: row_set.h:163
xgboost::common::PartitionBuilder::BlockInfo
Definition: row_set.h:228
xgboost::common::RowSetCollection::Data
std::vector< size_t > * Data()
Definition: row_set.h:94
xgboost::common::RowSetCollection::AddSplit
void AddSplit(unsigned node_id, unsigned left_node_id, unsigned right_node_id, size_t n_left, size_t n_right)
Definition: row_set.h:96
xgboost::common::PartitionBuilder::BlockInfo::n_right
size_t n_right
Definition: row_set.h:230
xgboost::common::PartitionBuilder::BlockInfo::Left
size_t * Left()
Definition: row_set.h:235
xgboost::common::RowSetCollection::Elem::end
const size_t * end
Definition: row_set.h:27
xgboost::common::Span
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:142
xgboost::common::PartitionBuilder::GetTaskIdx
size_t GetTaskIdx(int nid, size_t begin)
Definition: row_set.h:223
data.h
The input data structure of xgboost.
xgboost::common::PartitionBuilder::SetNRightElems
void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right)
Definition: row_set.h:178
xgboost::common::PartitionBuilder::AllocateForTask
void AllocateForTask(size_t id)
Definition: row_set.h:155
xgboost::common::PartitionBuilder::GetNLeftElems
size_t GetNLeftElems(int nid) const
Definition: row_set.h:184
xgboost::common::RowSetCollection::Clear
void Clear()
Definition: row_set.h:70
xgboost::common::PartitionBuilder::blocks_offsets_
std::vector< size_t > blocks_offsets_
Definition: row_set.h:247
xgboost::common::RowSetCollection::Elem::begin
const size_t * begin
Definition: row_set.h:26
xgboost::common::PartitionBuilder::CalculateRowOffsets
void CalculateRowOffsets()
Definition: row_set.h:194
xgboost::common::RowSetCollection::Elem::Elem
Elem(const size_t *begin, const size_t *end, bst_node_t node_id=-1)
Definition: row_set.h:32
xgboost::get
auto get(U &json) -> decltype(detail::GetImpl(*Cast< T >(&json.GetValue())))&
Get Json value.
Definition: json.h:556
xgboost::common::RowSetCollection::Init
void Init()
Definition: row_set.h:74
xgboost::common::PartitionBuilder::SetNLeftElems
void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left)
Definition: row_set.h:173
xgboost::common::RowSetCollection::Elem
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:25
xgboost::common::PartitionBuilder::GetNRightElems
size_t GetNRightElems(int nid) const
Definition: row_set.h:188
xgboost
namespace of xgboost
Definition: base.h:110