transform.h
/*!
 * Copyright 2018 XGBoost contributors
 */
#ifndef XGBOOST_COMMON_TRANSFORM_H_
#define XGBOOST_COMMON_TRANSFORM_H_

#include <dmlc/omp.h>
#include <dmlc/common.h>

#include <xgboost/data.h>
#include <utility>
#include <vector>
#include <type_traits>  // enable_if

#include "xgboost/host_device_vector.h"
#include "xgboost/span.h"

#include "common.h"

#if defined(__CUDACC__)
#include "device_helpers.cuh"
#endif  // defined(__CUDACC__)

namespace xgboost {
namespace common {

constexpr size_t kBlockThreads = 256;

namespace detail {

#if defined(__CUDACC__)
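// Kernel body is a grid-stride loop, so a fixed launch configuration can
// cover ranges larger than gridDim.x * blockDim.x.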
template <typename Functor, typename... SpanType>
__global__ void LaunchCUDAKernel(Functor _func, Range _range,
                                 SpanType... _spans) {
  for (auto i : dh::GridStrideRange(*_range.begin(), *_range.end())) {
    _func(i, _spans...);
  }
}
#endif  // defined(__CUDACC__)

}  // namespace detail

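/*! \brief Do Transformation on HostDeviceVectors.
 *
 *  \tparam CompiledWithCuda A Boolean template parameter, defaulting to
 *          WITH_CUDA(), that selects between the CUDA and CPU-only launch
 *          paths at compile time; users normally leave it at its default.
 */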
template <bool CompiledWithCuda = WITH_CUDA()>
class Transform {
 private:
  template <typename Functor>
  struct Evaluator {
   public:
    Evaluator(Functor func, Range range, int device, bool shard) :
        func_(func), range_{std::move(range)},
        shard_{shard},
        device_{device} {}

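    /*!
     * \brief Evaluate the functor over the given HostDeviceVector pointers,
     *        dispatching to the device path when device_ is non-negative
     *        and to the OpenMP CPU path otherwise.
     */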
    template <typename... HDV>
    void Eval(HDV... vectors) const {
      bool on_device = device_ >= 0;

      if (on_device) {
        LaunchCUDA(func_, vectors...);
      } else {
        LaunchCPU(func_, vectors...);
      }
    }

   private:
    // Unpack a HostDeviceVector into a Span over its device memory.
    template <typename T>
    Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
      auto span = _vec->DeviceSpan();
      return span;
    }
    template <typename T>
    Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
      auto span = _vec->ConstDeviceSpan();
      return span;
    }
    // Unpack a HostDeviceVector into a Span over its host memory.
    template <typename T>
    Span<T> UnpackHDV(HostDeviceVector<T>* _vec) const {
      return Span<T> {_vec->HostPointer(),
            static_cast<typename Span<T>::index_type>(_vec->Size())};
    }
    template <typename T>
    Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec) const {
      return Span<T const> {_vec->ConstHostPointer(),
            static_cast<typename Span<T>::index_type>(_vec->Size())};
    }
    // Recursively set the target device on every input vector for sharding.
    template <typename T>
    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
      vector->SetDevice(device);
    }
    template <typename Head, typename... Rest>
    void UnpackShard(int device,
                     const HostDeviceVector<Head> *_vector,
                     const HostDeviceVector<Rest> *... _vectors) const {
      _vector->SetDevice(device);
      UnpackShard(device, _vectors...);
    }

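    // SFINAE on CompiledWithCuda selects exactly one LaunchCUDA overload per
    // build: the real kernel launch under __CUDACC__, or the LOG(FATAL) stub
    // below in CPU-only builds.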
#if defined(__CUDACC__)
    template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
              typename... HDV>
    void LaunchCUDA(Functor _func, HDV*... _vectors) const {
      if (shard_) {
        UnpackShard(device_, _vectors...);
      }

      size_t range_size = *range_.end() - *range_.begin();

      // Extract the index range to cope with a possibly old OpenMP.
      // This handles cases like the multi-class setting, where a
      // granularity factor is applied to the data vector.
      size_t shard_size = range_size;
      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
      dh::safe_cuda(cudaSetDevice(device_));
      const int kGrids =
          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
      if (kGrids == 0) {
        return;
      }
      detail::LaunchCUDAKernel<<<kGrids, kBlockThreads>>>(  // NOLINT
          _func, shard_range, UnpackHDVOnDevice(_vectors)...);
    }
#else

    template <typename std::enable_if<!CompiledWithCuda>::type* = nullptr,
              typename... HDV>
    void LaunchCUDA(Functor _func, HDV*... _vectors) const {
      LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
    }
#endif  // defined(__CUDACC__)

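    // Plain OpenMP loop on the host. dmlc::OMPException captures any
    // exception thrown inside the parallel region and rethrows it after the
    // loop, since exceptions must not escape an OpenMP construct.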
    template <typename... HDV>
    void LaunchCPU(Functor func, HDV*... vectors) const {
      omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
      dmlc::OMPException omp_exc;
#pragma omp parallel for schedule(static)
      for (omp_ulong idx = 0; idx < end; ++idx) {
        omp_exc.Run(func, idx, UnpackHDV(vectors)...);
      }
      omp_exc.Rethrow();
    }

   private:
    /*! \brief Callable object. */
    Functor func_;
    /*! \brief Range of element indices to iterate over. */
    Range range_;
    /*! \brief Whether sharding onto the target device is required. */
    bool shard_;
    int device_;
  };

 public:
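  /*!
   * \brief Initialize a Transform object.
   *
   * \param func   A callable object taking a size_t index followed by one
   *               Span per input HostDeviceVector.
   * \param range  Range of element indices to iterate over.
   * \param device Device ordinal; a negative value selects the CPU path.
   * \param shard  Whether the input vectors should be placed on the
   *               specified device before evaluation.
   * \return An Evaluator exposing a single method, Eval.
   */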
  template <typename Functor>
  static Evaluator<Functor> Init(Functor func, Range const range,
                                 int device,
                                 bool const shard = true) {
    return Evaluator<Functor> {func, std::move(range), device, shard};
  }
};

}  // namespace common
}  // namespace xgboost

#endif  // XGBOOST_COMMON_TRANSFORM_H_
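
A minimal usage sketch (illustrative only, not part of the header): doubling every element of a HostDeviceVector<float>. The helper name Scale is hypothetical; the lambda is marked XGBOOST_DEVICE so the same functor compiles for both the CUDA kernel and the OpenMP loop, and passing a negative device ordinal selects the CPU path.

#include "xgboost/host_device_vector.h"
#include "xgboost/span.h"
#include "transform.h"

// Hypothetical helper: double each element on GPU `device`, or on CPU (-1).
void Scale(xgboost::HostDeviceVector<float>* data, int device) {
  using xgboost::common::Range;
  auto n = static_cast<Range::DifferenceType>(data->Size());
  xgboost::common::Transform<>::Init(
      [] XGBOOST_DEVICE(size_t i, xgboost::common::Span<float> d) {
        d[i] *= 2.0f;  // element-wise transform
      },
      Range{0, n}, device)
      .Eval(data);
}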