release_3.1.0/dev/linalg_8h_source.html

 #ifndef XGBOOST_LINALG_H_

 #define XGBOOST_LINALG_H_


 #include <dmlc/endian.h>

 #include <xgboost/base.h>

 #include <xgboost/context.h>

 #include <xgboost/host_device_vector.h>

 #include <xgboost/json.h>

 #include <xgboost/span.h>


 #include <algorithm>

 #include <cassert>

 #include <cstddef>  // for size_t

 #include <cstdint>  // for int32_t

 #include <limits>

 #include <string>

 #include <tuple>  // for make_tuple

 #include <type_traits>

 #include <utility>

 #include <vector>


 #if defined(_MSC_VER)

 #include <intrin.h>

 #endif  // defined(_MSC_VER)


 // decouple it from xgboost.

 #ifndef LINALG_HD

 #if defined(__CUDA__) || defined(__NVCC__)

 #define LINALG_HD __host__ __device__

 #else

 #define LINALG_HD

 #endif  // defined (__CUDA__) || defined(__NVCC__)

 #endif  // LINALG_HD


 namespace xgboost::linalg {

 namespace detail {


 struct ArrayInterfaceHandler {

   template <typename T>

   static constexpr char TypeChar() {

     return (std::is_floating_point_v<T>

                 ? 'f'

                 : (std::is_integral_v<T> ? (std::is_signed_v<T> ? 'i' : 'u') : '\0'));

   }

 };


 template <size_t dim, typename S, typename Head, size_t D>

 constexpr size_t Offset(S (&strides)[D], size_t n, Head head) {

   static_assert(dim < D);

   return n + head * strides[dim];

 }


 template <size_t dim, typename S, size_t D, typename Head, typename... Tail>

 constexpr std::enable_if_t<sizeof...(Tail) != 0, size_t> Offset(S (&strides)[D], size_t n,

                                                                 Head head, Tail &&...rest) {

   static_assert(dim < D);

   return Offset<dim + 1>(strides, n + (head * strides[dim]), std::forward<Tail>(rest)...);

 }


 template <int32_t D, bool f_array = false>

 constexpr void CalcStride(size_t const (&shape)[D], size_t (&stride)[D]) {

   if (f_array) {

     stride[0] = 1;

     for (int32_t s = 1; s < D; ++s) {

       stride[s] = shape[s - 1] * stride[s - 1];

     }

   } else {

     stride[D - 1] = 1;

     for (int32_t s = D - 2; s >= 0; --s) {

       stride[s] = shape[s + 1] * stride[s + 1];

     }

   }

 }


 struct AllTag {};


 struct IntTag {};


 template <typename I>

 struct RangeTag {

   I beg;

   I end;

   [[nodiscard]] constexpr size_t Size() const { return end - beg; }

 };


 template <typename T>

 constexpr int32_t CalcSliceDim() {

   return std::is_same_v<T, IntTag> ? 0 : 1;

 }


 template <typename T, typename... S>

 constexpr std::enable_if_t<sizeof...(S) != 0, int32_t> CalcSliceDim() {

   return CalcSliceDim<T>() + CalcSliceDim<S...>();

 }


 template <int32_t D>

 constexpr size_t CalcSize(size_t (&shape)[D]) {

   size_t size = 1;

   for (auto d : shape) {

     size *= d;

   }

   return size;

 }


 template <typename S>

 using RemoveCRType = std::remove_const_t<std::remove_reference_t<S>>;


 template <typename S>

 using IndexToTag = std::conditional_t<std::is_integral_v<RemoveCRType<S>>, IntTag, S>;


 template <int32_t n, typename Fn>

 LINALG_HD constexpr auto UnrollLoop(Fn fn) {

 #if defined __CUDA_ARCH__

 #pragma unroll n

 #endif  // defined __CUDA_ARCH__

   for (int32_t i = 0; i < n; ++i) {

     fn(i);

   }

 }


 template <typename T>

 int32_t NativePopc(T v) {

   int c = 0;

   for (; v != 0; v &= v - 1) c++;

   return c;

 }


 inline LINALG_HD int Popc(uint32_t v) {

 #if defined(__CUDA_ARCH__)

   return __popc(v);

 #elif defined(__GNUC__) || defined(__clang__)

   return __builtin_popcount(v);

 #elif defined(_MSC_VER)

   return __popcnt(v);

 #else

   return NativePopc(v);

 #endif  // compiler

 }


 inline LINALG_HD int Popc(uint64_t v) {

 #if defined(__CUDA_ARCH__)

   return __popcll(v);

 #elif defined(__GNUC__) || defined(__clang__)

   return __builtin_popcountll(v);

 #elif defined(_MSC_VER) && defined(_M_X64)

   return __popcnt64(v);

 #else

   return NativePopc(v);

 #endif  // compiler

 }


 template <std::size_t D, typename Head>

 LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head) {

   static_assert(std::is_integral_v<std::remove_reference_t<Head>>, "Invalid index type.");

   arr[D - 1] = head;

 }


 template <std::size_t D, typename Head, typename... Rest>

 LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head, Rest &&...index) {

   static_assert(sizeof...(Rest) < D, "Index overflow.");

   static_assert(std::is_integral_v<std::remove_reference_t<Head>>, "Invalid index type.");

   arr[D - sizeof...(Rest) - 1] = head;

   IndexToArr(arr, std::forward<Rest>(index)...);

 }


 template <class T, std::size_t N, std::size_t... Idx>

 constexpr auto ArrToTuple(T (&arr)[N], std::index_sequence<Idx...>) {

   return std::make_tuple(arr[Idx]...);

 }


 template <class T, std::size_t N>

 constexpr auto ArrToTuple(T (&arr)[N]) {

   return ArrToTuple(arr, std::make_index_sequence<N>{});

 }


 // uint division optimization inspired by the CIndexer in cupy.  Division operation is

 // slow on both CPU and GPU, especially 64 bit integer.  So here we first try to avoid 64

 // bit when the index is smaller, then try to avoid division when it's exp of 2.

 template <typename I, std::int32_t D>

 LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {

   std::size_t index[D]{0};

   static_assert(std::is_signed_v<decltype(D)>,

                 "Don't change the type without changing the for loop.");

   auto const sptr = shape.data();

   for (int32_t dim = D; --dim > 0;) {

     auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(sptr[dim]);

     if (s & (s - 1)) {

       auto t = idx / s;

       index[dim] = idx - t * s;

       idx = t;

     } else {  // exp of 2

       index[dim] = idx & (s - 1);

       idx >>= Popc(s - 1);

     }

   }

   index[0] = idx;

   return ArrToTuple(index);

 }


 template <size_t dim, typename I, int32_t D>

 void ReshapeImpl(size_t (&out_shape)[D], I s) {

   static_assert(dim < D);

   out_shape[dim] = s;

 }


 template <size_t dim, int32_t D, typename... S, typename I,

           std::enable_if_t<sizeof...(S) != 0> * = nullptr>

 void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) {

   static_assert(dim < D);

   out_shape[dim] = s;

   ReshapeImpl<dim + 1>(out_shape, std::forward<S>(rest)...);

 }


 template <class...>

 struct Conjunction : std::true_type {};

 template <class B1>

 struct Conjunction<B1> : B1 {};

 template <class B1, class... Bn>

 struct Conjunction<B1, Bn...>

     : std::conditional_t<static_cast<bool>(B1::value), Conjunction<Bn...>, B1> {};


 template <typename... Index>

 using IsAllIntegral = Conjunction<std::is_integral<std::remove_reference_t<Index>>...>;


 template <typename... Index>

 using EnableIfIntegral = std::enable_if_t<IsAllIntegral<Index...>::value>;

 }  // namespace detail


 constexpr detail::AllTag All() { return {}; }

 template <typename I>

 constexpr detail::RangeTag<I> Range(I beg, I end) {

   return {beg, end};

 }


 enum Order : std::uint8_t {

   kC,  // Row major

   kF,  // Col major

 };


 template <typename T, int32_t kDim>

 class TensorView {

  public:

   using ShapeT = std::size_t[kDim];

   using StrideT = ShapeT;


   using element_type = T;                  // NOLINT

   using value_type = std::remove_cv_t<T>;  // NOLINT


  private:

   StrideT stride_{1};

   ShapeT shape_{0};

   common::Span<T> data_;

   T *ptr_{nullptr};  // pointer of data_ to avoid bound check.


   size_t size_{0};

   DeviceOrd device_;


   // Unlike `Tensor`, the data_ can have arbitrary size since this is just a view.

   LINALG_HD void CalcSize() {

     if (data_.empty()) {

       size_ = 0;

     } else {

       size_ = detail::CalcSize(shape_);

     }

   }


   template <size_t old_dim, size_t new_dim, int32_t D, typename I>

   LINALG_HD size_t MakeSliceDim(std::size_t new_shape[D], std::size_t new_stride[D],

                                 detail::RangeTag<I> &&range) const {

     static_assert(new_dim < D);

     static_assert(old_dim < kDim);

     new_stride[new_dim] = stride_[old_dim];

     new_shape[new_dim] = range.Size();

     assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);


     auto offset = stride_[old_dim] * range.beg;

     return offset;

   }

   template <size_t old_dim, size_t new_dim, int32_t D, typename I, typename... S>

   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],

                                 detail::RangeTag<I> &&range, S &&...slices) const {

     static_assert(new_dim < D);

     static_assert(old_dim < kDim);

     new_stride[new_dim] = stride_[old_dim];

     new_shape[new_dim] = range.Size();

     assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);


     auto offset = stride_[old_dim] * range.beg;

     return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,

                                                      std::forward<S>(slices)...) +

            offset;

   }


   template <size_t old_dim, size_t new_dim, int32_t D>

   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const {

     static_assert(new_dim < D);

     static_assert(old_dim < kDim);

     new_stride[new_dim] = stride_[old_dim];

     new_shape[new_dim] = shape_[old_dim];

     return 0;

   }

   template <size_t old_dim, size_t new_dim, int32_t D, typename... S>

   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag,

                                 S &&...slices) const {

     static_assert(new_dim < D);

     static_assert(old_dim < kDim);

     new_stride[new_dim] = stride_[old_dim];

     new_shape[new_dim] = shape_[old_dim];

     return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,

                                                      std::forward<S>(slices)...);

   }


   template <size_t old_dim, size_t new_dim, int32_t D, typename Index>

   LINALG_HD size_t MakeSliceDim(DMLC_ATTRIBUTE_UNUSED size_t new_shape[D],

                                 DMLC_ATTRIBUTE_UNUSED size_t new_stride[D], Index i) const {

     static_assert(old_dim < kDim);

     return stride_[old_dim] * i;

   }

   template <size_t old_dim, size_t new_dim, int32_t D, typename Index, typename... S>

   LINALG_HD std::enable_if_t<std::is_integral_v<Index>, size_t> MakeSliceDim(

       size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const {

     static_assert(old_dim < kDim);

     auto offset = stride_[old_dim] * i;

     auto res =

         MakeSliceDim<old_dim + 1, new_dim, D>(new_shape, new_stride, std::forward<S>(slices)...);

     return res + offset;

   }


  public:

   size_t constexpr static kValueSize = sizeof(T);

   size_t constexpr static kDimension = kDim;


  public:

   template <typename I, std::int32_t D>

   LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], DeviceOrd device)

       : TensorView{data, shape, device, Order::kC} {}


   template <typename I, int32_t D>

   LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], DeviceOrd device, Order order)

       : data_{data}, ptr_{data_.data()}, device_{device} {

     static_assert(D > 0 && D <= kDim, "Invalid shape.");

     // shape

     detail::UnrollLoop<D>([&](auto i) { shape_[i] = shape[i]; });

     for (auto i = D; i < kDim; ++i) {

       shape_[i] = 1;

     }

     // stride

     switch (order) {

       case Order::kC: {

         detail::CalcStride(shape_, stride_);

         break;

       }

       case Order::kF: {

         detail::CalcStride<kDim, true>(shape_, stride_);

         break;

       }

       default: {

         SPAN_CHECK(false);

       }

     }

     // size

     this->CalcSize();

   }


   template <typename I, std::int32_t D>

   LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], I const (&stride)[D],

                        DeviceOrd device)

       : data_{data}, ptr_{data_.data()}, device_{device} {

     static_assert(D == kDim, "Invalid shape & stride.");

     detail::UnrollLoop<D>([&](auto i) {

       shape_[i] = shape[i];

       stride_[i] = stride[i];

     });

     this->CalcSize();

   }


   template <

       typename U,

       std::enable_if_t<common::detail::IsAllowedElementTypeConversion<U, T>::value> * = nullptr>

   LINALG_HD TensorView(TensorView<U, kDim> const &that)  // NOLINT

       : data_{that.Values()}, ptr_{data_.data()}, size_{that.Size()}, device_{that.Device()} {

     detail::UnrollLoop<kDim>([&](auto i) {

       stride_[i] = that.Stride(i);

       shape_[i] = that.Shape(i);

     });

   }


   template <typename... Index, detail::EnableIfIntegral<Index...> * = nullptr>

   LINALG_HD T &operator()(Index &&...index) {

     static_assert(sizeof...(index) <= kDim, "Invalid index.");

     size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward<Index>(index)...);

     assert(offset < data_.size() && "Out of bound access.");

     return ptr_[offset];

   }

   template <typename... Index, detail::EnableIfIntegral<Index...> * = nullptr>

   LINALG_HD T const &operator()(Index &&...index) const {

     static_assert(sizeof...(index) <= kDim, "Invalid index.");

     size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward<Index>(index)...);

     assert(offset < data_.size() && "Out of bound access.");

     return ptr_[offset];

   }


   template <typename... S>

   LINALG_HD auto Slice(S &&...slices) const {

     static_assert(sizeof...(slices) <= kDim, "Invalid slice.");

     int32_t constexpr kNewDim{detail::CalcSliceDim<detail::IndexToTag<S>...>()};

     size_t new_shape[kNewDim];

     size_t new_stride[kNewDim];

     auto offset = MakeSliceDim<0, 0, kNewDim>(new_shape, new_stride, std::forward<S>(slices)...);

     // ret is a different type due to changed dimension, so we can not access its private

     // fields.

     TensorView<T, kNewDim> ret{data_.subspan(data_.empty() ? 0 : offset), new_shape, new_stride,

                                device_};

     return ret;

   }


   LINALG_HD auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }

   LINALG_HD auto Shape(size_t i) const { return shape_[i]; }

   LINALG_HD auto Stride() const { return common::Span<size_t const, kDim>{stride_}; }

   LINALG_HD auto Stride(size_t i) const { return stride_[i]; }


   [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }

   [[nodiscard]] bool Empty() const { return Size() == 0; }

   [[nodiscard]] LINALG_HD bool Contiguous() const {

     return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();

   }

   [[nodiscard]] LINALG_HD bool CContiguous() const {

     StrideT stride;

     static_assert(std::is_same_v<decltype(stride), decltype(stride_)>);

     // It's contiguous if the stride can be calculated from shape.

     detail::CalcStride(shape_, stride);

     return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};

   }

   [[nodiscard]] LINALG_HD bool FContiguous() const {

     StrideT stride;

     static_assert(std::is_same_v<decltype(stride), decltype(stride_)>);

     // It's contiguous if the stride can be calculated from shape.

     detail::CalcStride<kDim, true>(shape_, stride);

     return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};

   }

   LINALG_HD auto Values() const -> decltype(data_) const & { return data_; }

   LINALG_HD auto Device() const { return device_; }

 };


 template <typename Container, typename... S,

           std::enable_if_t<!common::detail::IsSpan<Container>::value &&

                            !std::is_pointer_v<Container>> * = nullptr>

 auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) {  // NOLINT

   using T = std::conditional_t<std::is_const_v<Container>,

                                std::add_const_t<typename Container::value_type>,

                                typename Container::value_type>;

   std::size_t in_shape[sizeof...(S)];

   detail::IndexToArr(in_shape, std::forward<S>(shape)...);

   return TensorView<T, sizeof...(S)>{data, in_shape, ctx->Device()};

 }


 template <typename T, decltype(common::dynamic_extent) ext, typename... S>

 LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span<T, ext> data, S &&...shape) {

   std::size_t in_shape[sizeof...(S)];

   detail::IndexToArr(in_shape, std::forward<S>(shape)...);

   return TensorView<T, sizeof...(S)>{data, in_shape, device};

 }


 template <typename T, decltype(common::dynamic_extent) ext, typename... S>

 auto MakeTensorView(Context const *ctx, common::Span<T, ext> data, S &&...shape) {

   return MakeTensorView(ctx->Device(), data, std::forward<S>(shape)...);

 }


 template <typename T, decltype(common::dynamic_extent) ext, typename... S>

 auto MakeTensorView(Context const *ctx, Order order, common::Span<T, ext> data, S &&...shape) {

   std::size_t in_shape[sizeof...(S)];

   detail::IndexToArr(in_shape, std::forward<S>(shape)...);

   return TensorView<T, sizeof...(S)>{data, in_shape, ctx->Device(), order};

 }


 template <typename T, typename... S>

 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {

   auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();

   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);

 }


 template <typename T, typename... S>

 auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {

   auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();

   return MakeTensorView(ctx->Device(), span, std::forward<S>(shape)...);

 }


 template <size_t D>

 LINALG_HD auto UnravelIndex(size_t idx, common::Span<size_t const, D> shape) {

   if (idx > std::numeric_limits<uint32_t>::max()) {

     return detail::UnravelImpl<uint64_t, D>(static_cast<uint64_t>(idx), shape);

   } else {

     return detail::UnravelImpl<uint32_t, D>(static_cast<uint32_t>(idx), shape);

   }

 }


 template <size_t D>

 LINALG_HD auto UnravelIndex(size_t idx, std::size_t const (&shape)[D]) {

   return UnravelIndex(idx, common::Span<std::size_t const, D>(shape));

 }


 template <typename... S>

 LINALG_HD auto UnravelIndex(std::size_t idx, S... shape) {

   std::size_t s[sizeof...(S)];

   detail::IndexToArr(s, shape...);

   return UnravelIndex(idx, common::Span<std::size_t const, sizeof...(S)>(s));

 }


 template <typename T>

 using VectorView = TensorView<T, 1>;


 template <typename T>

 auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {

   return linalg::TensorView<T, 1>{{ptr, s}, {s}, device};

 }


 template <typename T>

 auto MakeVec(HostDeviceVector<T> *data) {

   return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(),

                  data->Size(), data->Device());

 }


 template <typename T>

 auto MakeVec(HostDeviceVector<T> const *data) {

   return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),

                  data->Size(), data->Device());

 }


 template <typename T>

 using MatrixView = TensorView<T, 2>;


 template <typename T, std::int32_t D>

 Json ArrayInterface(TensorView<T const, D> const &t) {

   Json array_interface{Object{}};

   array_interface["data"] = std::vector<Json>(2);

   array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(t.Values().data())};

   array_interface["data"][1] = Boolean{true};

   if (t.Device().IsCUDA()) {

     // Change this once we have different CUDA stream.

     array_interface["stream"] = Integer{2};

   }

   std::vector<Json> shape(t.Shape().size());

   std::vector<Json> stride(t.Stride().size());

   for (size_t i = 0; i < t.Shape().size(); ++i) {

     shape[i] = Integer(t.Shape(i));

     stride[i] = Integer(t.Stride(i) * sizeof(T));

   }

   array_interface["shape"] = Array{shape};

   array_interface["strides"] = Array{stride};

   array_interface["version"] = 3;


   char constexpr kT = detail::ArrayInterfaceHandler::TypeChar<T>();

   static_assert(kT != '\0');

   if (DMLC_LITTLE_ENDIAN) {

     array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))};

   } else {

     array_interface["typestr"] = String{">" + (kT + std::to_string(sizeof(T)))};

   }

   return array_interface;

 }


 template <typename T, int32_t D>

 Json ArrayInterface(TensorView<T, D> const &t) {

   TensorView<T const, D> const &as_const = t;

   auto res = ArrayInterface(as_const);

   res["data"][1] = Boolean{false};

   return res;

 }


 template <typename T, int32_t D>

 auto ArrayInterfaceStr(TensorView<T const, D> const &t) {

   std::string str;

   Json::Dump(ArrayInterface(t), &str);

   return str;

 }


 template <typename T, int32_t D>

 auto ArrayInterfaceStr(TensorView<T, D> const &t) {

   std::string str;

   Json::Dump(ArrayInterface(t), &str);

   return str;

 }


 template <typename T>

 auto Make1dInterface(T const *vec, std::size_t len) {

   Context ctx;

   auto t = linalg::MakeTensorView(&ctx, common::Span{vec, len}, len);

   auto str = linalg::ArrayInterfaceStr(t);

   return str;

 }


 template <typename T, int32_t kDim = 5>

 class Tensor {

  public:

   using ShapeT = std::size_t[kDim];

   using StrideT = ShapeT;


  private:

   HostDeviceVector<T> data_;

   ShapeT shape_{0};

   Order order_{Order::kC};


   template <typename I, std::int32_t D>

   void Initialize(I const (&shape)[D], DeviceOrd device) {

     static_assert(D <= kDim, "Invalid shape.");

     std::copy(shape, shape + D, shape_);

     for (auto i = D; i < kDim; ++i) {

       shape_[i] = 1;

     }

     if (!device.IsCPU()) {

       data_.SetDevice(device);

       data_.ConstDevicePointer();  // Pull to device;

     }

     CHECK_EQ(data_.Size(), detail::CalcSize(shape_));

   }


  public:

   Tensor() = default;


   template <typename I, int32_t D>

   explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)

       : Tensor{common::Span<I const, D>{shape}, device, order} {}


   template <typename I, size_t D>

   explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)

       : order_{order} {

     // No device unroll as this is a host only function.

     std::copy(shape.data(), shape.data() + D, shape_);

     for (auto i = D; i < kDim; ++i) {

       shape_[i] = 1;

     }

     auto size = detail::CalcSize(shape_);

     if (!device.IsCPU()) {

       data_.SetDevice(device);

     }

     data_.Resize(size);

     if (!device.IsCPU()) {

       data_.DevicePointer();  // Pull to device

     }

   }

   template <typename It, typename I, int32_t D>

   explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)

       : order_{order} {

     auto &h_vec = data_.HostVector();

     h_vec.insert(h_vec.begin(), begin, end);

     // shape

     this->Initialize(shape, device);

   }


   template <typename I, int32_t D>

   explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,

                   Order order = kC)

       : order_{order} {

     auto &h_vec = data_.HostVector();

     h_vec = data;

     // shape

     this->Initialize(shape, device);

   }

   template <typename... Index>

   T &operator()(Index &&...idx) {

     return this->HostView()(std::forward<Index>(idx)...);

   }

   template <typename... Index>

   T const &operator()(Index &&...idx) const {

     return this->HostView()(std::forward<Index>(idx)...);

   }


   auto View(DeviceOrd device) {

     if (device.IsCPU()) {

       auto span = data_.HostSpan();

       return TensorView<T, kDim>{span, shape_, device, order_};

     } else {

       data_.SetDevice(device);

       auto span = data_.DeviceSpan();

       return TensorView<T, kDim>{span, shape_, device, order_};

     }

   }

   auto View(DeviceOrd device) const {

     if (device.IsCPU()) {

       auto span = data_.ConstHostSpan();

       return TensorView<T const, kDim>{span, shape_, device, order_};

     } else {

       data_.SetDevice(device);

       auto span = data_.ConstDeviceSpan();

       return TensorView<T const, kDim>{span, shape_, device, order_};

     }

   }


   auto HostView() { return this->View(DeviceOrd::CPU()); }

   auto HostView() const { return this->View(DeviceOrd::CPU()); }


   [[nodiscard]] std::size_t Size() const { return data_.Size(); }

   [[nodiscard]] bool Empty() const { return Size() == 0; }


   auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }

   auto Shape(size_t i) const { return shape_[i]; }


   HostDeviceVector<T> *Data() { return &data_; }

   HostDeviceVector<T> const *Data() const { return &data_; }


   template <typename Fn>

   void ModifyInplace(Fn &&fn) {

     fn(this->Data(), common::Span<size_t, kDim>{this->shape_});

     CHECK_EQ(this->Data()->Size(), detail::CalcSize(this->shape_))

         << "Inconsistent size after modification.";

   }


   template <typename... S, detail::EnableIfIntegral<S...> * = nullptr>

   void Reshape(S &&...s) {

     static_assert(sizeof...(S) <= kDim, "Invalid shape.");

     detail::ReshapeImpl<0>(shape_, std::forward<S>(s)...);

     auto constexpr kEnd = sizeof...(S);

     static_assert(kEnd <= kDim, "Invalid shape.");

     std::fill(shape_ + kEnd, shape_ + kDim, 1);

     auto n = detail::CalcSize(shape_);

     data_.Resize(n);

   }


   template <size_t D>

   void Reshape(common::Span<size_t const, D> shape) {

     static_assert(D <= kDim, "Invalid shape.");

     std::copy(shape.data(), shape.data() + D, this->shape_);

     std::fill(shape_ + D, shape_ + kDim, 1);

     auto n = detail::CalcSize(shape_);

     data_.Resize(n);

   }


   template <size_t D>

   void Reshape(size_t (&shape)[D]) {

     this->Reshape(common::Span<size_t const, D>{shape});

   }

   template <typename... S>

   auto Slice(S &&...slices) const {

     return this->HostView().Slice(std::forward<S>(slices)...);

   }

   template <typename... S>

   auto Slice(S &&...slices) {

     return this->HostView().Slice(std::forward<S>(slices)...);

   }


   void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }

   [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }

 };


 template <typename T>

 using Matrix = Tensor<T, 2>;


 template <typename T>

 using Vector = Tensor<T, 1>;


 template <typename T, typename... Index>

 auto Empty(Context const *ctx, Index &&...index) {

   Tensor<T, sizeof...(Index)> t;

   t.SetDevice(ctx->Device());

   t.Reshape(index...);

   return t;

 }


 template <typename T, typename... Index>

 auto Constant(Context const *ctx, T v, Index &&...index) {

   Tensor<T, sizeof...(Index)> t;

   t.SetDevice(ctx->Device());

   t.Reshape(index...);

   t.Data()->Fill(std::move(v));

   return t;

 }


 template <typename T, typename... Index>

 auto Zeros(Context const *ctx, Index &&...index) {

   return Constant(ctx, static_cast<T>(0), index...);

 }


 // Only first axis is supported for now.

 template <typename T, int32_t D>

 void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {

   if (r.Device().IsCUDA()) {

     l->SetDevice(r.Device());

   }

   l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {

     for (size_t i = 1; i < D; ++i) {

       if (shape[i] == 0) {

         shape[i] = r.Shape(i);

       } else {

         CHECK_EQ(shape[i], r.Shape(i));

       }

     }

     data->Extend(*r.Data());

     shape[0] = l->Shape(0) + r.Shape(0);

   });

 }

 }  // namespace xgboost::linalg


 #if defined(LINALG_HD)

 #undef LINALG_HD

 #endif  // defined(LINALG_HD)

 #endif  // XGBOOST_LINALG_H_

base.h
Defines configuration macros and basic types for xgboost.

xgboost::HostDeviceVector
Definition: host_device_vector.h:87

xgboost::HostDeviceVector::Size
std::size_t Size() const

xgboost::HostDeviceVector::ConstDevicePointer
const T * ConstDevicePointer() const

xgboost::HostDeviceVector::Extend
void Extend(const HostDeviceVector< T > &other)

xgboost::HostDeviceVector::ConstHostSpan
common::Span< T const  > ConstHostSpan() const
Definition: host_device_vector.h:116

xgboost::HostDeviceVector::HostVector
std::vector< T > & HostVector()

xgboost::HostDeviceVector::ConstDeviceSpan
common::Span< const T > ConstDeviceSpan() const

xgboost::HostDeviceVector::HostPointer
T * HostPointer()
Definition: host_device_vector.h:113

xgboost::HostDeviceVector::DeviceSpan
common::Span< T > DeviceSpan()

xgboost::HostDeviceVector::HostSpan
common::Span< T > HostSpan()
Definition: host_device_vector.h:114

xgboost::HostDeviceVector::SetDevice
void SetDevice(DeviceOrd device) const

xgboost::HostDeviceVector::Device
DeviceOrd Device() const

xgboost::HostDeviceVector::Resize
void Resize(std::size_t new_size)

xgboost::HostDeviceVector::DevicePointer
T * DevicePointer()

xgboost::HostDeviceVector::ConstHostPointer
const T * ConstHostPointer() const
Definition: host_device_vector.h:117

xgboost::JsonArray
Definition: json.h:116

xgboost::JsonBoolean
Describes both true and false.
Definition: json.h:350

xgboost::JsonInteger
Definition: json.h:295

xgboost::JsonObject
Definition: json.h:219

xgboost::JsonString
Definition: json.h:89

xgboost::Json
Data structure representing JSON format.
Definition: json.h:392

xgboost::Json::Dump
static void Dump(Json json, std::string *out, std::ios::openmode mode=std::ios::out)
Encode the JSON object. Optional parameter mode for choosing between text and binary (ubjson) output.

xgboost::common::Span
span class implementation, based on ISO++20 span<T>. The interface should be the same.
Definition: span.h:431

xgboost::common::Span::data
constexpr XGBOOST_DEVICE pointer data() const __span_noexcept
Definition: span.h:550

xgboost::common::Span::subspan
XGBOOST_DEVICE auto subspan() const -> Span< element_type, detail::ExtentValue< Extent, Offset, Count >::value >
Definition: span.h:597

xgboost::common::Span::size
constexpr XGBOOST_DEVICE index_type size() const __span_noexcept
Definition: span.h:555

xgboost::common::Span::empty
constexpr XGBOOST_DEVICE bool empty() const __span_noexcept
Definition: span.h:562

xgboost::linalg::TensorView
A tensor view with static type and dimension. It implements indexing and slicing.
Definition: linalg.h:277

xgboost::linalg::TensorView::Size
LINALG_HD std::size_t Size() const
Number of items in the tensor.
Definition: linalg.h:520

xgboost::linalg::TensorView::TensorView
LINALG_HD TensorView(common::Span< T > data, I const (&shape)[D], DeviceOrd device)
Create a tensor with data and shape.
Definition: linalg.h:391

xgboost::linalg::TensorView::value_type
std::remove_cv_t< T > value_type
Definition: linalg.h:283

xgboost::linalg::TensorView::element_type
T element_type
Definition: linalg.h:282

xgboost::linalg::TensorView::CContiguous
LINALG_HD bool CContiguous() const
Whether it's a c-contiguous array.
Definition: linalg.h:531

xgboost::linalg::TensorView::Stride
LINALG_HD auto Stride(size_t i) const
Definition: linalg.h:515

xgboost::linalg::TensorView::TensorView
LINALG_HD TensorView(common::Span< T > data, I const (&shape)[D], DeviceOrd device, Order order)
Definition: linalg.h:395

xgboost::linalg::TensorView::Shape
LINALG_HD auto Shape() const
Definition: linalg.h:506

xgboost::linalg::TensorView::StrideT
ShapeT StrideT
Definition: linalg.h:280

xgboost::linalg::TensorView::kDimension
constexpr static size_t kDimension
Definition: linalg.h:376

xgboost::linalg::TensorView::Stride
LINALG_HD auto Stride() const
Definition: linalg.h:511

xgboost::linalg::TensorView::Slice
LINALG_HD auto Slice(S &&...slices) const
Slice the tensor. The returned tensor has inferred dim and shape. Scalar result is not supported.
Definition: linalg.h:493

xgboost::linalg::TensorView::Values
LINALG_HD auto Values() const -> decltype(data_) const &
Obtain a reference to the raw data.
Definition: linalg.h:551

xgboost::linalg::TensorView::Contiguous
LINALG_HD bool Contiguous() const
Whether this is a contiguous array, both C and F contiguous returns true.
Definition: linalg.h:525

xgboost::linalg::TensorView::Empty
bool Empty() const
Definition: linalg.h:521

xgboost::linalg::TensorView::operator()
LINALG_HD T const  & operator()(Index &&...index) const
Index the tensor to obtain a scalar value.
Definition: linalg.h:472

xgboost::linalg::TensorView::TensorView
LINALG_HD TensorView(TensorView< U, kDim > const &that)
Definition: linalg.h:440

xgboost::linalg::TensorView::operator()
LINALG_HD T & operator()(Index &&...index)
Index the tensor to obtain a scalar value.
Definition: linalg.h:462

xgboost::linalg::TensorView::kValueSize
constexpr static size_t kValueSize
Definition: linalg.h:375

xgboost::linalg::TensorView::FContiguous
LINALG_HD bool FContiguous() const
Whether it's a f-contiguous array.
Definition: linalg.h:541

xgboost::linalg::TensorView::Shape
LINALG_HD auto Shape(size_t i) const
Definition: linalg.h:510

xgboost::linalg::TensorView::TensorView
LINALG_HD TensorView(common::Span< T > data, I const (&shape)[D], I const (&stride)[D], DeviceOrd device)
Create a tensor with data, shape and strides. Don't use this constructor if stride can be calculated ...
Definition: linalg.h:426

xgboost::linalg::TensorView::ShapeT
std::size_t[kDim] ShapeT
Definition: linalg.h:279

xgboost::linalg::TensorView::Device
LINALG_HD auto Device() const
Obtain the CUDA device ordinal.
Definition: linalg.h:555

xgboost::linalg::Tensor
A tensor storage. To use it for other functionality like slicing one needs to obtain a view first....
Definition: linalg.h:745

xgboost::linalg::Tensor::Slice
auto Slice(S &&...slices)
Get a host view on the slice.
Definition: linalg.h:931

xgboost::linalg::Tensor::Empty
bool Empty() const
Definition: linalg.h:865

xgboost::linalg::Tensor::Tensor
Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order=kC)
Definition: linalg.h:803

xgboost::linalg::Tensor::Slice
auto Slice(S &&...slices) const
Get a host view on the slice.
Definition: linalg.h:924

xgboost::linalg::Tensor::Data
HostDeviceVector< T > const  * Data() const
Definition: linalg.h:871

xgboost::linalg::Tensor::Reshape
void Reshape(size_t(&shape)[D])
Definition: linalg.h:917

xgboost::linalg::Tensor::View
auto View(DeviceOrd device) const
Definition: linalg.h:850

xgboost::linalg::Tensor::HostView
auto HostView()
Definition: linalg.h:861

xgboost::linalg::Tensor::Shape
auto Shape(size_t i) const
Definition: linalg.h:868

xgboost::linalg::Tensor::Data
HostDeviceVector< T > * Data()
Definition: linalg.h:870

xgboost::linalg::Tensor::operator()
T & operator()(Index &&...idx)
Index operator. Not thread safe, should not be used in performance critical region....
Definition: linalg.h:825

xgboost::linalg::Tensor::View
auto View(DeviceOrd device)
Get a TensorView for this tensor.
Definition: linalg.h:840

xgboost::linalg::Tensor::Tensor
Tensor(common::Span< I const, D > shape, DeviceOrd device, Order order=kC)
Definition: linalg.h:783

xgboost::linalg::Tensor::Shape
auto Shape() const
Definition: linalg.h:867

xgboost::linalg::Tensor::Tensor
Tensor()=default

xgboost::linalg::Tensor::ModifyInplace
void ModifyInplace(Fn &&fn)
Visitor function for modification that changes shape and data.
Definition: linalg.h:880

xgboost::linalg::Tensor::SetDevice
void SetDevice(DeviceOrd device) const
Set device ordinal for this tensor.
Definition: linalg.h:938

xgboost::linalg::Tensor::Tensor
Tensor(std::initializer_list< T > data, I const (&shape)[D], DeviceOrd device, Order order=kC)
Definition: linalg.h:812

xgboost::linalg::Tensor::Reshape
void Reshape(common::Span< size_t const, D > shape)
Reshape the tensor.
Definition: linalg.h:908

xgboost::linalg::Tensor::Device
DeviceOrd Device() const
Definition: linalg.h:939

xgboost::linalg::Tensor::HostView
auto HostView() const
Definition: linalg.h:862

xgboost::linalg::Tensor::Tensor
Tensor(I const (&shape)[D], DeviceOrd device, Order order=kC)
Create a tensor with shape and device ordinal. The storage is initialized automatically.
Definition: linalg.h:779

xgboost::linalg::Tensor::operator()
T const  & operator()(Index &&...idx) const
Index operator. Not thread safe, should not be used in performance critical region....
Definition: linalg.h:833

xgboost::linalg::Tensor::ShapeT
std::size_t[kDim] ShapeT
Definition: linalg.h:747

xgboost::linalg::Tensor::Reshape
void Reshape(S &&...s)
Reshape the tensor.
Definition: linalg.h:892

xgboost::linalg::Tensor::StrideT
ShapeT StrideT
Definition: linalg.h:748

xgboost::linalg::Tensor::Size
std::size_t Size() const
Definition: linalg.h:864

context.h

host_device_vector.h
A device-and-host vector abstraction layer.

json.h

LINALG_HD
#define LINALG_HD
Definition: linalg.h:36

xgboost::common::dynamic_extent
constexpr std::size_t dynamic_extent
Definition: span.h:150

xgboost::common::Span
Span(std::vector< T > const &) -> Span< T const >

xgboost::linalg::detail::IndexToTag
std::conditional_t< std::is_integral_v< RemoveCRType< S > >, IntTag, S > IndexToTag
Definition: linalg.h:117

xgboost::linalg::detail::UnravelImpl
LINALG_HD auto UnravelImpl(I idx, common::Span< size_t const, D > shape)
Definition: linalg.h:194

xgboost::linalg::detail::ReshapeImpl
void ReshapeImpl(size_t(&out_shape)[D], I s)
Definition: linalg.h:215

xgboost::linalg::detail::Popc
LINALG_HD int Popc(uint32_t v)
Definition: linalg.h:136

xgboost::linalg::detail::RemoveCRType
std::remove_const_t< std::remove_reference_t< S > > RemoveCRType
Definition: linalg.h:114

xgboost::linalg::detail::CalcSliceDim
constexpr int32_t CalcSliceDim()
Calculate the dimension of sliced tensor.
Definition: linalg.h:95

xgboost::linalg::detail::UnrollLoop
constexpr LINALG_HD auto UnrollLoop(Fn fn)
Definition: linalg.h:120

xgboost::linalg::detail::Offset
constexpr size_t Offset(S(&strides)[D], size_t n, Head head)
Definition: linalg.h:53

xgboost::linalg::detail::IndexToArr
LINALG_HD void IndexToArr(std::size_t(&arr)[D], Head head)
Definition: linalg.h:161

xgboost::linalg::detail::CalcStride
constexpr void CalcStride(size_t const (&shape)[D], size_t(&stride)[D])
Definition: linalg.h:66

xgboost::linalg::detail::ArrToTuple
constexpr auto ArrToTuple(T(&arr)[N], std::index_sequence< Idx... >)
Definition: linalg.h:178

xgboost::linalg::detail::NativePopc
int32_t NativePopc(T v)
Definition: linalg.h:130

xgboost::linalg::detail::EnableIfIntegral
std::enable_if_t< IsAllIntegral< Index... >::value > EnableIfIntegral
Definition: linalg.h:243

xgboost::linalg::detail::CalcSize
constexpr size_t CalcSize(size_t(&shape)[D])
Definition: linalg.h:105

xgboost::linalg
Definition: linalg.h:40

xgboost::linalg::Range
constexpr detail::RangeTag< I > Range(I beg, I end)
Specify a range of elements in the axis for slicing.
Definition: linalg.h:254

xgboost::linalg::Make1dInterface
auto Make1dInterface(T const *vec, std::size_t len)
Definition: linalg.h:733

xgboost::linalg::MakeTensorView
auto MakeTensorView(Context const *ctx, Container &data, S &&...shape)
Constructor for automatic type deduction.
Definition: linalg.h:564

xgboost::linalg::ArrayInterfaceStr
auto ArrayInterfaceStr(TensorView< T const, D > const &t)
Return string representation of array interface.
Definition: linalg.h:719

xgboost::linalg::MakeVec
auto MakeVec(T *ptr, size_t s, DeviceOrd device=DeviceOrd::CPU())
Create a vector view from contigious memory.
Definition: linalg.h:644

xgboost::linalg::UnravelIndex
LINALG_HD auto UnravelIndex(size_t idx, common::Span< size_t const, D > shape)
Turns linear index into multi-dimension index. Similar to numpy unravel.
Definition: linalg.h:608

xgboost::linalg::Stack
void Stack(Tensor< T, D > *l, Tensor< T, D > const &r)
Definition: linalg.h:981

xgboost::linalg::Constant
auto Constant(Context const *ctx, T v, Index &&...index)
Create an array with value v.
Definition: linalg.h:963

xgboost::linalg::Zeros
auto Zeros(Context const *ctx, Index &&...index)
Like np.zeros, return a new array of given shape and type, filled with zeros.
Definition: linalg.h:975

xgboost::linalg::Empty
auto Empty(Context const *ctx, Index &&...index)
Create an array without initialization.
Definition: linalg.h:952

xgboost::linalg::All
constexpr detail::AllTag All()
Specify all elements in the axis for slicing.
Definition: linalg.h:249

xgboost::linalg::ArrayInterface
Json ArrayInterface(TensorView< T const, D > const &t)
Array Interface defined by numpy.
Definition: linalg.h:675

xgboost::linalg::Order
Order
Definition: linalg.h:258

xgboost::linalg::kC
@ kC
Definition: linalg.h:259

xgboost::linalg::kF
@ kF
Definition: linalg.h:260

xgboost::Integer
JsonInteger Integer
Definition: json.h:621

span.h

SPAN_CHECK
#define SPAN_CHECK(cond)
Definition: span.h:127

xgboost::Context
Runtime context for XGBoost. Contains information like threads and device.
Definition: context.h:133

xgboost::Context::Device
DeviceOrd Device() const
Get the current device and ordinal.
Definition: context.h:200

xgboost::Context::IsCPU
bool IsCPU() const
Is XGBoost running on CPU?
Definition: context.h:173

xgboost::DeviceOrd
A type for device ordinal. The type is packed into 32-bit for efficient use in viewing types like lin...
Definition: context.h:34

xgboost::DeviceOrd::IsCUDA
bool IsCUDA() const
Definition: context.h:44

xgboost::DeviceOrd::IsCPU
bool IsCPU() const
Definition: context.h:45

xgboost::DeviceOrd::CPU
constexpr static auto CPU()
Constructor for CPU.
Definition: context.h:64

xgboost::linalg::detail::AllTag
Definition: linalg.h:80

xgboost::linalg::detail::ArrayInterfaceHandler
Definition: linalg.h:43

xgboost::linalg::detail::ArrayInterfaceHandler::TypeChar
static constexpr char TypeChar()
Definition: linalg.h:45

xgboost::linalg::detail::Conjunction< B1 >
Definition: linalg.h:234

xgboost::linalg::detail::Conjunction
Definition: linalg.h:232

xgboost::linalg::detail::IntTag
Definition: linalg.h:82

xgboost::linalg::detail::RangeTag
Definition: linalg.h:85

xgboost::linalg::detail::RangeTag::Size
constexpr size_t Size() const
Definition: linalg.h:88

xgboost::linalg::detail::RangeTag::end
I end
Definition: linalg.h:87

xgboost::linalg::detail::RangeTag::beg
I beg
Definition: linalg.h:86