xgboost
Modules | Functions
DMatrix

DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix: the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group. More...

Collaboration diagram for DMatrix:

Modules

 Streaming
 Quantile DMatrix and external memory DMatrix can be created from batches of data.
 

Functions

int XGDMatrixCreateFromFile (const char *fname, int silent, DMatrixHandle *out)
 load a data matrix More...
 
int XGDMatrixCreateFromURI (char const *config, DMatrixHandle *out)
 load a data matrix More...
 
int XGDMatrixCreateFromCSREx (const size_t *indptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_col, DMatrixHandle *out)
 create a matrix content from CSR format More...
 
int XGDMatrixCreateFromCSR (char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out)
 Create a matrix from CSR matrix. More...
 
int XGDMatrixCreateFromDense (char const *data, char const *config, DMatrixHandle *out)
 Create a matrix from dense array. More...
 
int XGDMatrixCreateFromCSC (char const *indptr, char const *indices, char const *data, bst_ulong nrow, char const *config, DMatrixHandle *out)
 Create a matrix from a CSC matrix. More...
 
int XGDMatrixCreateFromCSCEx (const size_t *col_ptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_row, DMatrixHandle *out)
 create a matrix content from CSC format More...
 
int XGDMatrixCreateFromMat (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out)
 create matrix content from dense matrix More...
 
int XGDMatrixCreateFromMat_omp (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread)
 create matrix content from dense matrix More...
 
int XGDMatrixCreateFromDT (void **data, const char **feature_stypes, bst_ulong nrow, bst_ulong ncol, DMatrixHandle *out, int nthread)
 create matrix content from python data table More...
 
int XGDMatrixCreateFromCudaColumnar (char const *data, char const *config, DMatrixHandle *out)
 Create DMatrix from CUDA columnar format. (cuDF) More...
 
int XGDMatrixCreateFromCudaArrayInterface (char const *data, char const *config, DMatrixHandle *out)
 Create DMatrix from CUDA array. More...
 
int XGImportArrowRecordBatch (DataIterHandle data_handle, void *ptr_array, void *ptr_schema)
 
int XGDMatrixCreateFromArrowCallback (XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out)
 Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future. More...
 
int XGDMatrixSliceDMatrix (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out)
 create a new dmatrix from sliced content of existing matrix More...
 
int XGDMatrixSliceDMatrixEx (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out, int allow_groups)
 create a new dmatrix from sliced content of existing matrix More...
 
int XGDMatrixFree (DMatrixHandle handle)
 free space in data matrix More...
 
int XGDMatrixSaveBinary (DMatrixHandle handle, const char *fname, int silent)
 save a data matrix into binary file More...
 
int XGDMatrixSetInfoFromInterface (DMatrixHandle handle, char const *field, char const *c_interface_str)
 Set content in array interface to a content in info. More...
 
int XGDMatrixSetFloatInfo (DMatrixHandle handle, const char *field, const float *array, bst_ulong len)
 set float vector to a content in info More...
 
int XGDMatrixSetUIntInfo (DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len)
 set uint32 vector to a content in info More...
 
int XGDMatrixSetStrFeatureInfo (DMatrixHandle handle, const char *field, const char **features, const bst_ulong size)
 Set string encoded information of all features. More...
 
int XGDMatrixGetStrFeatureInfo (DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features)
 Get string encoded information of all features. More...
 
int XGDMatrixSetDenseInfo (DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type)
 Set meta info from dense matrix. Valid field names are: More...
 
int XGDMatrixSetGroup (DMatrixHandle handle, const unsigned *group, bst_ulong len)
 (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix More...
 
int XGDMatrixGetFloatInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr)
 get float info vector from matrix. More...
 
int XGDMatrixGetUIntInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const unsigned **out_dptr)
 get uint32 info vector from matrix More...
 
int XGDMatrixNumRow (DMatrixHandle handle, bst_ulong *out)
 get number of rows. More...
 
int XGDMatrixNumCol (DMatrixHandle handle, bst_ulong *out)
 get number of columns More...
 
int XGDMatrixNumNonMissing (DMatrixHandle handle, bst_ulong *out)
 Get number of valid values from DMatrix. More...
 
int XGDMatrixGetDataAsCSR (DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data)
 Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead. More...
 
int XGDMatrixGetQuantileCut (DMatrixHandle const handle, char const *config, char const **out_indptr, char const **out_data)
 Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression. More...
 

Detailed Description

DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix: the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.

Function Documentation

◆ XGDMatrixCreateFromArrowCallback()

int XGDMatrixCreateFromArrowCallback ( XGDMatrixCallbackNext *  next,
char const *  config,
DMatrixHandle *  out 
)

Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future.

Parameters
nextCallback function for fetching arrow records.
configJSON encoded configuration. Required values are:
  • missing: Which value to represent missing value.
  • nbatch: Number of batches in arrow table.
  • nthread (optional): Number of threads used for initializing DMatrix.
outThe created DMatrix.
Returns
0 when success, -1 when failure happens

◆ XGDMatrixCreateFromCSC()

int XGDMatrixCreateFromCSC ( char const *  indptr,
char const *  indices,
char const *  data,
bst_ulong  nrow,
char const *  config,
DMatrixHandle *  out 
)

Create a matrix from a CSC matrix.

Parameters
indptrJSON encoded array_interface to column pointers in CSC.
indicesJSON encoded array_interface to row indices in CSC.
dataJSON encoded array_interface to values in CSC.
nrownumber of rows in the matrix.
configJSON encoded configuration. Supported values are:
  • missing: Which value to represent missing value.
  • nthread (optional): Number of threads used for initializing DMatrix.
outcreated dmatrix
Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c.

◆ XGDMatrixCreateFromCSCEx()

int XGDMatrixCreateFromCSCEx ( const size_t *  col_ptr,
const unsigned *  indices,
const float *  data,
size_t  nindptr,
size_t  nelem,
size_t  num_row,
DMatrixHandle *  out 
)

create a matrix content from CSC format

Deprecated:
since 2.0.0
See also
XGDMatrixCreateFromCSC()

◆ XGDMatrixCreateFromCSR()

int XGDMatrixCreateFromCSR ( char const *  indptr,
char const *  indices,
char const *  data,
bst_ulong  ncol,
char const *  config,
DMatrixHandle *  out 
)

Create a matrix from CSR matrix.

Parameters
indptrJSON encoded array_interface to row pointers in CSR.
indicesJSON encoded array_interface to column indices in CSR.
dataJSON encoded array_interface to values in CSR.
ncolNumber of columns.
configJSON encoded configuration. Required values are:
  • missing: Which value to represent missing value.
  • nthread (optional): Number of threads used for initializing DMatrix.
outcreated dmatrix
Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c.

◆ XGDMatrixCreateFromCSREx()

int XGDMatrixCreateFromCSREx ( const size_t *  indptr,
const unsigned *  indices,
const float *  data,
size_t  nindptr,
size_t  nelem,
size_t  num_col,
DMatrixHandle *  out 
)

create a matrix content from CSR format

Deprecated:
since 2.0.0
See also
XGDMatrixCreateFromCSR()

◆ XGDMatrixCreateFromCudaArrayInterface()

int XGDMatrixCreateFromCudaArrayInterface ( char const *  data,
char const *  config,
DMatrixHandle *  out 
)

Create DMatrix from CUDA array.

Parameters
dataJSON encoded cuda_array_interface for array data.
configJSON encoded configuration. Required values are:
  • missing: Which value to represent missing value.
  • nthread (optional): Number of threads used for initializing DMatrix.
outcreated dmatrix
Returns
0 when success, -1 when failure happens

◆ XGDMatrixCreateFromCudaColumnar()

int XGDMatrixCreateFromCudaColumnar ( char const *  data,
char const *  config,
DMatrixHandle *  out 
)

Create DMatrix from CUDA columnar format. (cuDF)

Parameters
dataArray of JSON encoded cuda_array_interface for each column.
configJSON encoded configuration. Required values are:
  • missing: Which value to represent missing value.
  • nthread (optional): Number of threads used for initializing DMatrix.
outcreated dmatrix
Returns
0 when success, -1 when failure happens

◆ XGDMatrixCreateFromDense()

int XGDMatrixCreateFromDense ( char const *  data,
char const *  config,
DMatrixHandle *  out 
)

Create a matrix from dense array.

Parameters
dataJSON encoded array_interface to array values.
configJSON encoded configuration. Required values are:
  • missing: Which value to represent missing value.
  • nthread (optional): Number of threads used for initializing DMatrix.
outcreated dmatrix
Returns
0 when success, -1 when failure happens
Examples
inference.c.

◆ XGDMatrixCreateFromDT()

int XGDMatrixCreateFromDT ( void **  data,
const char **  feature_stypes,
bst_ulong  nrow,
bst_ulong  ncol,
DMatrixHandle *  out,
int  nthread 
)

create matrix content from python data table

Parameters
datapointer to pointer to column data
feature_stypespointer to strings
nrownumber of rows
ncolnumber of columns
outcreated dmatrix
nthreadnumber of threads (up to maximum cores available, if <=0 use all cores)
Returns
0 when success, -1 when failure happens

◆ XGDMatrixCreateFromFile()

int XGDMatrixCreateFromFile ( const char *  fname,
int  silent,
DMatrixHandle *  out 
)

load a data matrix

Deprecated:
since 2.0.0
See also
XGDMatrixCreateFromURI()
Parameters
fnamethe name of the file
silentwhether to print messages during loading
outa loaded data matrix
Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c.

◆ XGDMatrixCreateFromMat()

int XGDMatrixCreateFromMat ( const float *  data,
bst_ulong  nrow,
bst_ulong  ncol,
float  missing,
DMatrixHandle *  out 
)

create matrix content from dense matrix

Parameters
datapointer to the data space
nrownumber of rows
ncolnumber of columns
missingwhich value to represent missing value
outcreated dmatrix
Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c.

◆ XGDMatrixCreateFromMat_omp()

int XGDMatrixCreateFromMat_omp ( const float *  data,
bst_ulong  nrow,
bst_ulong  ncol,
float  missing,
DMatrixHandle *  out,
int  nthread 
)

create matrix content from dense matrix

Parameters
datapointer to the data space
nrownumber of rows
ncolnumber of columns
missingwhich value to represent missing value
outcreated dmatrix
nthreadnumber of threads (up to maximum cores available, if <=0 use all cores)
Returns
0 when success, -1 when failure happens

◆ XGDMatrixCreateFromURI()

int XGDMatrixCreateFromURI ( char const *  config,
DMatrixHandle *  out 
)

load a data matrix

Parameters
configJSON encoded parameters for DMatrix construction. Accepted fields are:
  • uri: The URI of the input file. The URI parameter `format` is required when loading text data. See the input format tutorial (/tutorials/input_format) for more info.
  • silent (optional): Whether to print messages during loading. Defaults to true.
  • data_split_mode (optional): Whether to split by row or column. In distributed mode, the file is split accordingly; otherwise this is only an indicator on how the file was split beforehand. Defaults to row.
outa loaded data matrix
Returns
0 when success, -1 when failure happens

◆ XGDMatrixFree()

int XGDMatrixFree ( DMatrixHandle  handle)

free space in data matrix

Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c, external_memory.c, and inference.c.

◆ XGDMatrixGetDataAsCSR()

int XGDMatrixGetDataAsCSR ( DMatrixHandle const  handle,
char const *  config,
bst_ulong *  out_indptr,
unsigned *  out_indices,
float *  out_data 
)

Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead.

Unlike most XGBoost C functions, the caller of XGDMatrixGetDataAsCSR is required to allocate the memory for the return buffers instead of using thread-local memory from XGBoost. This avoids allocating a huge memory buffer that cannot be freed until the thread exits.

Since
1.7.0
Parameters
handlethe handle to the DMatrix
configJSON configuration string. At the moment it should be an empty document, preserved for future use.
out_indptrindptr of output CSR matrix.
out_indicesColumn index of output CSR matrix.
out_dataData value of CSR matrix.
Returns
0 when success, -1 when failure happens

◆ XGDMatrixGetFloatInfo()

int XGDMatrixGetFloatInfo ( const DMatrixHandle  handle,
const char *  field,
bst_ulong *  out_len,
const float **  out_dptr 
)

get float info vector from matrix.

Parameters
handlea instance of data matrix
fieldfield name
out_lenused to set result length
out_dptrpointer to the result
Returns
0 when success, -1 when failure happens
Examples
c-api-demo.c.

◆ XGDMatrixGetQuantileCut()

int XGDMatrixGetQuantileCut ( DMatrixHandle const  handle,
char const *  config,
char const **  out_indptr,
char const **  out_data 
)

Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression.

Since
2.0.0
Parameters
handlethe handle to the DMatrix
configJSON configuration string. At the moment it should be an empty document, preserved for future use.
out_indptrindptr of output CSC matrix represented by a JSON encoded __(cuda_)array_interface__.
out_dataData value of CSC matrix represented by a JSON encoded __(cuda_)array_interface__.

◆ XGDMatrixGetStrFeatureInfo()

int XGDMatrixGetStrFeatureInfo ( DMatrixHandle  handle,
const char *  field,
bst_ulong *  size,
const char ***  out_features 
)

Get string encoded information of all features.

Accepted fields are:

  • feature_name
  • feature_type

Caller is responsible for copying out the data, before next call to any API function of XGBoost.

Parameters
handleAn instance of data matrix
fieldField name
sizeSize of output pointer features (number of strings returned).
out_featuresAddress of a pointer to array of strings. Result is stored in thread local memory.
Returns
0 when success, -1 when failure happens
char const **c_out_features = NULL;
bst_ulong out_size = 0;
// Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
&c_out_features);
for (bst_ulong i = 0; i < out_size; ++i) {
// Here we are simply printing the string. Copy it out if the feature name is
// useful after printing.
printf("feature %lu: %s\n", i, c_out_features[i]);
}
uint64_t bst_ulong
Definition: c_api.h:29
int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features)
Get string encoded information of all features.

◆ XGDMatrixGetUIntInfo()

int XGDMatrixGetUIntInfo ( const DMatrixHandle  handle,
const char *  field,
bst_ulong *  out_len,
const unsigned **  out_dptr 
)

get uint32 info vector from matrix

Parameters
handlea instance of data matrix
fieldfield name
out_lenThe length of the field.
out_dptrpointer to the result
Returns
0 when success, -1 when failure happens

◆ XGDMatrixNumCol()

int XGDMatrixNumCol ( DMatrixHandle  handle,
bst_ulong *  out 
)

get number of columns

Parameters
handlethe handle to the DMatrix
outThe output of number of columns
Returns
0 when success, -1 when failure happens

◆ XGDMatrixNumNonMissing()

int XGDMatrixNumNonMissing ( DMatrixHandle  handle,
bst_ulong *  out 
)

Get number of valid values from DMatrix.

Parameters
handlethe handle to the DMatrix
outThe output of number of non-missing values
Returns
0 when success, -1 when failure happens

◆ XGDMatrixNumRow()

int XGDMatrixNumRow ( DMatrixHandle  handle,
bst_ulong *  out 
)

get number of rows.

Parameters
handlethe handle to the DMatrix
outThe address to hold number of rows.
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSaveBinary()

int XGDMatrixSaveBinary ( DMatrixHandle  handle,
const char *  fname,
int  silent 
)

save a data matrix into binary file

Parameters
handlea instance of data matrix
fnamefile name
silentprint statistics when saving
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSetDenseInfo()

int XGDMatrixSetDenseInfo ( DMatrixHandle  handle,
const char *  field,
void const *  data,
bst_ulong  size,
int  type 
)

Set meta info from dense matrix. Valid field names are:

  • label
  • weight
  • base_margin
  • group
  • label_lower_bound
  • label_upper_bound
  • feature_weights
Parameters
handleAn instance of data matrix
fieldField name
dataPointer to consecutive memory storing data.
sizeSize of the data, measured in number of elements of the given type (NOT the number of bytes).
typeIndicator of data type. This is defined in xgboost::DataType enum class.
  • float = 1
  • double = 2
  • uint32_t = 3
  • uint64_t = 4
Returns
0 when success, -1 when failure happens
Examples
external_memory.c, and inference.c.

◆ XGDMatrixSetFloatInfo()

int XGDMatrixSetFloatInfo ( DMatrixHandle  handle,
const char *  field,
const float *  array,
bst_ulong  len 
)

set float vector to a content in info

Parameters
handlea instance of data matrix
fieldfield name, can be label, weight
arraypointer to float vector
lenlength of array
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSetGroup()

int XGDMatrixSetGroup ( DMatrixHandle  handle,
const unsigned *  group,
bst_ulong  len 
)

(deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix

Parameters
handlea instance of data matrix
grouppointer to group size
lenlength of array
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSetInfoFromInterface()

int XGDMatrixSetInfoFromInterface ( DMatrixHandle  handle,
char const *  field,
char const *  c_interface_str 
)

Set content in array interface to a content in info.

Parameters
handlea instance of data matrix
fieldfield name.
c_interface_strJSON string representation of array interface.
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSetStrFeatureInfo()

int XGDMatrixSetStrFeatureInfo ( DMatrixHandle  handle,
const char *  field,
const char **  features,
const bst_ulong  size 
)

Set string encoded information of all features.

Accepted fields are:

  • feature_name
  • feature_type
Parameters
handleAn instance of data matrix
fieldField name
featuresPointer to array of strings.
sizeSize of features pointer (number of strings passed in).
Returns
0 when success, -1 when failure happens
char const* feat_names [] {"feat_0", "feat_1"};
XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
// i for integer, q for quantitative, c for categorical. Similarly "int" and "float"
// are also recognized.
char const* feat_types [] {"i", "q"};
XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field, const char **features, const bst_ulong size)
Set string encoded information of all features.

◆ XGDMatrixSetUIntInfo()

int XGDMatrixSetUIntInfo ( DMatrixHandle  handle,
const char *  field,
const unsigned *  array,
bst_ulong  len 
)

set uint32 vector to a content in info

Parameters
handlea instance of data matrix
fieldfield name
arraypointer to unsigned int vector
lenlength of array
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSliceDMatrix()

int XGDMatrixSliceDMatrix ( DMatrixHandle  handle,
const int *  idxset,
bst_ulong  len,
DMatrixHandle *  out 
)

create a new dmatrix from sliced content of existing matrix

Parameters
handleinstance of data matrix to be sliced
idxsetindex set
lenlength of index set
outa sliced new matrix
Returns
0 when success, -1 when failure happens

◆ XGDMatrixSliceDMatrixEx()

int XGDMatrixSliceDMatrixEx ( DMatrixHandle  handle,
const int *  idxset,
bst_ulong  len,
DMatrixHandle *  out,
int  allow_groups 
)

create a new dmatrix from sliced content of existing matrix

Parameters
handleinstance of data matrix to be sliced
idxsetindex set
lenlength of index set
outa sliced new matrix
allow_groupsallow slicing of an array with groups
Returns
0 when success, -1 when failure happens

◆ XGImportArrowRecordBatch()

int XGImportArrowRecordBatch ( DataIterHandle  data_handle,
void *  ptr_array,
void *  ptr_schema 
)