xgboost
|
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix, including the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
More...
Modules | |
Streaming | |
Quantile DMatrix and external memory DMatrix can be created from batches of data. | |
Functions | |
int | XGDMatrixCreateFromFile (const char *fname, int silent, DMatrixHandle *out) |
load a data matrix More... | |
int | XGDMatrixCreateFromURI (char const *config, DMatrixHandle *out) |
load a data matrix More... | |
int | XGDMatrixCreateFromCSREx (const size_t *indptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_col, DMatrixHandle *out) |
create a matrix content from CSR format More... | |
int | XGDMatrixCreateFromColumnar (char const *data, char const *config, DMatrixHandle *out) |
Create a DMatrix from columnar data. (table) More... | |
int | XGDMatrixCreateFromCSR (char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out) |
Create a DMatrix from CSR matrix. More... | |
int | XGDMatrixCreateFromDense (char const *data, char const *config, DMatrixHandle *out) |
Create a DMatrix from dense array. More... | |
int | XGDMatrixCreateFromCSC (char const *indptr, char const *indices, char const *data, bst_ulong nrow, char const *config, DMatrixHandle *out) |
Create a DMatrix from a CSC matrix. More... | |
int | XGDMatrixCreateFromCSCEx (const size_t *col_ptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_row, DMatrixHandle *out) |
create a matrix content from CSC format More... | |
int | XGDMatrixCreateFromMat (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out) |
create matrix content from dense matrix More... | |
int | XGDMatrixCreateFromMat_omp (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread) |
create matrix content from dense matrix More... | |
int | XGDMatrixCreateFromCudaColumnar (char const *data, char const *config, DMatrixHandle *out) |
Create DMatrix from CUDA columnar format. (cuDF) More... | |
int | XGDMatrixCreateFromCudaArrayInterface (char const *data, char const *config, DMatrixHandle *out) |
Create DMatrix from CUDA array. More... | |
int | XGDMatrixSliceDMatrix (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out) |
create a new dmatrix from sliced content of existing matrix More... | |
int | XGDMatrixSliceDMatrixEx (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out, int allow_groups) |
create a new dmatrix from sliced content of existing matrix More... | |
int | XGDMatrixFree (DMatrixHandle handle) |
free space in data matrix More... | |
int | XGDMatrixSaveBinary (DMatrixHandle handle, const char *fname, int silent) |
Save the DMatrix object into a file. QuantileDMatrix and external memory DMatrix are not supported. More... | |
int | XGDMatrixSetInfoFromInterface (DMatrixHandle handle, char const *field, char const *data) |
Set content in array interface to a content in info. More... | |
int | XGDMatrixSetFloatInfo (DMatrixHandle handle, const char *field, const float *array, bst_ulong len) |
set float vector to a content in info More... | |
int | XGDMatrixSetUIntInfo (DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len) |
int | XGDMatrixSetStrFeatureInfo (DMatrixHandle handle, const char *field, const char **features, const bst_ulong size) |
Set string encoded information of all features. More... | |
int | XGDMatrixGetStrFeatureInfo (DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features) |
Get string encoded information of all features. More... | |
int | XGDMatrixSetDenseInfo (DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type) |
int | XGDMatrixGetFloatInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr) |
get float info vector from matrix. More... | |
int | XGDMatrixGetUIntInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const unsigned **out_dptr) |
get uint32 info vector from matrix More... | |
int | XGDMatrixNumRow (DMatrixHandle handle, bst_ulong *out) |
get number of rows. More... | |
int | XGDMatrixNumCol (DMatrixHandle handle, bst_ulong *out) |
get number of columns More... | |
int | XGDMatrixNumNonMissing (DMatrixHandle handle, bst_ulong *out) |
Get number of valid values from DMatrix. More... | |
int | XGDMatrixDataSplitMode (DMatrixHandle handle, bst_ulong *out) |
Get the data split mode from DMatrix. More... | |
int | XGDMatrixGetDataAsCSR (DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data) |
Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead. More... | |
int | XGDMatrixGetQuantileCut (DMatrixHandle const handle, char const *config, char const **out_indptr, char const **out_data) |
Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression. More... | |
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix, including the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
int XGDMatrixCreateFromColumnar | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a DMatrix from columnar data. (table)
A special type of input to the DMatrix is the columnar format, which refers to column-based dataframes. XGBoost can accept both numeric data types, like integers and floats, and the categorical type, called "dictionary" in Arrow's terminology. Support for the categorical type was introduced in 3.1.0. The dataframe is represented by a list of array interfaces, with one object for each column.
A categorical type is represented by three buffers: the validity mask, the names of the categories (called the index in most dataframe implementations), and the codes used to represent the categories in the rows. XGBoost consumes a categorical column by accepting two JSON-encoded arrow arrays in a list. The first item in the list is a JSON object of the form {"offsets": IntegerArray, "values": StringArray }, representing the string names as defined by the arrow columnar format. The second item is a masked integer array that stores the categorical codes along with the validity mask.
Numeric inputs are represented the same way as for a dense array.
data | A list of JSON-encoded array interfaces. |
config | See XGDMatrixCreateFromDense for details. |
out | The created DMatrix. |
int XGDMatrixCreateFromCSC | ( | char const * | indptr, |
char const * | indices, | ||
char const * | data, | ||
bst_ulong | nrow, | ||
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a DMatrix from a CSC matrix.
indptr | JSON encoded array_interface to column pointers in CSC. |
indices | JSON encoded array_interface to row indices in CSC. |
data | JSON encoded array_interface to values in CSC. |
nrow | The number of rows in the matrix. |
config | See XGDMatrixCreateFromDense for details. |
out | The created dmatrix. |
int XGDMatrixCreateFromCSCEx | ( | const size_t * | col_ptr, |
const unsigned * | indices, | ||
const float * | data, | ||
size_t | nindptr, | ||
size_t | nelem, | ||
size_t | num_row, | ||
DMatrixHandle * | out | ||
) |
int XGDMatrixCreateFromCSR | ( | char const * | indptr, |
char const * | indices, | ||
char const * | data, | ||
bst_ulong | ncol, | ||
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a DMatrix from CSR matrix.
indptr | JSON encoded array_interface to row pointers in CSR. |
indices | JSON encoded array_interface to column indices in CSR. |
data | JSON encoded array_interface to values in CSR. |
ncol | The number of columns. |
config | See XGDMatrixCreateFromDense for details. |
out | The created dmatrix |
int XGDMatrixCreateFromCSREx | ( | const size_t * | indptr, |
const unsigned * | indices, | ||
const float * | data, | ||
size_t | nindptr, | ||
size_t | nelem, | ||
size_t | num_col, | ||
DMatrixHandle * | out | ||
) |
int XGDMatrixCreateFromCudaArrayInterface | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create DMatrix from CUDA array.
data | JSON encoded cuda_array_interface for array data. |
config | JSON encoded configuration. Required values are:
|
out | created dmatrix |
int XGDMatrixCreateFromCudaColumnar | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create DMatrix from CUDA columnar format. (cuDF)
See XGDMatrixCreateFromColumnar for a brief description of the columnar format.
data | A list of JSON-encoded array interfaces. |
config | See XGDMatrixCreateFromDense for details. |
out | Created dmatrix |
int XGDMatrixCreateFromDense | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a DMatrix from dense array.
The array interface is defined at https://numpy.org/doc/2.1/reference/arrays.interface.html. We encode the interface as a JSON object.
data | JSON encoded array_interface to array values. |
config | JSON encoded configuration. Required values are:
|
out | The created DMatrix |
int XGDMatrixCreateFromFile | ( | const char * | fname, |
int | silent, | ||
DMatrixHandle * | out | ||
) |
load a data matrix
fname | the name of the file |
silent | whether to print messages during loading |
out | a loaded data matrix |
int XGDMatrixCreateFromMat | ( | const float * | data, |
bst_ulong | nrow, | ||
bst_ulong | ncol, | ||
float | missing, | ||
DMatrixHandle * | out | ||
) |
create matrix content from dense matrix
data | pointer to the data space |
nrow | number of rows |
ncol | number of columns |
missing | which value to represent missing value |
out | created dmatrix |
int XGDMatrixCreateFromMat_omp | ( | const float * | data, |
bst_ulong | nrow, | ||
bst_ulong | ncol, | ||
float | missing, | ||
DMatrixHandle * | out, | ||
int | nthread | ||
) |
create matrix content from dense matrix
data | pointer to the data space |
nrow | number of rows |
ncol | number of columns |
missing | which value to represent missing value |
out | created dmatrix |
nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
int XGDMatrixCreateFromURI | ( | char const * | config, |
DMatrixHandle * | out | ||
) |
load a data matrix
config | JSON encoded parameters for DMatrix construction. Accepted fields are:
|
out | a loaded data matrix |
int XGDMatrixDataSplitMode | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
Get the data split mode from DMatrix.
handle | the handle to the DMatrix |
out | The output of the data split mode |
int XGDMatrixFree | ( | DMatrixHandle | handle | ) |
free space in data matrix
int XGDMatrixGetDataAsCSR | ( | DMatrixHandle const | handle, |
char const * | config, | ||
bst_ulong * | out_indptr, | ||
unsigned * | out_indices, | ||
float * | out_data | ||
) |
Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead.
Unlike most XGBoost C functions, the caller of XGDMatrixGetDataAsCSR is required to allocate the memory for the return buffers instead of using thread-local memory from XGBoost. This avoids allocating a huge memory buffer that cannot be freed until the thread exits.
handle | the handle to the DMatrix |
config | JSON configuration string. At the moment it should be an empty document, preserved for future use. |
out_indptr | indptr of output CSR matrix. |
out_indices | Column index of output CSR matrix. |
out_data | Data value of CSR matrix. |
int XGDMatrixGetFloatInfo | ( | const DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | out_len, | ||
const float ** | out_dptr | ||
) |
get float info vector from matrix.
handle | an instance of data matrix |
field | field name |
out_len | used to set result length |
out_dptr | pointer to the result |
int XGDMatrixGetQuantileCut | ( | DMatrixHandle const | handle, |
char const * | config, | ||
char const ** | out_indptr, | ||
char const ** | out_data | ||
) |
Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression.
handle | the handle to the DMatrix |
config | JSON configuration string. At the moment it should be an empty document, preserved for future use. |
out_indptr | indptr of output CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
out_data | Data value of CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
int XGDMatrixGetStrFeatureInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | size, | ||
const char *** | out_features | ||
) |
Get string encoded information of all features.
Accepted fields are:
Caller is responsible for copying out the data, before next call to any API function of XGBoost.
handle | An instance of data matrix |
field | Field name |
size | Size of output pointer features (number of strings returned). |
out_features | Address of a pointer to array of strings. Result is stored in thread local memory. |
int XGDMatrixGetUIntInfo | ( | const DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | out_len, | ||
const unsigned ** | out_dptr | ||
) |
get uint32 info vector from matrix
handle | an instance of data matrix |
field | field name |
out_len | The length of the field. |
out_dptr | pointer to the result |
int XGDMatrixNumCol | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
get number of columns
handle | the handle to the DMatrix |
out | The output of number of columns |
int XGDMatrixNumNonMissing | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
Get number of valid values from DMatrix.
handle | the handle to the DMatrix |
out | The output of number of non-missing values |
int XGDMatrixNumRow | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
get number of rows.
handle | the handle to the DMatrix |
out | The address to hold number of rows. |
int XGDMatrixSaveBinary | ( | DMatrixHandle | handle, |
const char * | fname, | ||
int | silent | ||
) |
Save the DMatrix object into a file. QuantileDMatrix and external memory DMatrix are not supported.
handle | an instance of data matrix |
fname | file name |
silent | print statistics when saving |
int XGDMatrixSetDenseInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
void const * | data, | ||
bst_ulong | size, | ||
int | type | ||
) |
Use XGDMatrixSetInfoFromInterface instead.
int XGDMatrixSetFloatInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const float * | array, | ||
bst_ulong | len | ||
) |
set float vector to a content in info
handle | an instance of data matrix |
field | field name, can be label, weight |
array | pointer to float vector |
len | length of array |
int XGDMatrixSetInfoFromInterface | ( | DMatrixHandle | handle, |
char const * | field, | ||
char const * | data | ||
) |
Set content in array interface to a content in info.
handle | An instance of data matrix |
field | Field name. |
data | JSON encoded array_interface to values in the dense matrix/vector. |
int XGDMatrixSetStrFeatureInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const char ** | features, | ||
const bst_ulong | size | ||
) |
Set string encoded information of all features.
Accepted fields are:
handle | An instance of data matrix |
field | Field name |
features | Pointer to array of strings. |
size | Size of features pointer (number of strings passed in). |
int XGDMatrixSetUIntInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const unsigned * | array, | ||
bst_ulong | len | ||
) |
Use XGDMatrixSetInfoFromInterface instead.
int XGDMatrixSliceDMatrix | ( | DMatrixHandle | handle, |
const int * | idxset, | ||
bst_ulong | len, | ||
DMatrixHandle * | out | ||
) |
create a new dmatrix from sliced content of existing matrix
handle | instance of data matrix to be sliced |
idxset | index set |
len | length of index set |
out | a sliced new matrix |
int XGDMatrixSliceDMatrixEx | ( | DMatrixHandle | handle, |
const int * | idxset, | ||
bst_ulong | len, | ||
DMatrixHandle * | out, | ||
int | allow_groups | ||
) |
create a new dmatrix from sliced content of existing matrix
handle | instance of data matrix to be sliced |
idxset | index set |
len | length of index set |
out | a sliced new matrix |
allow_groups | allow slicing of an array with groups |