|
xgboost
|
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix: the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
More...

Modules | |
| Streaming | |
| Quantile DMatrix and external memory DMatrix can be created from batches of data. | |
Functions | |
| int | XGDMatrixCreateFromFile (const char *fname, int silent, DMatrixHandle *out) |
| load a data matrix More... | |
| int | XGDMatrixCreateFromURI (char const *config, DMatrixHandle *out) |
| load a data matrix More... | |
| int | XGDMatrixCreateFromColumnar (char const *data, char const *config, DMatrixHandle *out) |
| Create a DMatrix from columnar data. (table) More... | |
| int | XGDMatrixCreateFromCSR (char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out) |
| Create a DMatrix from CSR matrix. More... | |
| int | XGDMatrixCreateFromDense (char const *data, char const *config, DMatrixHandle *out) |
| Create a DMatrix from dense array. More... | |
| int | XGDMatrixCreateFromCSC (char const *indptr, char const *indices, char const *data, bst_ulong nrow, char const *config, DMatrixHandle *out) |
| Create a DMatrix from a CSC matrix. More... | |
| int | XGDMatrixCreateFromMat (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out) |
| create matrix content from dense matrix More... | |
| int | XGDMatrixCreateFromMat_omp (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread) |
| create matrix content from dense matrix More... | |
| int | XGDMatrixCreateFromCudaColumnar (char const *data, char const *config, DMatrixHandle *out) |
| Create DMatrix from CUDA columnar format. (cuDF) More... | |
| int | XGDMatrixCreateFromCudaArrayInterface (char const *data, char const *config, DMatrixHandle *out) |
| Create DMatrix from CUDA array. More... | |
| int | XGDMatrixSliceDMatrix (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out) |
| create a new dmatrix from sliced content of existing matrix More... | |
| int | XGDMatrixSliceDMatrixEx (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out, int allow_groups) |
| create a new dmatrix from sliced content of existing matrix More... | |
| int | XGDMatrixFree (DMatrixHandle handle) |
| Free a DMatrix object. More... | |
| int | XGDMatrixSaveBinary (DMatrixHandle handle, const char *fname, int silent) |
| Save the DMatrix object into a file. QuantileDMatrix and external memory DMatrix are not supported. More... | |
| int | XGDMatrixSetInfoFromInterface (DMatrixHandle handle, char const *field, char const *data) |
| Set content in array interface to a content in info. More... | |
| int | XGDMatrixSetFloatInfo (DMatrixHandle handle, const char *field, const float *array, bst_ulong len) |
| set float vector to a content in info More... | |
| int | XGDMatrixSetUIntInfo (DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len) |
| int | XGDMatrixSetStrFeatureInfo (DMatrixHandle handle, const char *field, const char **features, const bst_ulong size) |
| Set string encoded information of all features. More... | |
| int | XGDMatrixGetStrFeatureInfo (DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features) |
| Get string encoded information of all features. More... | |
| int | XGDMatrixGetCategories (DMatrixHandle handle, char const *config, CategoriesHandle *out) |
| Create an opaque handle to the internal category container. More... | |
| int | XGDMatrixGetCategoriesExportToArrow (DMatrixHandle handle, char const *config, CategoriesHandle *out, char const **export_out) |
| Create an opaque handle to the internal container and export it to arrow. More... | |
| int | XGBCategoriesFree (CategoriesHandle handle) |
| Free the opaque handle. More... | |
| int | XGDMatrixSetDenseInfo (DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type) |
| int | XGDMatrixGetInfoRef (DMatrixHandle handle, char const *field, char const **out_array) |
| Get a reference to data like label or weight. More... | |
| int | XGDMatrixGetFloatInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr) |
| get float info vector from matrix. More... | |
| int | XGDMatrixGetUIntInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const unsigned **out_dptr) |
| get uint32 info vector from matrix More... | |
| int | XGDMatrixNumRow (DMatrixHandle handle, bst_ulong *out) |
| Get the number of rows from a DMatrix. More... | |
| int | XGDMatrixNumCol (DMatrixHandle handle, bst_ulong *out) |
| Get the number of columns from a DMatrix. More... | |
| int | XGDMatrixNumNonMissing (DMatrixHandle handle, bst_ulong *out) |
| Get number of valid values from a DMatrix. More... | |
| int | XGDMatrixDataSplitMode (DMatrixHandle handle, bst_ulong *out) |
| Get the data split mode from DMatrix. More... | |
| int | XGDMatrixGetDataAsCSR (DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data) |
| Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead. More... | |
| int | XGDMatrixGetQuantileCut (DMatrixHandle const handle, char const *config, char const **out_indptr, char const **out_data) |
| Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression. More... | |
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix: the normal DMatrix, which is a CSR matrix; QuantileDMatrix, which is used by histogram-based tree methods to save memory; and the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
| int XGBCategoriesFree | ( | CategoriesHandle | handle | ) |
Free the opaque handle.
| handle | An instance of the category container. |
| int XGDMatrixCreateFromColumnar | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a DMatrix from columnar data. (table)
A special type of input to the DMatrix is the columnar format, which refers to column-based dataframes. XGBoost can accept both numeric data types, like integers and floats, and the categorical type, called dictionary in Arrow's terminology. Support for the categorical type was introduced in 3.1.0. The dataframe is represented by a list of array interfaces, with one object for each column.
A categorical type is represented by 3 buffers: the validity mask, the names of the categories (called the index in most dataframe implementations), and the codes used to represent the categories in the rows. XGBoost consumes a categorical column by accepting two JSON-encoded arrow arrays in a list. The first item in the list is a JSON object with {"offsets": IntegerArray, "values": StringArray } representing the string names defined by the arrow columnar format. The second item is a masked integer array that stores the categorical codes along with the validity mask:
Numeric inputs are handled the same way as a dense array.
| data | A list of JSON-encoded array interfaces. |
| config | See XGDMatrixCreateFromDense for details. |
| out | The created DMatrix. |
| int XGDMatrixCreateFromCSC | ( | char const * | indptr, |
| char const * | indices, | ||
| char const * | data, | ||
| bst_ulong | nrow, | ||
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a DMatrix from a CSC matrix.
| indptr | JSON encoded array_interface to column pointers in CSC. |
| indices | JSON encoded array_interface to row indices in CSC. |
| data | JSON encoded array_interface to values in CSC. |
| nrow | The number of rows in the matrix. |
| config | See XGDMatrixCreateFromDense for details. |
| out | The created dmatrix. |
| int XGDMatrixCreateFromCSR | ( | char const * | indptr, |
| char const * | indices, | ||
| char const * | data, | ||
| bst_ulong | ncol, | ||
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a DMatrix from CSR matrix.
| indptr | JSON encoded array_interface to row pointers in CSR. |
| indices | JSON encoded array_interface to column indices in CSR. |
| data | JSON encoded array_interface to values in CSR. |
| ncol | The number of columns. |
| config | See XGDMatrixCreateFromDense for details. |
| out | The created dmatrix |
| int XGDMatrixCreateFromCudaArrayInterface | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create DMatrix from CUDA array.
| data | JSON encoded cuda_array_interface for array data. |
| config | JSON encoded configuration. Required values are:
|
| out | created dmatrix |
| int XGDMatrixCreateFromCudaColumnar | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create DMatrix from CUDA columnar format. (cuDF)
See XGDMatrixCreateFromColumnar for a brief description of the columnar format.
| data | A list of JSON-encoded array interfaces. |
| config | See XGDMatrixCreateFromDense for details. |
| out | Created dmatrix |
| int XGDMatrixCreateFromDense | ( | char const * | data, |
| char const * | config, | ||
| DMatrixHandle * | out | ||
| ) |
Create a DMatrix from dense array.
The array interface is defined at https://numpy.org/doc/2.1/reference/arrays.interface.html. We encode the interface as a JSON object.
| data | JSON encoded array_interface to array values. |
| config | JSON encoded configuration. Required values are:
|
| out | The created DMatrix |
| int XGDMatrixCreateFromFile | ( | const char * | fname, |
| int | silent, | ||
| DMatrixHandle * | out | ||
| ) |
load a data matrix
| fname | the name of the file |
| silent | whether to print messages during loading |
| out | a loaded data matrix |
| int XGDMatrixCreateFromMat | ( | const float * | data, |
| bst_ulong | nrow, | ||
| bst_ulong | ncol, | ||
| float | missing, | ||
| DMatrixHandle * | out | ||
| ) |
create matrix content from dense matrix
| data | pointer to the data space |
| nrow | number of rows |
| ncol | number of columns |
| missing | the value used to represent missing values |
| out | created dmatrix |
| int XGDMatrixCreateFromMat_omp | ( | const float * | data, |
| bst_ulong | nrow, | ||
| bst_ulong | ncol, | ||
| float | missing, | ||
| DMatrixHandle * | out, | ||
| int | nthread | ||
| ) |
create matrix content from dense matrix
| data | pointer to the data space |
| nrow | number of rows |
| ncol | number of columns |
| missing | the value used to represent missing values |
| out | created dmatrix |
| nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
| int XGDMatrixCreateFromURI | ( | char const * | config, |
| DMatrixHandle * | out | ||
| ) |
load a data matrix
| config | JSON encoded parameters for DMatrix construction. Accepted fields are:
|
| out | a loaded data matrix |
| int XGDMatrixDataSplitMode | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
Get the data split mode from DMatrix.
| handle | the handle to the DMatrix |
| out | The output of the data split mode |
| int XGDMatrixFree | ( | DMatrixHandle | handle | ) |
Free a DMatrix object.
| int XGDMatrixGetCategories | ( | DMatrixHandle | handle, |
| char const * | config, | ||
| CategoriesHandle * | out | ||
| ) |
Create an opaque handle to the internal category container.
The container should be freed by XGBCategoriesFree
| handle | An instance of the data matrix. |
| config | Unused, reserved for the future. |
| out | Created handle to the category container. Set to NULL if there's no category. |
| int XGDMatrixGetCategoriesExportToArrow | ( | DMatrixHandle | handle, |
| char const * | config, | ||
| CategoriesHandle * | out, | ||
| char const ** | export_out | ||
| ) |
Create an opaque handle to the internal container and export it to arrow.
The container should be freed by XGBCategoriesFree
| handle | An instance of the data matrix. |
| config | Unused, reserved for the future. |
| out | Created handle to the category container |
| export_out | JSON encoded array of categories, with length equal to the number of features. |
| int XGDMatrixGetDataAsCSR | ( | DMatrixHandle const | handle, |
| char const * | config, | ||
| bst_ulong * | out_indptr, | ||
| unsigned * | out_indices, | ||
| float * | out_data | ||
| ) |
Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead.
Unlike most XGBoost C functions, the caller of XGDMatrixGetDataAsCSR is required to allocate the memory for the return buffers instead of using thread-local memory from XGBoost. This is to avoid allocating a huge memory buffer that cannot be freed until the thread exits.
| handle | the handle to the DMatrix |
| config | JSON configuration string. At the moment it should be an empty document, reserved for future use. |
| out_indptr | indptr of output CSR matrix. |
| out_indices | Column index of output CSR matrix. |
| out_data | Data value of CSR matrix. |
| int XGDMatrixGetFloatInfo | ( | const DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | out_len, | ||
| const float ** | out_dptr | ||
| ) |
get float info vector from matrix.
| handle | an instance of the data matrix |
| field | field name |
| out_len | used to set result length |
| out_dptr | pointer to the result |
| int XGDMatrixGetInfoRef | ( | DMatrixHandle | handle, |
| char const * | field, | ||
| char const ** | out_array | ||
| ) |
Get a reference to data like label or weight.
This method replaces the existing XGDMatrixGetFloatInfo and XGDMatrixGetUIntInfo to support non-vector (like a matrix) output. The output data directly references the internal storage; as a result, it is read-only and the user should copy the data before the next XGBoost call.
| handle | An instance of data matrix |
| field | Field name |
| out_array | JSON encoded __(cuda_)array_interface__ to the output. |
| int XGDMatrixGetQuantileCut | ( | DMatrixHandle const | handle, |
| char const * | config, | ||
| char const ** | out_indptr, | ||
| char const ** | out_data | ||
| ) |
Export the quantile cuts used for training histogram-based models like hist and approx. Useful for model compression.
| handle | the handle to the DMatrix |
| config | JSON configuration string. At the moment it should be an empty document, reserved for future use. |
| out_indptr | indptr of output CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
| out_data | Data value of CSC matrix represented by a JSON encoded __(cuda_)array_interface__. |
| int XGDMatrixGetStrFeatureInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | size, | ||
| const char *** | out_features | ||
| ) |
Get string encoded information of all features.
Accepted fields are:
The caller is responsible for copying out the data before the next call to any XGBoost API function.
| handle | An instance of data matrix |
| field | Field name |
| size | Size of output pointer features (number of strings returned). |
| out_features | Address of a pointer to array of strings. Result is stored in thread local memory. |
| int XGDMatrixGetUIntInfo | ( | const DMatrixHandle | handle, |
| const char * | field, | ||
| bst_ulong * | out_len, | ||
| const unsigned ** | out_dptr | ||
| ) |
get uint32 info vector from matrix
| handle | an instance of the data matrix |
| field | field name |
| out_len | The length of the field. |
| out_dptr | pointer to the result |
| int XGDMatrixNumCol | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
Get the number of columns from a DMatrix.
| handle | the handle to the DMatrix |
| out | The output of number of columns |
| int XGDMatrixNumNonMissing | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
Get number of valid values from a DMatrix.
| handle | the handle to the DMatrix |
| out | The output of number of non-missing values |
| int XGDMatrixNumRow | ( | DMatrixHandle | handle, |
| bst_ulong * | out | ||
| ) |
Get the number of rows from a DMatrix.
| handle | the handle to the DMatrix |
| out | The address to hold number of rows. |
| int XGDMatrixSaveBinary | ( | DMatrixHandle | handle, |
| const char * | fname, | ||
| int | silent | ||
| ) |
Save the DMatrix object into a file. QuantileDMatrix and external memory DMatrix are not supported.
| handle | an instance of the data matrix |
| fname | File name |
| silent | print statistics when saving |
| int XGDMatrixSetDenseInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| void const * | data, | ||
| bst_ulong | size, | ||
| int | type | ||
| ) |
Use XGDMatrixSetInfoFromInterface instead.
| int XGDMatrixSetFloatInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const float * | array, | ||
| bst_ulong | len | ||
| ) |
set float vector to a content in info
| handle | an instance of the data matrix |
| field | field name, can be label, weight |
| array | pointer to float vector |
| len | length of array |
| int XGDMatrixSetInfoFromInterface | ( | DMatrixHandle | handle, |
| char const * | field, | ||
| char const * | data | ||
| ) |
Set content in array interface to a content in info.
| handle | An instance of data matrix |
| field | Field name. |
| data | JSON encoded array_interface to values in the dense matrix/vector. |
| int XGDMatrixSetStrFeatureInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const char ** | features, | ||
| const bst_ulong | size | ||
| ) |
Set string encoded information of all features.
Accepted fields are:
| handle | An instance of data matrix |
| field | Field name |
| features | Pointer to array of strings. |
| size | Size of features pointer (number of strings passed in). |
| int XGDMatrixSetUIntInfo | ( | DMatrixHandle | handle, |
| const char * | field, | ||
| const unsigned * | array, | ||
| bst_ulong | len | ||
| ) |
Use XGDMatrixSetInfoFromInterface instead.
| int XGDMatrixSliceDMatrix | ( | DMatrixHandle | handle, |
| const int * | idxset, | ||
| bst_ulong | len, | ||
| DMatrixHandle * | out | ||
| ) |
create a new dmatrix from sliced content of existing matrix
| handle | instance of data matrix to be sliced |
| idxset | index set |
| len | length of index set |
| out | a sliced new matrix |
| int XGDMatrixSliceDMatrixEx | ( | DMatrixHandle | handle, |
| const int * | idxset, | ||
| bst_ulong | len, | ||
| DMatrixHandle * | out, | ||
| int | allow_groups | ||
| ) |
create a new dmatrix from sliced content of existing matrix
| handle | instance of data matrix to be sliced |
| idxset | index set |
| len | length of index set |
| out | a sliced new matrix |
| allow_groups | allow slicing of an array with groups |