xgboost
|
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix
including normal DMatrix
, which is a CSR matrix, QuantileDMatrix
, which is used by histogram-based tree methods for saving memory, and lastly the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
More...
Modules | |
Streaming | |
Quantile DMatrix and external memory DMatrix can be created from batches of data. | |
Functions | |
int | XGDMatrixCreateFromFile (const char *fname, int silent, DMatrixHandle *out) |
load a data matrix More... | |
int | XGDMatrixCreateFromCSREx (const size_t *indptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_col, DMatrixHandle *out) |
create a matrix content from CSR format More... | |
int | XGDMatrixCreateFromCSR (char const *indptr, char const *indices, char const *data, bst_ulong ncol, char const *config, DMatrixHandle *out) |
Create a matrix from CSR matrix. More... | |
int | XGDMatrixCreateFromDense (char const *data, char const *config, DMatrixHandle *out) |
Create a matrix from dense array. More... | |
int | XGDMatrixCreateFromCSCEx (const size_t *col_ptr, const unsigned *indices, const float *data, size_t nindptr, size_t nelem, size_t num_row, DMatrixHandle *out) |
create a matrix content from CSC format More... | |
int | XGDMatrixCreateFromMat (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out) |
create matrix content from dense matrix More... | |
int | XGDMatrixCreateFromMat_omp (const float *data, bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread) |
create matrix content from dense matrix More... | |
int | XGDMatrixCreateFromDT (void **data, const char **feature_stypes, bst_ulong nrow, bst_ulong ncol, DMatrixHandle *out, int nthread) |
create matrix content from python data table More... | |
int | XGDMatrixCreateFromCudaColumnar (char const *data, char const *config, DMatrixHandle *out) |
Create DMatrix from CUDA columnar format. (cuDF) More... | |
int | XGDMatrixCreateFromCudaArrayInterface (char const *data, char const *config, DMatrixHandle *out) |
Create DMatrix from CUDA array. More... | |
int | XGImportArrowRecordBatch (DataIterHandle data_handle, void *ptr_array, void *ptr_schema) |
int | XGDMatrixCreateFromArrowCallback (XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out) |
Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future. More... | |
int | XGDMatrixSliceDMatrix (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out) |
create a new dmatrix from sliced content of existing matrix More... | |
int | XGDMatrixSliceDMatrixEx (DMatrixHandle handle, const int *idxset, bst_ulong len, DMatrixHandle *out, int allow_groups) |
create a new dmatrix from sliced content of existing matrix More... | |
int | XGDMatrixFree (DMatrixHandle handle) |
free space in data matrix More... | |
int | XGDMatrixSaveBinary (DMatrixHandle handle, const char *fname, int silent) |
save a data matrix into a binary file More... | |
int | XGDMatrixSetInfoFromInterface (DMatrixHandle handle, char const *field, char const *c_interface_str) |
Set content in array interface to a content in info. More... | |
int | XGDMatrixSetFloatInfo (DMatrixHandle handle, const char *field, const float *array, bst_ulong len) |
set float vector to a content in info More... | |
int | XGDMatrixSetUIntInfo (DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len) |
set uint32 vector to a content in info More... | |
int | XGDMatrixSetStrFeatureInfo (DMatrixHandle handle, const char *field, const char **features, const bst_ulong size) |
Set string encoded information of all features. More... | |
int | XGDMatrixGetStrFeatureInfo (DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features) |
Get string encoded information of all features. More... | |
int | XGDMatrixSetDenseInfo (DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type) |
Set meta info from dense matrix. Valid field names are: More... | |
int | XGDMatrixSetGroup (DMatrixHandle handle, const unsigned *group, bst_ulong len) |
(deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix More... | |
int | XGDMatrixGetFloatInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr) |
get float info vector from matrix. More... | |
int | XGDMatrixGetUIntInfo (const DMatrixHandle handle, const char *field, bst_ulong *out_len, const unsigned **out_dptr) |
get uint32 info vector from matrix More... | |
int | XGDMatrixNumRow (DMatrixHandle handle, bst_ulong *out) |
get number of rows. More... | |
int | XGDMatrixNumCol (DMatrixHandle handle, bst_ulong *out) |
get number of columns More... | |
int | XGDMatrixNumNonMissing (DMatrixHandle handle, bst_ulong *out) |
Get number of valid values from DMatrix. More... | |
int | XGDMatrixGetDataAsCSR (DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data) |
Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead. More... | |
DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for training, prediction, and explanation. There are a few variants of DMatrix
including normal DMatrix
, which is a CSR matrix, QuantileDMatrix
, which is used by histogram-based tree methods for saving memory, and lastly the experimental external-memory-based DMatrix, which reads data in batches during training. For the last two variants, see the Streaming group.
int XGDMatrixCreateFromArrowCallback | ( | XGDMatrixCallbackNext * | next, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Construct DMatrix from arrow using callbacks. Arrow related C API is not stable and subject to change in the future.
next | Callback function for fetching arrow records. |
config | JSON encoded configuration. Required values are:
|
out | The created DMatrix. |
int XGDMatrixCreateFromCSCEx | ( | const size_t * | col_ptr, |
const unsigned * | indices, | ||
const float * | data, | ||
size_t | nindptr, | ||
size_t | nelem, | ||
size_t | num_row, | ||
DMatrixHandle * | out | ||
) |
create a matrix content from CSC format
col_ptr | pointer to col headers |
indices | findex |
data | fvalue |
nindptr | number of columns in the matrix + 1 (length of col_ptr) |
nelem | number of nonzero elements in the matrix |
num_row | number of rows; when it's set to 0, then guess from data |
out | created dmatrix |
int XGDMatrixCreateFromCSR | ( | char const * | indptr, |
char const * | indices, | ||
char const * | data, | ||
bst_ulong | ncol, | ||
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a matrix from CSR matrix.
indptr | JSON encoded array_interface to row pointers in CSR. |
indices | JSON encoded array_interface to column indices in CSR. |
data | JSON encoded array_interface to values in CSR. |
ncol | Number of columns. |
config | JSON encoded configuration. Required values are:
|
out | created dmatrix |
int XGDMatrixCreateFromCSREx | ( | const size_t * | indptr, |
const unsigned * | indices, | ||
const float * | data, | ||
size_t | nindptr, | ||
size_t | nelem, | ||
size_t | num_col, | ||
DMatrixHandle * | out | ||
) |
create a matrix content from CSR format
indptr | pointer to row headers |
indices | findex |
data | fvalue |
nindptr | number of rows in the matrix + 1 |
nelem | number of nonzero elements in the matrix |
num_col | number of columns; when it's set to kAdapterUnknownSize, then guess from data |
out | created dmatrix |
int XGDMatrixCreateFromCudaArrayInterface | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create DMatrix from CUDA array.
data | JSON encoded cuda_array_interface for array data. |
config | JSON encoded configuration. Required values are:
|
out | created dmatrix |
int XGDMatrixCreateFromCudaColumnar | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create DMatrix from CUDA columnar format. (cuDF)
data | Array of JSON encoded cuda_array_interface for each column. |
config | JSON encoded configuration. Required values are:
|
out | created dmatrix |
int XGDMatrixCreateFromDense | ( | char const * | data, |
char const * | config, | ||
DMatrixHandle * | out | ||
) |
Create a matrix from dense array.
data | JSON encoded array_interface to array values. |
config | JSON encoded configuration. Required values are:
|
out | created dmatrix |
int XGDMatrixCreateFromDT | ( | void ** | data, |
const char ** | feature_stypes, | ||
bst_ulong | nrow, | ||
bst_ulong | ncol, | ||
DMatrixHandle * | out, | ||
int | nthread | ||
) |
create matrix content from python data table
data | pointer to pointer to column data |
feature_stypes | pointer to strings |
nrow | number of rows |
ncol | number of columns |
out | created dmatrix |
nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
int XGDMatrixCreateFromFile | ( | const char * | fname, |
int | silent, | ||
DMatrixHandle * | out | ||
) |
load a data matrix
fname | the name of the file |
silent | whether to print messages during loading |
out | a loaded data matrix |
int XGDMatrixCreateFromMat | ( | const float * | data, |
bst_ulong | nrow, | ||
bst_ulong | ncol, | ||
float | missing, | ||
DMatrixHandle * | out | ||
) |
create matrix content from dense matrix
data | pointer to the data space |
nrow | number of rows |
ncol | number of columns |
missing | the value used to represent missing entries |
out | created dmatrix |
int XGDMatrixCreateFromMat_omp | ( | const float * | data, |
bst_ulong | nrow, | ||
bst_ulong | ncol, | ||
float | missing, | ||
DMatrixHandle * | out, | ||
int | nthread | ||
) |
create matrix content from dense matrix
data | pointer to the data space |
nrow | number of rows |
ncol | number of columns |
missing | the value used to represent missing entries |
out | created dmatrix |
nthread | number of threads (up to maximum cores available, if <=0 use all cores) |
int XGDMatrixFree | ( | DMatrixHandle | handle | ) |
free space in data matrix
int XGDMatrixGetDataAsCSR | ( | DMatrixHandle const | handle, |
char const * | config, | ||
bst_ulong * | out_indptr, | ||
unsigned * | out_indices, | ||
float * | out_data | ||
) |
Get the predictors from DMatrix as CSR matrix for testing. If this is a quantized DMatrix, quantized values are returned instead.
Unlike most XGBoost C functions, the caller of XGDMatrixGetDataAsCSR
is required to allocate the memory for the return buffers instead of using thread local memory from XGBoost. This is to avoid allocating a huge memory buffer that cannot be freed until the thread exits.
handle | the handle to the DMatrix |
config | JSON configuration string. At the moment it should be an empty document; it is reserved for future use. |
out_indptr | indptr of output CSR matrix. |
out_indices | Column index of output CSR matrix. |
out_data | Data value of CSR matrix. |
int XGDMatrixGetFloatInfo | ( | const DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | out_len, | ||
const float ** | out_dptr | ||
) |
get float info vector from matrix.
handle | an instance of data matrix |
field | field name |
out_len | used to set result length |
out_dptr | pointer to the result |
int XGDMatrixGetStrFeatureInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | size, | ||
const char *** | out_features | ||
) |
Get string encoded information of all features.
Accepted fields are:
The caller is responsible for copying out the data before the next call to any API function of XGBoost.
handle | An instance of data matrix |
field | Field name |
size | Size of output pointer features (number of strings returned). |
out_features | Address of a pointer to array of strings. Result is stored in thread local memory. |
int XGDMatrixGetUIntInfo | ( | const DMatrixHandle | handle, |
const char * | field, | ||
bst_ulong * | out_len, | ||
const unsigned ** | out_dptr | ||
) |
get uint32 info vector from matrix
handle | an instance of data matrix |
field | field name |
out_len | The length of the field. |
out_dptr | pointer to the result |
int XGDMatrixNumCol | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
get number of columns
handle | the handle to the DMatrix |
out | The output of number of columns |
int XGDMatrixNumNonMissing | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
Get number of valid values from DMatrix.
handle | the handle to the DMatrix |
out | The output of number of non-missing values |
int XGDMatrixNumRow | ( | DMatrixHandle | handle, |
bst_ulong * | out | ||
) |
get number of rows.
handle | the handle to the DMatrix |
out | The address to hold number of rows. |
int XGDMatrixSaveBinary | ( | DMatrixHandle | handle, |
const char * | fname, | ||
int | silent | ||
) |
save a data matrix into a binary file
handle | an instance of data matrix |
fname | file name |
silent | print statistics when saving |
int XGDMatrixSetDenseInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
void const * | data, | ||
bst_ulong | size, | ||
int | type | ||
) |
Set meta info from dense matrix. Valid field names are:
handle | An instance of data matrix |
field | Field name |
data | Pointer to consecutive memory storing data. |
size | Size of the data, this is relative to size of type. (Meaning NOT number of bytes.) |
type | Indicator of data type. This is defined in xgboost::DataType enum class.
|
int XGDMatrixSetFloatInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const float * | array, | ||
bst_ulong | len | ||
) |
set float vector to a content in info
handle | an instance of data matrix |
field | field name, can be label, weight |
array | pointer to float vector |
len | length of array |
int XGDMatrixSetGroup | ( | DMatrixHandle | handle, |
const unsigned * | group, | ||
bst_ulong | len | ||
) |
(deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
handle | an instance of data matrix |
group | pointer to group size |
len | length of array |
int XGDMatrixSetInfoFromInterface | ( | DMatrixHandle | handle, |
char const * | field, | ||
char const * | c_interface_str | ||
) |
Set content in array interface to a content in info.
handle | an instance of data matrix |
field | field name. |
c_interface_str | JSON string representation of array interface. |
int XGDMatrixSetStrFeatureInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const char ** | features, | ||
const bst_ulong | size | ||
) |
Set string encoded information of all features.
Accepted fields are:
handle | An instance of data matrix |
field | Field name |
features | Pointer to array of strings. |
size | Size of features pointer (number of strings passed in). |
int XGDMatrixSetUIntInfo | ( | DMatrixHandle | handle, |
const char * | field, | ||
const unsigned * | array, | ||
bst_ulong | len | ||
) |
set uint32 vector to a content in info
handle | an instance of data matrix |
field | field name |
array | pointer to unsigned int vector |
len | length of array |
int XGDMatrixSliceDMatrix | ( | DMatrixHandle | handle, |
const int * | idxset, | ||
bst_ulong | len, | ||
DMatrixHandle * | out | ||
) |
create a new dmatrix from sliced content of existing matrix
handle | instance of data matrix to be sliced |
idxset | index set |
len | length of index set |
out | a sliced new matrix |
int XGDMatrixSliceDMatrixEx | ( | DMatrixHandle | handle, |
const int * | idxset, | ||
bst_ulong | len, | ||
DMatrixHandle * | out, | ||
int | allow_groups | ||
) |
create a new dmatrix from sliced content of existing matrix
handle | instance of data matrix to be sliced |
idxset | index set |
len | length of index set |
out | a sliced new matrix |
allow_groups | allow slicing of an array with groups |
int XGImportArrowRecordBatch | ( | DataIterHandle | data_handle, |
void * | ptr_array, | ||
void * | ptr_schema | ||
) |