xgboost
external_memory.c
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xgboost/c_api.h>
/*
 * Abort the program when an XGBoost C API call reports failure.
 *
 * `err` is the return code of the call; any non-zero value is an error. On
 * error the source location, the stringified call expression and the message
 * from XGBGetLastError() are printed to stderr and the process exits with
 * status 1.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and stays correct inside an unbraced if/else.
 */
#define safe_xgboost(err) \
  do { \
    if ((err) != 0) { \
      fprintf(stderr, "%s:%d: error in %s: %s\n", __FILE__, __LINE__, #err, \
              XGBGetLastError()); \
      exit(1); \
    } \
  } while (0)
/* Number of batches the example iterator serves; main() passes it to
 * DataIterator_Init as n_batches. */
#define N_BATCHS 32
/* Number of float values (and labels) in every batch; main() passes it to
 * DataIterator_Init as batch_size. */
#define BATCH_LEN 512
/* Shorthands for the XGBoost handle types. */
typedef DMatrixHandle DMatrix;
typedef BoosterHandle Booster;
/*
 * State of the example batch iterator handed to XGBoost as the opaque
 * DataIterHandle. Filled by DataIterator_Init, released by DataIterator_Free.
 */
typedef struct _DataIter {
/* Array of `n` pointers; data[i] points to lengths[i] float feature values. */
float **data;
/* Array of `n` pointers; labels[i] points to lengths[i] float labels. */
float **labels;
/* Array of `n` entries holding the number of values in each batch. */
size_t *lengths;
/* Total number of batches. */
size_t n;
/* Index of the batch the next DataIterator_Next call will hand out. */
size_t cur_it;
/* Private fields */
/* Proxy DMatrix created in DataIterator_Init and freed in DataIterator_Free. */
DMatrix _proxy;
/* Scratch buffer for the JSON array-interface string built in
 * DataIterator_Next. */
char _array[128];
} DataIter;
/*
 * Abort the program when an allocation failed.
 *
 * `ptr` is the pointer returned by the allocator. If it is NULL, the source
 * location is printed to stderr and the process exits with status 1.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and stays correct inside an unbraced if/else.
 */
#define safe_malloc(ptr) \
  do { \
    if ((ptr) == NULL) { \
      fprintf(stderr, "%s:%d: Failed to allocate memory.\n", __FILE__, \
              __LINE__); \
      exit(1); \
    } \
  } while (0)
/*
 * Fill `self` with `n_batches` batches of random data and create its proxy
 * DMatrix.
 *
 * Every batch holds `batch_size` float values and the same number of float
 * labels, each drawn as rand() / RAND_MAX. Allocation failures and XGBoost
 * errors terminate the process via safe_malloc / safe_xgboost. The buffers
 * are owned by `self` and released by DataIterator_Free.
 */
void DataIterator_Init(DataIter *self, size_t batch_size, size_t n_batches) {
  self->n = n_batches;

  /* All batches share the same length in this example. */
  self->lengths = malloc(self->n * sizeof *self->lengths);
  safe_malloc(self->lengths);
  for (size_t b = 0; b < self->n; ++b) {
    self->lengths[b] = batch_size;
  }

  /* Per-batch pointer tables for features and labels. */
  self->data = malloc(self->n * sizeof *self->data);
  safe_malloc(self->data);
  self->labels = malloc(self->n * sizeof *self->labels);
  safe_malloc(self->labels);

  /* Generate some random data: features first, then labels, batch by batch. */
  for (size_t b = 0; b < self->n; ++b) {
    const size_t len = self->lengths[b];

    float *features = malloc(len * sizeof *features);
    self->data[b] = features;
    safe_malloc(features);
    for (size_t k = 0; k < len; ++k) {
      features[k] = (float)rand() / (float)(RAND_MAX);
    }

    float *targets = malloc(len * sizeof *targets);
    self->labels[b] = targets;
    safe_malloc(targets);
    for (size_t k = 0; k < len; ++k) {
      targets[k] = (float)rand() / (float)(RAND_MAX);
    }
  }

  /* Start iteration at the first batch. */
  self->cur_it = 0;
  safe_xgboost(XGProxyDMatrixCreate(&self->_proxy));
}
/*
 * Release everything DataIterator_Init allocated for `self`: each batch's
 * data and label buffers, the three per-batch tables, and the proxy DMatrix.
 *
 * `self` itself is not freed (main() keeps it on the stack). A failure of
 * XGDMatrixFree terminates the process via safe_xgboost.
 */
void DataIterator_Free(DataIter *self) {
  for (size_t i = 0; i < self->n; ++i) {
    free(self->data[i]);
    free(self->labels[i]);
  }
  free(self->data);
  free(self->lengths);
  free(self->labels);
  safe_xgboost(XGDMatrixFree(self->_proxy));
}
/*
 * "Next" callback for XGDMatrixCreateFromCallback: stage the current batch on
 * the proxy DMatrix and advance the iterator.
 *
 * `handle` is the DataIter passed to XGDMatrixCreateFromCallback.
 *
 * Returns 1 after a batch has been staged, or 0 once all `n` batches have been
 * consumed (in which case the iterator is rewound to the first batch).
 * Formatting failures and XGBoost errors terminate the process.
 */
int DataIterator_Next(DataIterHandle handle) {
  DataIter *self = (DataIter *)(handle);
  if (self->cur_it == self->n) {
    self->cur_it = 0;
    return 0; /* At end */
  }

  /* A JSON string encoding array interface (standard from numpy): the batch's
   * address, a shape of [length, 1] and a little-endian 4-byte float typestr.
   * The pointer is formatted as uintptr_t and the length as size_t so the
   * conversion specifiers match the argument types on every platform;
   * snprintf bounds the write to the fixed-size buffer. */
  int written = snprintf(
      self->_array, sizeof(self->_array),
      "{\"data\": [%" PRIuPTR ", false], \"shape\":[%zu, 1], \"typestr\": "
      "\"<f4\", \"version\": 3}",
      (uintptr_t)self->data[self->cur_it], self->lengths[self->cur_it]);
  if (written < 0 || (size_t)written >= sizeof(self->_array)) {
    fprintf(stderr, "%s:%d: Failed to format array interface.\n", __FILE__,
            __LINE__);
    exit(1);
  }
  safe_xgboost(XGProxyDMatrixSetDataDense(self->_proxy, self->_array));

  /* The data passed in the iterator must remain valid (not being freed until
   * the next iteration or reset).
   * NOTE(review): the final argument 1 is the `type` code of
   * XGDMatrixSetDenseInfo -- presumably float32; confirm against c_api.h. */
  safe_xgboost(XGDMatrixSetDenseInfo(self->_proxy, "label",
                                     self->labels[self->cur_it],
                                     self->lengths[self->cur_it], 1));
  self->cur_it++;
  return 1; /* Continue. */
}
/*
 * "Reset" callback for XGDMatrixCreateFromCallback: rewind the iterator
 * behind `handle` so the next DataIterator_Next call serves batch 0 again.
 */
void DataIterator_Reset(DataIterHandle handle) {
  ((DataIter *)handle)->cur_it = 0;
}
/*
 * Train a booster on `Xy` for 10 rounds, printing the evaluation string for
 * the training data after every round, then save the model to "model.json".
 *
 * Any XGBoost error terminates the process via safe_xgboost. The booster is
 * created and freed inside this function; `Xy` remains owned by the caller.
 */
void TrainModel(DMatrix Xy) {
  const size_t n_rounds = 10;
  const char *validation_names[1] = {"train"};
  const char *validation_result = NULL;
  DMatrix cache[] = {Xy};
  Booster booster;

  /* Create booster for training. */
  safe_xgboost(XGBoosterCreate(cache, 1, &booster));

  /* Use approx or hist for external memory training. */
  safe_xgboost(XGBoosterSetParam(booster, "tree_method", "hist"));
  safe_xgboost(XGBoosterSetParam(booster, "objective", "reg:squarederror"));

  /* Boosting rounds: update on Xy, then evaluate on the same matrix. */
  size_t i = 0;
  while (i < n_rounds) {
    safe_xgboost(XGBoosterUpdateOneIter(booster, i, Xy));
    safe_xgboost(XGBoosterEvalOneIter(booster, i, cache, validation_names, 1,
                                      &validation_result));
    printf("%s\n", validation_result);
    ++i;
  }

  /* Save the model to a JSON file. */
  safe_xgboost(XGBoosterSaveModel(booster, "model.json"));
  safe_xgboost(XGBoosterFree(booster));
}
/*
 * Entry point of the example: build the batch iterator, create a DMatrix from
 * it through the callback API, train a model on that DMatrix, then release
 * the DMatrix and the iterator. Returns 0 on success; any failure terminates
 * the process with status 1 via the safe_* macros.
 */
int main(void) {
  DataIter iter;
  DataIterator_Init(&iter, BATCH_LEN, N_BATCHS);

  /* Create DMatrix from iterator. During training, some cache files with the
   * prefix "cache-" will be generated in current directory */
  char config[] = "{\"missing\": NaN, \"cache_prefix\": \"cache\"}";
  DMatrix Xy;
  safe_xgboost(XGDMatrixCreateFromCallback(
      &iter, iter._proxy, DataIterator_Reset, DataIterator_Next, config, &Xy));

  TrainModel(Xy);

  /* Free the DMatrix before the iterator whose buffers and proxy back it. */
  safe_xgboost(XGDMatrixFree(Xy));
  DataIterator_Free(&iter);
  return 0;
}
C API of XGBoost, used for interfacing to other languages.
int XGBoosterFree(BoosterHandle handle)
free obj in handle
int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle dmats[], const char *evnames[], bst_ulong len, const char **out_result)
get evaluation statistics for xgboost
int XGBoosterUpdateOneIter(BoosterHandle handle, int iter, DMatrixHandle dtrain)
update the model in one round using dtrain
int XGBoosterSetParam(BoosterHandle handle, const char *name, const char *value)
set parameters
int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out)
create xgboost learner
int XGDMatrixFree(DMatrixHandle handle)
free space in data matrix
int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void const *data, bst_ulong size, int type)
Set meta info from dense matrix. Valid field names are:
void * BoosterHandle
handle to Booster
Definition: c_api.h:52
void * DMatrixHandle
handle to DMatrix
Definition: c_api.h:50
int XGBoosterSaveModel(BoosterHandle handle, const char *fname)
Save model into existing file.
int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out)
Create an external memory DMatrix with data iterator.
int XGProxyDMatrixSetDataDense(DMatrixHandle handle, char const *c_interface_str)
Set data on a DMatrix proxy.
int XGProxyDMatrixCreate(DMatrixHandle *out)
Create a DMatrix proxy for setting data; it can be freed by XGDMatrixFree.
void * DataIterHandle
handle to an external data iterator
Definition: c_api.h:335