"""
Note: go to the end to download the full example code.

Demo for using cross validation
"""
import os
from typing import Any, Dict, Tuple
import numpy as np
import xgboost as xgb
# Load the training data and set the options shared by the runs below.
# The data file is addressed relative to the directory holding this script;
# the "?format=libsvm" suffix is part of the URI string handed to DMatrix.
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(
    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
)
# Number of boosting rounds used by every run that does not override it.
num_round = 2
param = dict(max_depth=2, eta=1, objective="binary:logistic")
print("running cross validation")
# 5-fold cross validation for `num_round` boosting rounds, tracking "error".
# An EvaluationMonitor with show_stdv=True is attached, so each round is
# printed as
#   [iteration] metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric.
xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={"error"},
    seed=0,
    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
)
print("running cross validation, disable standard deviation display")
# 5-fold cross validation for up to 10 boosting rounds. The monitor is
# created with show_stdv=False, so each round is printed as
#   [iteration] metric_name:mean_value
# An EarlyStopping callback with rounds=3 is attached as well. `rounds` is
# passed by keyword because newer XGBoost releases declare the callback's
# arguments keyword-only; the keyword form is accepted by older ones too.
res = xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=10,
    nfold=5,
    metrics={"error"},
    seed=0,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=False),
        xgb.callback.EarlyStopping(rounds=3),
    ],
)
# Show the evaluation history returned by xgb.cv.
print(res)
print("running cross validation, with preprocessing function")
# Define the preprocessing function.
# It returns the preprocessed training data, test data, and parameters.
# This hook can be used for things such as rescaling weights.
# As an example, we set scale_pos_weight from the training labels.
def fpreproc(
    dtrain: xgb.DMatrix, dtest: xgb.DMatrix, param: Any
) -> Tuple[xgb.DMatrix, xgb.DMatrix, Dict[str, Any]]:
    """Set ``scale_pos_weight`` from the label balance of ``dtrain``.

    The weight is the number of labels equal to 0 divided by the number of
    labels equal to 1, as read from ``dtrain.get_label()``.

    Parameters
    ----------
    dtrain :
        Training data; only its labels are read.
    dtest :
        Test data; passed through untouched.
    param :
        Parameter mapping; ``"scale_pos_weight"`` is written into it in place.

    Returns
    -------
    The same ``dtrain`` and ``dtest`` objects and the updated ``param``.
    """
    fold_labels = dtrain.get_label()
    negatives = float(np.sum(fold_labels == 0))
    positives = np.sum(fold_labels == 1)
    # The caller's mapping is updated directly and handed back.
    param["scale_pos_weight"] = negatives / positives
    return dtrain, dtest, param
# Cross validation with a preprocessing hook: for every fold, that fold's
# dtrain, dtest and param are passed to fpreproc, and whatever fpreproc
# returns is used to produce the results of that fold.
xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={"auc"},
    seed=0,
    fpreproc=fpreproc,
)
###
# Cross validation can also be run with a customized loss function.
# See custom_objective.py for more on custom objectives.
##
print("running cross validation, with customized loss function")
def logregobj(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    """Return the gradient and hessian used as a custom logistic objective.

    ``preds`` is passed through the sigmoid ``1 / (1 + exp(-preds))`` first.
    The gradient is that value minus the labels from ``dtrain.get_label()``,
    and the hessian is that value times one minus itself.
    """
    prob = 1.0 / (1.0 + np.exp(-preds))
    targets = dtrain.get_label()
    return prob - targets, prob * (1.0 - prob)
def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Return the classification error rate as a ``("error", value)`` pair.

    ``preds`` is passed through the sigmoid ``1 / (1 + exp(-preds))`` and a
    row counts as predicted-positive when the result exceeds 0.5. The value
    is the fraction of rows whose label from ``dtrain.get_label()`` differs
    from that prediction.

    Raises
    ------
    ZeroDivisionError
        If ``dtrain`` holds no labels.
    """
    labels = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    # A sigmoid output is always strictly positive, so comparing it with 0.0
    # would mark every row as positive; 0.5 is the decision boundary (it
    # corresponds to a pre-sigmoid value of 0).
    predicted_positive = prob > 0.5
    wrong = np.count_nonzero(labels != predicted_positive)
    return "error", float(wrong) / len(labels)
# No "objective" entry this time: the loss is supplied through `obj`, and
# the metric through `custom_metric`.
param = dict(max_depth=2, eta=1)
# Cross validation with the customized objective and metric.
xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=5,
    seed=0,
    obj=logregobj,
    custom_metric=evalerror,
)