Note
Go to the end to download the full example code.
Use scikit-learn regressor interface with CPU histogram tree method
from dask import array as da
from dask.distributed import Client, LocalCluster
from xgboost import dask as dxgb
def main(client: Client) -> dxgb.Booster:
# generate some random data for demonstration
n = 100
m = 10000
partition_size = 100
X = da.random.random((m, n), partition_size)
y = da.random.random(m, partition_size)
regressor = dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
regressor.set_params(tree_method="hist")
# assigning client here is optional
regressor.client = client
regressor.fit(X, y, eval_set=[(X, y)])
prediction = regressor.predict(X)
bst = regressor.get_booster()
history = regressor.evals_result()
print("Evaluation history:", history)
# returned prediction is always a dask array.
assert isinstance(prediction, da.Array)
return bst # returning the trained model
if __name__ == "__main__":
# or use other clusters for scaling
with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
with Client(cluster) as client:
main(client)