| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- """W&B callback for lightgbm.
- Really simple callback to get logging for each tree
- Example usage:
- param_list = [("eta", 0.08), ("max_depth", 6), ("subsample", 0.8), ("colsample_bytree", 0.8), ("alpha", 8), ("num_class", 10)]
- config.update(dict(param_list))
- lgb = lgb.train(param_list, d_train, callbacks=[wandb_callback()])
- """
- from pathlib import Path
- from typing import TYPE_CHECKING, Callable
- import lightgbm # type: ignore
- from lightgbm import Booster
- import wandb
- from wandb.sdk.lib import telemetry as wb_telemetry
- MINIMIZE_METRICS = [
- "l1",
- "l2",
- "rmse",
- "mape",
- "huber",
- "fair",
- "poisson",
- "gamma",
- "binary_logloss",
- ]
- MAXIMIZE_METRICS = ["map", "auc", "average_precision"]
- if TYPE_CHECKING:
- from typing import Any, NamedTuple, Union
- # Note: upstream lightgbm has this defined incorrectly
- _EvalResultTuple = Union[
- tuple[str, str, float, bool], tuple[str, str, float, bool, float]
- ]
- class CallbackEnv(NamedTuple):
- model: Any
- params: dict
- iteration: int
- begin_interation: int
- end_iteration: int
- evaluation_result_list: list[_EvalResultTuple]
- def _define_metric(data: str, metric_name: str) -> None:
- """Capture model performance at the best step.
- instead of the last step, of training in your `wandb.summary`
- """
- if "loss" in str.lower(metric_name):
- wandb.define_metric(f"{data}_{metric_name}", summary="min")
- elif str.lower(metric_name) in MINIMIZE_METRICS:
- wandb.define_metric(f"{data}_{metric_name}", summary="min")
- elif str.lower(metric_name) in MAXIMIZE_METRICS:
- wandb.define_metric(f"{data}_{metric_name}", summary="max")
- def _checkpoint_artifact(
- model: "Booster", iteration: int, aliases: "list[str]"
- ) -> None:
- """Upload model checkpoint as W&B artifact."""
- # NOTE: type ignore required because wandb.run is improperly inferred as None type
- model_name = f"model_{wandb.run.id}" # type: ignore
- model_path = Path(wandb.run.dir) / f"model_ckpt_{iteration}.txt" # type: ignore
- model.save_model(model_path, num_iteration=iteration)
- model_artifact = wandb.Artifact(name=model_name, type="model")
- model_artifact.add_file(str(model_path))
- wandb.log_artifact(model_artifact, aliases=aliases)
- def _log_feature_importance(model: "Booster") -> None:
- """Log feature importance."""
- feat_imps = model.feature_importance()
- feats = model.feature_name()
- fi_data = [[feat, feat_imp] for feat, feat_imp in zip(feats, feat_imps)]
- table = wandb.Table(data=fi_data, columns=["Feature", "Importance"])
- wandb.log(
- {
- "Feature Importance": wandb.plot.bar(
- table, "Feature", "Importance", title="Feature Importance"
- )
- },
- commit=False,
- )
- class _WandbCallback:
- """Internal class to handle `wandb_callback` logic.
- This callback is adapted form the LightGBM's `_RecordEvaluationCallback`.
- """
- def __init__(self, log_params: bool = True, define_metric: bool = True) -> None:
- self.order = 20
- self.before_iteration = False
- self.log_params = log_params
- self.define_metric_bool = define_metric
- def _init(self, env: "CallbackEnv") -> None:
- with wb_telemetry.context() as tel:
- tel.feature.lightgbm_wandb_callback = True
- # log the params as W&B config.
- if self.log_params:
- wandb.config.update(env.params)
- # use `define_metric` to set the wandb summary to the best metric value.
- for item in env.evaluation_result_list:
- if self.define_metric_bool:
- if len(item) == 4:
- data_name, eval_name = item[:2]
- _define_metric(data_name, eval_name)
- else:
- data_name, eval_name = item[1].split()
- _define_metric(data_name, f"{eval_name}-mean")
- _define_metric(data_name, f"{eval_name}-stdv")
- def __call__(self, env: "CallbackEnv") -> None:
- if env.iteration == env.begin_iteration: # type: ignore
- self._init(env)
- for item in env.evaluation_result_list:
- if len(item) == 4:
- data_name, eval_name, result = item[:3]
- wandb.log(
- {data_name + "_" + eval_name: result},
- commit=False,
- )
- else:
- data_name, eval_name = item[1].split()
- res_mean = item[2]
- res_stdv = item[4]
- wandb.log(
- {
- data_name + "_" + eval_name + "-mean": res_mean,
- data_name + "_" + eval_name + "-stdv": res_stdv,
- },
- commit=False,
- )
- # call `commit=True` to log the data as a single W&B step.
- wandb.log({"iteration": env.iteration}, commit=True)
- def wandb_callback(log_params: bool = True, define_metric: bool = True) -> Callable:
- """Automatically integrates LightGBM with wandb.
- Args:
- log_params: (boolean) if True (default) logs params passed to lightgbm.train as W&B config
- define_metric: (boolean) if True (default) capture model performance at the best step, instead of the last step, of training in your `wandb.summary`
- Passing `wandb_callback` to LightGBM will:
- - log params passed to lightgbm.train as W&B config (default).
- - log evaluation metrics collected by LightGBM, such as rmse, accuracy etc to Weights & Biases
- - Capture the best metric in `wandb.summary` when `define_metric=True` (default).
- Use `log_summary` as an extension of this callback.
- Example:
- ```python
- params = {
- "boosting_type": "gbdt",
- "objective": "regression",
- }
- gbm = lgb.train(
- params,
- lgb_train,
- num_boost_round=10,
- valid_sets=lgb_eval,
- valid_names=("validation"),
- callbacks=[wandb_callback()],
- )
- ```
- """
- return _WandbCallback(log_params, define_metric)
- def log_summary(
- model: Booster, feature_importance: bool = True, save_model_checkpoint: bool = False
- ) -> None:
- """Log useful metrics about lightgbm model after training is done.
- Args:
- model: (Booster) is an instance of lightgbm.basic.Booster.
- feature_importance: (boolean) if True (default), logs the feature importance plot.
- save_model_checkpoint: (boolean) if True saves the best model and upload as W&B artifacts.
- Using this along with `wandb_callback` will:
- - log `best_iteration` and `best_score` as `wandb.summary`.
- - log feature importance plot.
- - save and upload your best trained model to Weights & Biases Artifacts (when `save_model_checkpoint = True`)
- Example:
- ```python
- params = {
- "boosting_type": "gbdt",
- "objective": "regression",
- }
- gbm = lgb.train(
- params,
- lgb_train,
- num_boost_round=10,
- valid_sets=lgb_eval,
- valid_names=("validation"),
- callbacks=[wandb_callback()],
- )
- log_summary(gbm)
- ```
- """
- if wandb.run is None:
- raise wandb.Error("You must call wandb.init() before WandbCallback()")
- if not isinstance(model, Booster):
- raise wandb.Error("Model should be an instance of lightgbm.basic.Booster")
- wandb.run.summary["best_iteration"] = model.best_iteration
- wandb.run.summary["best_score"] = model.best_score
- # Log feature importance
- if feature_importance:
- _log_feature_importance(model)
- if save_model_checkpoint:
- _checkpoint_artifact(model, model.best_iteration, aliases=["best"])
- with wb_telemetry.context() as tel:
- tel.feature.lightgbm_log_summary = True
|