# xgboost_predictor.py (5.6 KB)
  1. from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
  2. import pandas as pd
  3. import xgboost
  4. from ray.air.constants import TENSOR_COLUMN_NAME
  5. from ray.air.data_batch_type import DataBatchType
  6. from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
  7. from ray.train.predictor import Predictor
  8. from ray.train.xgboost import XGBoostCheckpoint
  9. from ray.util.annotations import Deprecated
  10. if TYPE_CHECKING:
  11. from ray.data.preprocessor import Preprocessor
  12. @Deprecated
  13. class XGBoostPredictor(Predictor):
  14. """A predictor for XGBoost models.
  15. Args:
  16. model: The XGBoost booster to use for predictions.
  17. preprocessor: A preprocessor used to transform data batches prior
  18. to prediction.
  19. """
  20. def __init__(
  21. self, model: xgboost.Booster, preprocessor: Optional["Preprocessor"] = None
  22. ):
  23. self.model = model
  24. super().__init__(preprocessor)
  25. def __repr__(self):
  26. return (
  27. f"{self.__class__.__name__}(model={self.model!r}, "
  28. f"preprocessor={self._preprocessor!r})"
  29. )
  30. @classmethod
  31. def from_checkpoint(cls, checkpoint: XGBoostCheckpoint) -> "XGBoostPredictor":
  32. """Instantiate the predictor from a Checkpoint.
  33. This is a helper constructor that instantiates the predictor from a
  34. framework-specific XGBoost checkpoint.
  35. Args:
  36. checkpoint: The checkpoint to load the model and preprocessor from.
  37. """
  38. model = checkpoint.get_model()
  39. preprocessor = checkpoint.get_preprocessor()
  40. return cls(model=model, preprocessor=preprocessor)
  41. def predict(
  42. self,
  43. data: DataBatchType,
  44. feature_columns: Optional[Union[List[str], List[int]]] = None,
  45. dmatrix_kwargs: Optional[Dict[str, Any]] = None,
  46. **predict_kwargs,
  47. ) -> DataBatchType:
  48. """Run inference on data batch.
  49. The data is converted into an XGBoost DMatrix before being inputted to
  50. the model.
  51. Args:
  52. data: A batch of input data.
  53. feature_columns: The names or indices of the columns in the
  54. data to use as features to predict on. If None, then use
  55. all columns in ``data``.
  56. dmatrix_kwargs: Dict of keyword arguments passed to ``xgboost.DMatrix``.
  57. **predict_kwargs: Keyword arguments passed to ``xgboost.Booster.predict``.
  58. Examples:
  59. .. testcode::
  60. import numpy as np
  61. import xgboost as xgb
  62. from ray.train.xgboost import XGBoostPredictor
  63. train_X = np.array([[1, 2], [3, 4]])
  64. train_y = np.array([0, 1])
  65. model = xgb.XGBClassifier().fit(train_X, train_y)
  66. predictor = XGBoostPredictor(model=model.get_booster())
  67. data = np.array([[1, 2], [3, 4]])
  68. predictions = predictor.predict(data)
  69. # Only use first and second column as the feature
  70. data = np.array([[1, 2, 8], [3, 4, 9]])
  71. predictions = predictor.predict(data, feature_columns=[0, 1])
  72. .. testcode::
  73. import pandas as pd
  74. import xgboost as xgb
  75. from ray.train.xgboost import XGBoostPredictor
  76. train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
  77. train_y = pd.Series([0, 1])
  78. model = xgb.XGBClassifier().fit(train_X, train_y)
  79. predictor = XGBoostPredictor(model=model.get_booster())
  80. # Pandas dataframe.
  81. data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
  82. predictions = predictor.predict(data)
  83. # Only use first and second column as the feature
  84. data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
  85. predictions = predictor.predict(data, feature_columns=["A", "B"])
  86. Returns:
  87. Prediction result.
  88. """
  89. return Predictor.predict(
  90. self,
  91. data,
  92. feature_columns=feature_columns,
  93. dmatrix_kwargs=dmatrix_kwargs,
  94. **predict_kwargs,
  95. )
  96. def _predict_pandas(
  97. self,
  98. data: "pd.DataFrame",
  99. feature_columns: Optional[Union[List[str], List[int]]] = None,
  100. dmatrix_kwargs: Optional[Dict[str, Any]] = None,
  101. **predict_kwargs,
  102. ) -> "pd.DataFrame":
  103. dmatrix_kwargs = dmatrix_kwargs or {}
  104. feature_names = None
  105. if TENSOR_COLUMN_NAME in data:
  106. data = data[TENSOR_COLUMN_NAME].to_numpy()
  107. data = _unwrap_ndarray_object_type_if_needed(data)
  108. if feature_columns:
  109. # In this case feature_columns is a list of integers
  110. data = data[:, feature_columns]
  111. elif feature_columns:
  112. # feature_columns is a list of integers or strings
  113. data = data[feature_columns].to_numpy()
  114. # Only set the feature names if they are strings
  115. if all(isinstance(fc, str) for fc in feature_columns):
  116. feature_names = feature_columns
  117. else:
  118. feature_columns = data.columns.tolist()
  119. data = data.to_numpy()
  120. if all(isinstance(fc, str) for fc in feature_columns):
  121. feature_names = feature_columns
  122. if feature_names:
  123. dmatrix_kwargs["feature_names"] = feature_names
  124. matrix = xgboost.DMatrix(data, **dmatrix_kwargs)
  125. df = pd.DataFrame(self.model.predict(matrix, **predict_kwargs))
  126. df.columns = (
  127. ["predictions"]
  128. if len(df.columns) == 1
  129. else [f"predictions_{i}" for i in range(len(df.columns))]
  130. )
  131. return df