lightgbm_predictor.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. from typing import TYPE_CHECKING, List, Optional, Union
  2. import lightgbm
  3. import pandas as pd
  4. from pandas.api.types import is_object_dtype
  5. from ray.air.constants import TENSOR_COLUMN_NAME
  6. from ray.air.data_batch_type import DataBatchType
  7. from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed
  8. from ray.train.lightgbm import LightGBMCheckpoint
  9. from ray.train.predictor import Predictor
  10. from ray.util.annotations import Deprecated
  11. if TYPE_CHECKING:
  12. from ray.data.preprocessor import Preprocessor
  @Deprecated
  class LightGBMPredictor(Predictor):
      """Predictor that runs inference with a LightGBM booster.

      Args:
          model: The LightGBM booster used to compute predictions.
          preprocessor: Optional preprocessor handed to the base
              ``Predictor``; it is meant to transform data batches before
              prediction.
      """

      def __init__(
          self, model: lightgbm.Booster, preprocessor: Optional["Preprocessor"] = None
      ):
          # The booster is stored before the base initializer runs, matching
          # the original statement order.
          self.model = model
          super().__init__(preprocessor)

      def __repr__(self):
          cls_name = self.__class__.__name__
          return (
              f"{cls_name}(model={self.model!r}, "
              f"preprocessor={self._preprocessor!r})"
          )

      @classmethod
      def from_checkpoint(cls, checkpoint: LightGBMCheckpoint) -> "LightGBMPredictor":
          """Build a predictor from a ``LightGBMCheckpoint``.

          Args:
              checkpoint: Checkpoint supplying both the model and the
                  preprocessor.

          Returns:
              A new predictor wrapping the checkpoint's model and preprocessor.
          """
          return cls(
              model=checkpoint.get_model(),
              preprocessor=checkpoint.get_preprocessor(),
          )

      def predict(
          self,
          data: DataBatchType,
          feature_columns: Optional[Union[List[str], List[int]]] = None,
          **predict_kwargs,
      ) -> DataBatchType:
          """Run inference on a batch of data.

          Args:
              data: A batch of input data.
              feature_columns: The names or indices of the columns in the
                  data to use as features to predict on. If None, then use
                  all columns in ``data``.
              **predict_kwargs: Keyword arguments passed to
                  ``lightgbm.Booster.predict``.

          Examples:

          .. testcode::

              import numpy as np
              import lightgbm as lgbm
              from ray.train.lightgbm import LightGBMPredictor

              train_X = np.array([[1, 2], [3, 4]])
              train_y = np.array([0, 1])

              model = lgbm.LGBMClassifier().fit(train_X, train_y)
              predictor = LightGBMPredictor(model=model.booster_)

              data = np.array([[1, 2], [3, 4]])
              predictions = predictor.predict(data)

              # Only use first and second column as the feature
              data = np.array([[1, 2, 8], [3, 4, 9]])
              predictions = predictor.predict(data, feature_columns=[0, 1])

              import pandas as pd
              import lightgbm as lgbm
              from ray.train.lightgbm import LightGBMPredictor

              train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
              train_y = pd.Series([0, 1])

              model = lgbm.LGBMClassifier().fit(train_X, train_y)
              predictor = LightGBMPredictor(model=model.booster_)

              # Pandas dataframe.
              data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
              predictions = predictor.predict(data)

              # Only use first and second column as the feature
              data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
              predictions = predictor.predict(data, feature_columns=["A", "B"])

          Returns:
              Prediction result.
          """
          # Delegate to the base implementation explicitly (not via super())
          # so the call target stays exactly ``Predictor.predict``.
          return Predictor.predict(
              self,
              data,
              feature_columns=feature_columns,
              **predict_kwargs,
          )

      def _predict_pandas(
          self,
          data: "pd.DataFrame",
          feature_columns: Optional[Union[List[str], List[int]]] = None,
          **predict_kwargs,
      ) -> pd.DataFrame:
          """Predict on a pandas batch and return the result as a DataFrame.

          When ``data`` contains the ``TENSOR_COLUMN_NAME`` column, that column
          is converted to a NumPy array, optionally sliced by
          ``feature_columns``, and rebuilt as a DataFrame whose remaining
          object-dtype columns are cast to categorical. Otherwise
          ``feature_columns``, if given, selects columns from ``data`` directly.

          Args:
              data: The input batch.
              feature_columns: Columns to use as features; a falsy value
                  (``None`` or an empty list) means all columns are used.
              **predict_kwargs: Forwarded to ``self.model.predict``.

          Returns:
              A DataFrame with a single ``"predictions"`` column when the model
              output has one column, otherwise columns named
              ``"predictions_0"``, ``"predictions_1"``, ...
          """
          if TENSOR_COLUMN_NAME in data:
              tensor = _unwrap_ndarray_object_type_if_needed(
                  data[TENSOR_COLUMN_NAME].to_numpy()
              )
              if feature_columns:
                  # In this case feature_columns is a list of integers
                  tensor = tensor[:, feature_columns]

              # Turn into dataframe to make dtype resolution easy
              features = pd.DataFrame(tensor).infer_objects()

              # Pandas does not detect categorical dtypes. Any remaining object
              # dtypes are probably categories, so convert them.
              # This will fail if we have a category composed entirely of
              # integers, but this is the best we can do here.
              categorical_casts = {
                  name: pd.CategoricalDtype()
                  for name, dtype in features.dtypes.items()
                  if is_object_dtype(dtype)
              }
              if categorical_casts:
                  features = features.astype(categorical_casts, copy=False)
          elif feature_columns:
              # feature_columns is a list of integers or strings
              features = data[feature_columns]
          else:
              features = data

          predictions = pd.DataFrame(self.model.predict(features, **predict_kwargs))

          num_outputs = len(predictions.columns)
          if num_outputs == 1:
              predictions.columns = ["predictions"]
          else:
              predictions.columns = [f"predictions_{i}" for i in range(num_outputs)]
          return predictions
  123. return df