transformer.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. from typing import List, Optional
  2. import numpy as np
  3. import pandas as pd
  4. from ray.data.preprocessor import Preprocessor
  5. from ray.util.annotations import PublicAPI
  6. @PublicAPI(stability="alpha")
  7. class PowerTransformer(Preprocessor):
  8. """Apply a `power transform <https://en.wikipedia.org/wiki/Power_transform>`_ to
  9. make your data more normally distributed.
  10. Some models expect data to be normally distributed. By making your data more
  11. Gaussian-like, you might be able to improve your model's performance.
  12. This preprocessor supports the following transformations:
  13. * `Yeo-Johnson <https://en.wikipedia.org/wiki/Power_transform#Yeo%E2%80%93Johnson_transformation>`_
  14. * `Box-Cox <https://en.wikipedia.org/wiki/Power_transform#Box%E2%80%93Cox_transformation>`_
  15. Box-Cox requires all data to be positive.
  16. .. warning::
  17. You need to manually specify the transform's power parameter. If you
  18. choose a bad value, the transformation might not work well.
  19. Args:
  20. columns: The columns to separately transform.
  21. power: A parameter that determines how your data is transformed. Practioners
  22. typically set ``power`` between :math:`-2.5` and :math:`2.5`, although you
  23. may need to try different values to find one that works well.
  24. method: A string representing which transformation to apply. Supports
  25. ``"yeo-johnson"`` and ``"box-cox"``. If you choose ``"box-cox"``, your data
  26. needs to be positive. Defaults to ``"yeo-johnson"``.
  27. output_columns: The names of the transformed columns. If None, the transformed
  28. columns will be the same as the input columns. If not None, the length of
  29. ``output_columns`` must match the length of ``columns``, othwerwise an error
  30. will be raised.
  31. """ # noqa: E501
  32. _valid_methods = ["yeo-johnson", "box-cox"]
  33. _is_fittable = False
  34. def __init__(
  35. self,
  36. columns: List[str],
  37. power: float,
  38. method: str = "yeo-johnson",
  39. *,
  40. output_columns: Optional[List[str]] = None,
  41. ):
  42. super().__init__()
  43. self.columns = columns
  44. self.method = method
  45. self.power = power
  46. self.output_columns = Preprocessor._derive_and_validate_output_columns(
  47. columns, output_columns
  48. )
  49. if method not in self._valid_methods:
  50. raise ValueError(
  51. f"Method {method} is not supported."
  52. f"Supported values are: {self._valid_methods}"
  53. )
  54. def _transform_pandas(self, df: pd.DataFrame):
  55. def column_power_transformer(s: pd.Series):
  56. if self.method == "yeo-johnson":
  57. result = np.zeros_like(s, dtype=np.float64)
  58. pos = s >= 0 # binary mask
  59. if self.power != 0:
  60. result[pos] = (np.power(s[pos] + 1, self.power) - 1) / self.power
  61. else:
  62. result[pos] = np.log(s[pos] + 1)
  63. if self.power != 2:
  64. result[~pos] = -(np.power(-s[~pos] + 1, 2 - self.power) - 1) / (
  65. 2 - self.power
  66. )
  67. else:
  68. result[~pos] = -np.log(-s[~pos] + 1)
  69. return result
  70. else: # box-cox
  71. if self.power != 0:
  72. return (np.power(s, self.power) - 1) / self.power
  73. else:
  74. return np.log(s)
  75. df[self.output_columns] = df[self.columns].transform(column_power_transformer)
  76. return df
  77. def __repr__(self):
  78. return (
  79. f"{self.__class__.__name__}(columns={self.columns!r}, "
  80. f"power={self.power!r}, method={self.method!r}, "
  81. f"output_columns={self.output_columns!r})"
  82. )