normalizer.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. from typing import List, Optional
  2. import numpy as np
  3. import pandas as pd
  4. from ray.data.preprocessor import Preprocessor
  5. from ray.util.annotations import PublicAPI
  6. @PublicAPI(stability="alpha")
  7. class Normalizer(Preprocessor):
  8. r"""Scales each sample to have unit norm.
  9. This preprocessor works by dividing each sample (i.e., row) by the sample's norm.
  10. The general formula is given by
  11. .. math::
  12. s' = \frac{s}{\lVert s \rVert_p}
  13. where :math:`s` is the sample, :math:`s'` is the transformed sample,
  14. :math:\lVert s \rVert`, and :math:`p` is the norm type.
  15. The following norms are supported:
  16. * `"l1"` (:math:`L^1`): Sum of the absolute values.
  17. * `"l2"` (:math:`L^2`): Square root of the sum of the squared values.
  18. * `"max"` (:math:`L^\infty`): Maximum value.
  19. Examples:
  20. >>> import pandas as pd
  21. >>> import ray
  22. >>> from ray.data.preprocessors import Normalizer
  23. >>>
  24. >>> df = pd.DataFrame({"X1": [1, 1], "X2": [1, 0], "X3": [0, 1]})
  25. >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
  26. >>> ds.to_pandas() # doctest: +SKIP
  27. X1 X2 X3
  28. 0 1 1 0
  29. 1 1 0 1
  30. The :math:`L^2`-norm of the first sample is :math:`\sqrt{2}`, and the
  31. :math:`L^2`-norm of the second sample is :math:`1`.
  32. >>> preprocessor = Normalizer(columns=["X1", "X2"])
  33. >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
  34. X1 X2 X3
  35. 0 0.707107 0.707107 0
  36. 1 1.000000 0.000000 1
  37. The :math:`L^1`-norm of the first sample is :math:`2`, and the
  38. :math:`L^1`-norm of the second sample is :math:`1`.
  39. >>> preprocessor = Normalizer(columns=["X1", "X2"], norm="l1")
  40. >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
  41. X1 X2 X3
  42. 0 0.5 0.5 0
  43. 1 1.0 0.0 1
  44. The :math:`L^\infty`-norm of the both samples is :math:`1`.
  45. >>> preprocessor = Normalizer(columns=["X1", "X2"], norm="max")
  46. >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
  47. X1 X2 X3
  48. 0 1.0 1.0 0
  49. 1 1.0 0.0 1
  50. :class:`Normalizer` can also be used in append mode by providing the
  51. name of the output_columns that should hold the normalized values.
  52. >>> preprocessor = Normalizer(columns=["X1", "X2"], output_columns=["X1_normalized", "X2_normalized"])
  53. >>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
  54. X1 X2 X3 X1_normalized X2_normalized
  55. 0 1 1 0 0.707107 0.707107
  56. 1 1 0 1 1.000000 0.000000
  57. Args:
  58. columns: The columns to scale. For each row, these colmumns are scaled to
  59. unit-norm.
  60. norm: The norm to use. The supported values are ``"l1"``, ``"l2"``, or
  61. ``"max"``. Defaults to ``"l2"``.
  62. output_columns: The names of the transformed columns. If None, the transformed
  63. columns will be the same as the input columns. If not None, the length of
  64. ``output_columns`` must match the length of ``columns``, othwerwise an error
  65. will be raised.
  66. Raises:
  67. ValueError: if ``norm`` is not ``"l1"``, ``"l2"``, or ``"max"``.
  68. """
  69. _norm_fns = {
  70. "l1": lambda cols: np.abs(cols).sum(axis=1),
  71. "l2": lambda cols: np.sqrt(np.power(cols, 2).sum(axis=1)),
  72. "max": lambda cols: np.max(abs(cols), axis=1),
  73. }
  74. _is_fittable = False
  75. def __init__(
  76. self,
  77. columns: List[str],
  78. norm="l2",
  79. *,
  80. output_columns: Optional[List[str]] = None,
  81. ):
  82. super().__init__()
  83. self.columns = columns
  84. self.norm = norm
  85. if norm not in self._norm_fns:
  86. raise ValueError(
  87. f"Norm {norm} is not supported."
  88. f"Supported values are: {self._norm_fns.keys()}"
  89. )
  90. self.output_columns = Preprocessor._derive_and_validate_output_columns(
  91. columns, output_columns
  92. )
  93. def _transform_pandas(self, df: pd.DataFrame):
  94. columns = df.loc[:, self.columns]
  95. column_norms = self._norm_fns[self.norm](columns)
  96. df[self.output_columns] = columns.div(column_norms, axis=0)
  97. return df
  98. def __repr__(self):
  99. return (
  100. f"{self.__class__.__name__}(columns={self.columns!r}, "
  101. f"norm={self.norm!r}, "
  102. f"output_columns={self.output_columns!r})"
  103. )