# hasher.py (4.8 KB)
  1. import collections
  2. from typing import List
  3. import pandas as pd
  4. from ray.data.preprocessor import Preprocessor
  5. from ray.data.preprocessors.utils import simple_hash
  6. from ray.util.annotations import PublicAPI
  7. @PublicAPI(stability="alpha")
  8. class FeatureHasher(Preprocessor):
  9. r"""Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
  10. table that describes token frequencies.
  11. :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
  12. where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
  13. ``hash_{index}`` describes the frequency of tokens that hash to ``index``.
  14. Distinct tokens can correspond to the same index. However, if ``num_features`` is
  15. large enough, then columns probably correspond to a unique token.
  16. This preprocessor is memory efficient and quick to pickle. However, given a
  17. transformed column, you can't know which tokens correspond to it. This might make it
  18. hard to determine which tokens are important to your model.
  19. .. warning::
  20. Sparse matrices aren't supported. If you use a large ``num_features``, this
  21. preprocessor might behave poorly.
  22. Examples:
  23. >>> import pandas as pd
  24. >>> import ray
  25. >>> from ray.data.preprocessors import FeatureHasher
  26. The data below describes the frequencies of tokens in ``"I like Python"`` and
  27. ``"I dislike Python"``.
  28. >>> df = pd.DataFrame({
  29. ... "I": [1, 1],
  30. ... "like": [1, 0],
  31. ... "dislike": [0, 1],
  32. ... "Python": [1, 1]
  33. ... })
  34. >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
  35. :class:`FeatureHasher` hashes each token to determine its index. For example,
  36. the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.
  37. >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
  38. >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy() # doctest: +SKIP
  39. array([[0, 0, 0, 2, 0, 1, 0, 0],
  40. [0, 0, 0, 1, 0, 1, 1, 0]])
  41. Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
  42. :math:`3`. You can avoid hash collisions like these by increasing
  43. ``num_features``.
  44. Args:
  45. columns: The columns to apply the hashing trick to. Each column should describe
  46. the frequency of a token.
  47. num_features: The number of features used to represent the vocabulary. You
  48. should choose a value large enough to prevent hash collisions between
  49. distinct tokens.
  50. output_column: The name of the column that contains the hashed features.
  51. .. seealso::
  52. :class:`~ray.data.preprocessors.CountVectorizer`
  53. Use this preprocessor to generate inputs for :class:`FeatureHasher`.
  54. :class:`ray.data.preprocessors.HashingVectorizer`
  55. If your input data describes documents rather than token frequencies,
  56. use :class:`~ray.data.preprocessors.HashingVectorizer`.
  57. """ # noqa: E501
  58. _is_fittable = False
  59. def __init__(
  60. self,
  61. columns: List[str],
  62. num_features: int,
  63. output_column: str,
  64. ):
  65. super().__init__()
  66. self.columns = columns
  67. # TODO(matt): Set default number of features.
  68. # This likely requires sparse matrix support to avoid explosion of columns.
  69. self.num_features = num_features
  70. self.output_column = output_column
  71. def _transform_pandas(self, df: pd.DataFrame):
  72. # TODO(matt): Use sparse matrix for efficiency.
  73. def row_feature_hasher(row):
  74. hash_counts = collections.defaultdict(int)
  75. for column in self.columns:
  76. hashed_value = simple_hash(column, self.num_features)
  77. hash_counts[hashed_value] += row[column]
  78. return {f"hash_{i}": hash_counts[i] for i in range(self.num_features)}
  79. feature_columns = df.loc[:, self.columns].apply(
  80. row_feature_hasher, axis=1, result_type="expand"
  81. )
  82. # Concatenate the hash columns
  83. hash_columns = [f"hash_{i}" for i in range(self.num_features)]
  84. concatenated = feature_columns[hash_columns].to_numpy()
  85. # Use a Pandas Series for column assignment to get more consistent
  86. # behavior across Pandas versions.
  87. df.loc[:, self.output_column] = pd.Series(list(concatenated))
  88. return df
  89. def get_input_columns(self) -> List[str]:
  90. return self.columns
  91. def get_output_columns(self) -> List[str]:
  92. return [self.output_column]
  93. def __repr__(self):
  94. return (
  95. f"{self.__class__.__name__}(columns={self.columns!r}, "
  96. f"num_features={self.num_features!r}, "
  97. f"output_column={self.output_column!r})"
  98. )