vectorizer.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. from collections import Counter
  2. from typing import TYPE_CHECKING, Callable, List, Optional
  3. import pandas as pd
  4. from ray.data.preprocessor import Preprocessor
  5. from ray.data.preprocessors.utils import simple_hash, simple_split_tokenizer
  6. from ray.util.annotations import PublicAPI
  7. if TYPE_CHECKING:
  8. from ray.data.dataset import Dataset
  9. @PublicAPI(stability="alpha")
  10. class HashingVectorizer(Preprocessor):
  11. """Count the frequency of tokens using the
  12. `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_.
  13. This preprocessors creates a list column for each input column. For each row,
  14. the list contains the frequency counts of tokens (for CountVectorizer) or hash values
  15. (for HashingVectorizer). For HashingVectorizer, the list will have length
  16. ``num_features``. If ``num_features`` is large enough relative to the size of your
  17. vocabulary, then each index approximately corresponds to the frequency of a unique
  18. token.
  19. :class:`HashingVectorizer` is memory efficient and quick to pickle. However, given a
  20. transformed column, you can't know which tokens correspond to it. This might make it
  21. hard to determine which tokens are important to your model.
  22. .. note::
  23. This preprocessor transforms each input column to a
  24. `document-term matrix <https://en.wikipedia.org/wiki/Document-term_matrix>`_.
  25. A document-term matrix is a table that describes the frequency of tokens in a
  26. collection of documents. For example, the strings `"I like Python"` and `"I
  27. dislike Python"` might have the document-term matrix below:
  28. .. code-block::
  29. corpus_I corpus_Python corpus_dislike corpus_like
  30. 0 1 1 1 0
  31. 1 1 1 0 1
  32. To generate the matrix, you typically map each token to a unique index. For
  33. example:
  34. .. code-block::
  35. token index
  36. 0 I 0
  37. 1 Python 1
  38. 2 dislike 2
  39. 3 like 3
  40. The problem with this approach is that memory use scales linearly with the size
  41. of your vocabulary. :class:`HashingVectorizer` circumvents this problem by
  42. computing indices with a hash function:
  43. :math:`\\texttt{index} = hash(\\texttt{token})`.
  44. .. warning::
  45. Sparse matrices aren't currently supported. If you use a large ``num_features``,
  46. this preprocessor might behave poorly.
  47. Examples:
  48. >>> import pandas as pd
  49. >>> import ray
  50. >>> from ray.data.preprocessors import HashingVectorizer
  51. >>>
  52. >>> df = pd.DataFrame({
  53. ... "corpus": [
  54. ... "Jimmy likes volleyball",
  55. ... "Bob likes volleyball too",
  56. ... "Bob also likes fruit jerky"
  57. ... ]
  58. ... })
  59. >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
  60. >>>
  61. >>> vectorizer = HashingVectorizer(["corpus"], num_features=8)
  62. >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP
  63. corpus
  64. 0 [1, 0, 1, 0, 0, 0, 0, 1]
  65. 1 [1, 0, 1, 0, 0, 0, 1, 1]
  66. 2 [0, 0, 1, 1, 0, 2, 1, 0]
  67. :class:`HashingVectorizer` can also be used in append mode by providing the
  68. name of the output_columns that should hold the encoded values.
  69. >>> vectorizer = HashingVectorizer(["corpus"], num_features=8, output_columns=["corpus_hashed"])
  70. >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP
  71. corpus corpus_hashed
  72. 0 Jimmy likes volleyball [1, 0, 1, 0, 0, 0, 0, 1]
  73. 1 Bob likes volleyball too [1, 0, 1, 0, 0, 0, 1, 1]
  74. 2 Bob also likes fruit jerky [0, 0, 1, 1, 0, 2, 1, 0]
  75. Args:
  76. columns: The columns to separately tokenize and count.
  77. num_features: The number of features used to represent the vocabulary. You
  78. should choose a value large enough to prevent hash collisions between
  79. distinct tokens.
  80. tokenization_fn: The function used to generate tokens. This function
  81. should accept a string as input and return a list of tokens as
  82. output. If unspecified, the tokenizer uses a function equivalent to
  83. ``lambda s: s.split(" ")``.
  84. output_columns: The names of the transformed columns. If None, the transformed
  85. columns will be the same as the input columns. If not None, the length of
  86. ``output_columns`` must match the length of ``columns``, othwerwise an error
  87. will be raised.
  88. .. seealso::
  89. :class:`CountVectorizer`
  90. Another method for counting token frequencies. Unlike :class:`HashingVectorizer`,
  91. :class:`CountVectorizer` creates a feature for each unique token. This
  92. enables you to compute the inverse transformation.
  93. :class:`FeatureHasher`
  94. This preprocessor is similar to :class:`HashingVectorizer`, except it expects
  95. a table describing token frequencies. In contrast,
  96. :class:`FeatureHasher` expects a column containing documents.
  97. """ # noqa: E501
  98. _is_fittable = False
  99. def __init__(
  100. self,
  101. columns: List[str],
  102. num_features: int,
  103. tokenization_fn: Optional[Callable[[str], List[str]]] = None,
  104. *,
  105. output_columns: Optional[List[str]] = None,
  106. ):
  107. super().__init__()
  108. self.columns = columns
  109. self.num_features = num_features
  110. self.tokenization_fn = tokenization_fn or simple_split_tokenizer
  111. self.output_columns = Preprocessor._derive_and_validate_output_columns(
  112. columns, output_columns
  113. )
  114. def _transform_pandas(self, df: pd.DataFrame):
  115. def hash_count(tokens: List[str]) -> Counter:
  116. hashed_tokens = [simple_hash(token, self.num_features) for token in tokens]
  117. return Counter(hashed_tokens)
  118. for col, output_col in zip(self.columns, self.output_columns):
  119. tokenized = df[col].map(self.tokenization_fn)
  120. hashed = tokenized.map(hash_count)
  121. # Create a list to store the hash columns
  122. hash_columns = []
  123. for i in range(self.num_features):
  124. series = hashed.map(lambda counts: counts[i])
  125. series.name = f"hash_{i}"
  126. hash_columns.append(series)
  127. # Concatenate all hash columns into a single list column
  128. df[output_col] = pd.concat(hash_columns, axis=1).values.tolist()
  129. return df
  130. def __repr__(self):
  131. fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn)
  132. return (
  133. f"{self.__class__.__name__}(columns={self.columns!r}, "
  134. f"num_features={self.num_features!r}, tokenization_fn={fn_name}, "
  135. f"output_columns={self.output_columns!r})"
  136. )
  137. @PublicAPI(stability="alpha")
  138. class CountVectorizer(Preprocessor):
  139. """Count the frequency of tokens in a column of strings.
  140. :class:`CountVectorizer` operates on columns that contain strings. For example:
  141. .. code-block::
  142. corpus
  143. 0 I dislike Python
  144. 1 I like Python
  145. This preprocessor creates a list column for each input column. Each list contains
  146. the frequency counts of tokens in order of their first appearance. For example:
  147. .. code-block::
  148. corpus
  149. 0 [1, 1, 1, 0] # Counts for [I, dislike, Python, like]
  150. 1 [1, 0, 1, 1] # Counts for [I, dislike, Python, like]
  151. Examples:
  152. >>> import pandas as pd
  153. >>> import ray
  154. >>> from ray.data.preprocessors import CountVectorizer
  155. >>>
  156. >>> df = pd.DataFrame({
  157. ... "corpus": [
  158. ... "Jimmy likes volleyball",
  159. ... "Bob likes volleyball too",
  160. ... "Bob also likes fruit jerky"
  161. ... ]
  162. ... })
  163. >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
  164. >>>
  165. >>> vectorizer = CountVectorizer(["corpus"])
  166. >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP
  167. corpus
  168. 0 [1, 0, 1, 1, 0, 0, 0, 0]
  169. 1 [1, 1, 1, 0, 0, 0, 0, 1]
  170. 2 [1, 1, 0, 0, 1, 1, 1, 0]
  171. You can limit the number of tokens in the vocabulary with ``max_features``.
  172. >>> vectorizer = CountVectorizer(["corpus"], max_features=3)
  173. >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP
  174. corpus
  175. 0 [1, 0, 1]
  176. 1 [1, 1, 1]
  177. 2 [1, 1, 0]
  178. :class:`CountVectorizer` can also be used in append mode by providing the
  179. name of the output_columns that should hold the encoded values.
  180. >>> vectorizer = CountVectorizer(["corpus"], output_columns=["corpus_counts"])
  181. >>> vectorizer.fit_transform(ds).to_pandas() # doctest: +SKIP
  182. corpus corpus_counts
  183. 0 Jimmy likes volleyball [1, 0, 1, 1, 0, 0, 0, 0]
  184. 1 Bob likes volleyball too [1, 1, 1, 0, 0, 0, 0, 1]
  185. 2 Bob also likes fruit jerky [1, 1, 0, 0, 1, 1, 1, 0]
  186. Args:
  187. columns: The columns to separately tokenize and count.
  188. tokenization_fn: The function used to generate tokens. This function
  189. should accept a string as input and return a list of tokens as
  190. output. If unspecified, the tokenizer uses a function equivalent to
  191. ``lambda s: s.split(" ")``.
  192. max_features: The maximum number of tokens to encode in the transformed
  193. dataset. If specified, only the most frequent tokens are encoded.
  194. output_columns: The names of the transformed columns. If None, the transformed
  195. columns will be the same as the input columns. If not None, the length of
  196. ``output_columns`` must match the length of ``columns``, othwerwise an error
  197. will be raised.
  198. """ # noqa: E501
  199. def __init__(
  200. self,
  201. columns: List[str],
  202. tokenization_fn: Optional[Callable[[str], List[str]]] = None,
  203. max_features: Optional[int] = None,
  204. *,
  205. output_columns: Optional[List[str]] = None,
  206. ):
  207. super().__init__()
  208. self.columns = columns
  209. self.tokenization_fn = tokenization_fn or simple_split_tokenizer
  210. self.max_features = max_features
  211. self.output_columns = Preprocessor._derive_and_validate_output_columns(
  212. columns, output_columns
  213. )
  214. def _fit(self, dataset: "Dataset") -> Preprocessor:
  215. def stat_fn(key_gen):
  216. def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
  217. def get_token_counts(col):
  218. token_series = df[col].apply(self.tokenization_fn)
  219. tokens = token_series.sum()
  220. return Counter(tokens)
  221. return {col: [get_token_counts(col)] for col in self.columns}
  222. value_counts = dataset.map_batches(
  223. get_pd_value_counts, batch_format="pandas"
  224. )
  225. total_counts = {col: Counter() for col in self.columns}
  226. for batch in value_counts.iter_batches(batch_size=None):
  227. for col, counters in batch.items():
  228. for counter in counters:
  229. total_counts[col].update(counter)
  230. def most_common(counter: Counter, n: int):
  231. return Counter(dict(counter.most_common(n)))
  232. top_counts = [
  233. most_common(counter, self.max_features)
  234. for counter in total_counts.values()
  235. ]
  236. return {
  237. key_gen(col): counts # noqa
  238. for (col, counts) in zip(self.columns, top_counts)
  239. }
  240. self.stat_computation_plan.add_callable_stat(
  241. stat_fn=lambda key_gen: stat_fn(key_gen),
  242. stat_key_fn=lambda col: f"token_counts({col})",
  243. columns=self.columns,
  244. )
  245. return self
  246. def _transform_pandas(self, df: pd.DataFrame):
  247. result_columns = []
  248. for col, output_col in zip(self.columns, self.output_columns):
  249. token_counts = self.stats_[f"token_counts({col})"]
  250. sorted_tokens = [token for (token, count) in token_counts.most_common()]
  251. tokenized = df[col].map(self.tokenization_fn).map(Counter)
  252. # Create a list to store token frequencies
  253. token_columns = []
  254. for token in sorted_tokens:
  255. series = tokenized.map(lambda val: val[token])
  256. series.name = token
  257. token_columns.append(series)
  258. # Concatenate all token columns into a single list column
  259. if token_columns:
  260. df[output_col] = pd.concat(token_columns, axis=1).values.tolist()
  261. else:
  262. df[output_col] = [[]] * len(df)
  263. result_columns.append(output_col)
  264. return df
  265. def __repr__(self):
  266. fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn)
  267. return (
  268. f"{self.__class__.__name__}(columns={self.columns!r}, "
  269. f"tokenization_fn={fn_name}, max_features={self.max_features!r}, "
  270. f"output_columns={self.output_columns!r})"
  271. )