| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- from typing import Callable, List, Optional
- import pandas as pd
- from ray.data.preprocessor import Preprocessor
- from ray.data.preprocessors.utils import simple_split_tokenizer
- from ray.util.annotations import PublicAPI
@PublicAPI(stability="alpha")
class Tokenizer(Preprocessor):
    """Replace each string with a list of tokens.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({"text": ["Hello, world!", "foo bar\\nbaz"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        The default ``tokenization_fn`` delimits strings using the space
        character.

        >>> from ray.data.preprocessors import Tokenizer
        >>> tokenizer = Tokenizer(columns=["text"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                       text
        0  [Hello,, world!]
        1   [foo, bar\\nbaz]

        If the default logic isn't adequate for your use case, you can
        specify a custom ``tokenization_fn``.

        >>> import string
        >>> def tokenization_fn(s):
        ...     for character in string.punctuation:
        ...         s = s.replace(character, "")
        ...     return s.split()
        >>> tokenizer = Tokenizer(columns=["text"], tokenization_fn=tokenization_fn)
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                      text
        0   [Hello, world]
        1  [foo, bar, baz]

        :class:`Tokenizer` can also be used in append mode by providing the
        names of the ``output_columns`` that should hold the tokenized values.

        >>> tokenizer = Tokenizer(columns=["text"], output_columns=["text_tokenized"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                    text    text_tokenized
        0  Hello, world!  [Hello,, world!]
        1   foo bar\\nbaz   [foo, bar\\nbaz]

    Args:
        columns: The columns to tokenize.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent
            to ``lambda s: s.split(" ")``.
        output_columns: The names of the transformed columns. If None, the
            transformed columns will be the same as the input columns. If not
            None, the length of ``output_columns`` must match the length of
            ``columns``, otherwise an error will be raised.
    """

    # This preprocessor is declared as not fittable.
    _is_fittable = False

    def __init__(
        self,
        columns: List[str],
        tokenization_fn: Optional[Callable[[str], List[str]]] = None,
        output_columns: Optional[List[str]] = None,
    ):
        """Store the column selection and resolve the tokenization function.

        Args:
            columns: Names of the columns to tokenize.
            tokenization_fn: Callable applied to every value of the selected
                columns. A falsy value selects ``simple_split_tokenizer``.
            output_columns: Destination column names, derived and validated
                by ``Preprocessor._derive_and_validate_output_columns``.
        """
        super().__init__()
        self.columns = columns
        # TODO(matt): Add a more robust default tokenizer.
        self.tokenization_fn = (
            tokenization_fn if tokenization_fn else simple_split_tokenizer
        )
        self.output_columns = Preprocessor._derive_and_validate_output_columns(
            columns, output_columns
        )

    def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
        """Tokenize the configured columns of ``df``.

        Every value in ``self.columns`` is mapped through
        ``self.tokenization_fn`` and the result is assigned to
        ``self.output_columns``. ``df`` is modified in place and returned.
        """
        tokenize = self.tokenization_fn
        selected = df.loc[:, self.columns]
        # Apply the tokenizer element-wise, one column (Series) at a time.
        tokenized = selected.transform(lambda series: series.map(tokenize))
        df[self.output_columns] = tokenized
        return df

    def __repr__(self) -> str:
        """Return a constructor-like description of this tokenizer."""
        fn = self.tokenization_fn
        # Show the function's ``__name__`` when it has one; otherwise fall
        # back to formatting the callable object itself.
        fn_label = getattr(fn, "__name__", fn)
        fields = ", ".join(
            (
                f"columns={self.columns!r}",
                f"tokenization_fn={fn_label}",
                f"output_columns={self.output_columns!r}",
            )
        )
        return f"{self.__class__.__name__}({fields})"
|