tokenization_gpt2.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tokenization classes for OpenAI GPT."""
  15. from tokenizers import Tokenizer, decoders, pre_tokenizers
  16. from tokenizers.models import BPE
  17. from ...tokenization_utils_tokenizers import AddedToken, TokenizersBackend
  18. from ...utils import logging
  19. logger = logging.get_logger(__name__)
  20. VOCAB_FILES_NAMES = {
  21. "vocab_file": "vocab.json",
  22. "merges_file": "merges.txt",
  23. }
  24. class GPT2Tokenizer(TokenizersBackend):
  25. """
  26. Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
  27. This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
  28. be encoded differently whether it is at the beginning of the sentence (without space) or not:
  29. ```python
  30. >>> from transformers import GPT2Tokenizer
  31. >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
  32. >>> tokenizer("Hello world")["input_ids"]
  33. [15496, 995]
  34. >>> tokenizer(" Hello world")["input_ids"]
  35. [18435, 995]
  36. ```
  37. You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
  38. call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
  39. <Tip>
  40. When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
  41. </Tip>
  42. This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should refer to
  43. this superclass for more information regarding those methods.
  44. Args:
  45. vocab_file (`str`):
  46. Path to the vocabulary file.
  47. merges_file (`str`):
  48. Path to the merges file.
  49. errors (`str`, *optional*, defaults to `"replace"`):
  50. Paradigm to follow when decoding bytes to UTF-8. See
  51. [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
  52. unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
  53. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  54. token instead.
  55. bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
  56. The beginning of sequence token.
  57. eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
  58. The end of sequence token.
  59. pad_token (`str`, *optional*):
  60. The token used for padding, for example when batching sequences of different lengths.
  61. add_prefix_space (`bool`, *optional*, defaults to `False`):
  62. Whether or not to add an initial space to the input. This allows to treat the leading word just as any
  63. other word. (GPT2 tokenizer detect beginning of words by the preceding space).
  64. add_bos_token (`bool`, *optional*, defaults to `False`):
  65. Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
  66. word just as any other word.
  67. vocab (`str` or `dict[str, int]`, *optional*):
  68. Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
  69. merges (`str` or `list[str]`, *optional*):
  70. Custom merges list. If not provided, merges are loaded from `merges_file`.
  71. """
  72. vocab_files_names = VOCAB_FILES_NAMES
  73. model_input_names = ["input_ids", "attention_mask"]
  74. model = BPE
  75. def __init__(
  76. self,
  77. vocab: str | dict[str, int] | None = None,
  78. merges: str | list[str] | None = None,
  79. errors: str = "replace",
  80. unk_token: AddedToken | str = "<|endoftext|>",
  81. bos_token: AddedToken | str = "<|endoftext|>",
  82. eos_token: AddedToken | str = "<|endoftext|>",
  83. pad_token: AddedToken | str | None = None,
  84. add_prefix_space=False,
  85. **kwargs,
  86. ):
  87. self.add_prefix_space = add_prefix_space
  88. self._vocab = vocab if vocab is not None else {}
  89. self._merges = merges or []
  90. self._tokenizer = Tokenizer(
  91. BPE(
  92. vocab=self._vocab,
  93. merges=self._merges,
  94. dropout=None,
  95. continuing_subword_prefix="",
  96. end_of_word_suffix="",
  97. fuse_unk=False,
  98. )
  99. )
  100. self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
  101. self._tokenizer.decoder = decoders.ByteLevel()
  102. super().__init__(
  103. errors=errors,
  104. unk_token=unk_token,
  105. bos_token=bos_token,
  106. eos_token=eos_token,
  107. pad_token=pad_token,
  108. add_prefix_space=add_prefix_space,
  109. **kwargs,
  110. )
  111. __all__ = ["GPT2Tokenizer"]