tokenization_openai.py 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tokenization classes for OpenAI GPT."""
  15. from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
  16. from tokenizers.models import BPE
  17. from ...tokenization_utils_tokenizers import TokenizersBackend
  18. from ...utils import logging
  19. logger = logging.get_logger(__name__)
  20. VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
  21. class OpenAIGPTTokenizer(TokenizersBackend):
  22. """
  23. Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
  24. the following peculiarities:
  25. - lower case all inputs
  26. - uses BERT's BasicTokenizer for pre-BPE tokenization
  27. This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
  28. refer to this superclass for more information regarding those methods.
  29. Args:
  30. vocab_file (`str`, *optional*):
  31. Path to the vocabulary file.
  32. merges_file (`str`, *optional*):
  33. Path to the merges file.
  34. tokenizer_file (`str`, *optional*):
  35. Path to a tokenizers JSON file containing the serialization of a tokenizer.
  36. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  37. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  38. token instead.
  39. vocab (`str` or `dict[str, int]`, *optional*):
  40. Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
  41. merges (`str` or `list[str]`, *optional*):
  42. Custom merges list. If not provided, an empty list is used.
  43. """
  44. vocab_files_names = VOCAB_FILES_NAMES
  45. model_input_names = ["input_ids", "attention_mask"]
  46. model = BPE
  47. def __init__(
  48. self,
  49. vocab: str | dict[str, int] | None = None,
  50. merges: str | list[str] | None = None,
  51. unk_token: str = "<unk>",
  52. **kwargs,
  53. ):
  54. self._vocab = vocab if vocab is not None else {str(unk_token): 0}
  55. self._merges = merges or []
  56. self._tokenizer = Tokenizer(
  57. BPE(
  58. vocab=self._vocab,
  59. merges=self._merges,
  60. dropout=None,
  61. continuing_subword_prefix="",
  62. end_of_word_suffix="</w>",
  63. fuse_unk=False,
  64. unk_token=str(unk_token),
  65. )
  66. )
  67. # Set normalizer and pre-tokenizer to mimic OpenAI GPT behavior
  68. # OpenAI GPT uses BERT BasicTokenizer with lower_case=True
  69. self._tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
  70. self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
  71. self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
  72. super().__init__(
  73. unk_token=unk_token,
  74. **kwargs,
  75. )
  76. @property
  77. def do_lower_case(self):
  78. return True
  79. __all__ = ["OpenAIGPTTokenizer"]