tokenization_pegasus.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # Copyright 2020 Google and The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tokenization class for model PEGASUS."""
  15. from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
  16. from tokenizers.models import Unigram
  17. from ...tokenization_utils_tokenizers import TokenizersBackend
  18. from ...utils import logging
  # Module-level logger, obtained through the library's own logging helper.
  logger = logging.get_logger(__name__)

  # Resource key -> file name; exposed on the tokenizer class as `vocab_files_names`.
  VOCAB_FILES_NAMES = {
      "vocab_file": "spiece.model",
      "tokenizer_file": "tokenizer.json",
  }
  class PegasusTokenizer(TokenizersBackend):
      r"""
      Construct a PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
      [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

      This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
      refer to this superclass for more information regarding those methods.

      Args:
          vocab_file (`str`, *optional*):
              [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
              contains the vocabulary necessary to instantiate a tokenizer.
          pad_token (`str`, *optional*, defaults to `"<pad>"`):
              The token used for padding, for example when batching sequences of different lengths.
          eos_token (`str`, *optional*, defaults to `"</s>"`):
              The end of sequence token. The post-processor installed by this class appends it to every encoded
              single sequence (`$A </s>`) and to every encoded pair of sequences (`$A $B </s>`).
          unk_token (`str`, *optional*, defaults to `"<unk>"`):
              The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
              token instead. It must be present in `vocab`, because its position is used as the Unigram `unk_id`.
          mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
              The token used for masking single token values. This is the token used when training this model with masked
              language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
              It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
              Summarization](https://huggingface.co/papers/1912.08777).
          mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
              The token used for masking whole target sentences. This is the token used when training this model with gap
              sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
              pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
              Abstractive Summarization](https://huggingface.co/papers/1912.08777).
          additional_special_tokens (`List[str]`, *optional*):
              Additional special tokens used by the tokenizer. The constructor always builds the final list itself:
              `mask_token_sent` (unless it is `None` or already contained in the list passed in), followed by the
              placeholders `<unk_2>`, ..., `<unk_{offset - 1}>` (`<unk_102>` with the default offset). Other entries
              of the list passed in are not copied into the final list. The placeholders correspond to the [original
              PEGASUS
              tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
              that uses the tokens 2 - 104 only for pretraining.
          offset (`int`, *optional*, defaults to 103):
              Offset for additional special tokens. It is the exclusive upper bound of the `<unk_i>` placeholder
              numbering.
          vocab (`str` or `list[tuple[str, float]]`, *optional*):
              Custom vocabulary with `(token, score)` tuples. If not provided, a blank vocabulary made of the unknown,
              padding, end-of-sequence and mask tokens (each with score `0.0`) is initialized.
          _spm_precompiled_charsmap (*optional*):
              When provided, it is passed to `normalizers.Precompiled` as the first normalization step. When omitted,
              newlines are replaced by a space instead. In both cases runs of two or more spaces are collapsed into
              a single space.

      Raises:
          ValueError: If `unk_token` cannot be found in the vocabulary.
      """

      vocab_files_names = VOCAB_FILES_NAMES
      model_input_names = ["input_ids", "attention_mask"]
      model = Unigram

      def __init__(
          self,
          vocab: str | list[tuple[str, float]] | None = None,
          pad_token="<pad>",
          eos_token="</s>",
          unk_token="<unk>",
          mask_token="<mask_2>",
          mask_token_sent="<mask_1>",
          _spm_precompiled_charsmap=None,
          additional_special_tokens=None,
          offset=103,
          **kwargs,
      ):
          self.offset = offset

          # NOTE(review): entries supplied by the caller are not carried over. The list is rebuilt from
          # `mask_token_sent` (dropped when the caller already listed it) plus the `<unk_i>` placeholders.
          # Kept as-is because changing the registered tokens could shift token ids; confirm this is intended.
          if additional_special_tokens is None or mask_token_sent not in additional_special_tokens:
              additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
          else:
              additional_special_tokens = []
          additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]

          if vocab is None:
              # Blank vocabulary: only the four core special tokens, all with a neutral score.
              vocab = [(str(unk_token), 0.0), (str(pad_token), 0.0), (str(eos_token), 0.0), (str(mask_token), 0.0)]

          self._vocab = vocab

          # Locate the unknown token by name, scanning from the very first entry. The previous lookup
          # (`list.index(item, 1)`) skipped index 0, where the blank vocabulary above stores the unknown token,
          # and additionally required the score to be exactly 0.0.
          unk_piece = str(unk_token)
          unk_id = next((idx for idx, (piece, _score) in enumerate(vocab) if piece == unk_piece), None)
          if unk_id is None:
              raise ValueError(
                  f"unk_token {unk_piece!r} was not found in the provided vocabulary; "
                  "it is required to set the Unigram `unk_id`."
              )

          self._tokenizer = Tokenizer(Unigram(vocab=vocab, unk_id=unk_id))

          # Both normalizer variants end by collapsing runs of spaces; they differ only in the first step.
          collapse_spaces = normalizers.Replace(Regex(r" {2,}"), " ")
          if _spm_precompiled_charsmap is not None:
              first_step = normalizers.Precompiled(_spm_precompiled_charsmap)
          else:
              first_step = normalizers.Replace(Regex(r"\n"), " ")
          self._tokenizer.normalizer = normalizers.Sequence([first_step, collapse_spaces])

          self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
          self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)

          super().__init__(
              pad_token=pad_token,
              eos_token=eos_token,
              unk_token=unk_token,
              mask_token=mask_token,
              mask_token_sent=mask_token_sent,
              offset=offset,
              additional_special_tokens=additional_special_tokens,
              **kwargs,
          )

          # Installed after `super().__init__()` because the template needs the id of the end-of-sequence token,
          # which is resolved through `self.convert_tokens_to_ids` (presumably only usable once the backend is set up).
          eos_piece = str(eos_token)
          self._tokenizer.post_processor = processors.TemplateProcessing(
              single=f"$A {eos_piece}",
              pair=f"$A $B {eos_piece}",
              special_tokens=[(eos_piece, self.convert_tokens_to_ids(eos_piece))],
          )
  # Names exported by `from <module> import *`.
  __all__ = [
      "PegasusTokenizer",
  ]