tokenization_mbart50.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. # Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
  15. from tokenizers.models import Unigram
  16. from ...tokenization_python import AddedToken, BatchEncoding
  17. from ...tokenization_utils_tokenizers import TokenizersBackend
  18. from ...utils import logging
  19. logger = logging.get_logger(__name__)
  20. VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
  21. FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"] # fmt: skip
  22. class MBart50Tokenizer(TokenizersBackend):
  23. """
  24. Construct a MBart50 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
  25. [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
  26. This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
  27. refer to this superclass for more information regarding those methods.
  28. Args:
  29. vocab_file (`str`, *optional*):
  30. Path to the vocabulary file.
  31. src_lang (`str`, *optional*):
  32. A string representing the source language.
  33. tgt_lang (`str`, *optional*):
  34. A string representing the target language.
  35. eos_token (`str`, *optional*, defaults to `"</s>"`):
  36. The end of sequence token.
  37. sep_token (`str`, *optional*, defaults to `"</s>"`):
  38. The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
  39. sequence classification or for a text and a question for question answering. It is also used as the last
  40. token of a sequence built with special tokens.
  41. cls_token (`str`, *optional*, defaults to `"<s>"`):
  42. The classifier token which is used when doing sequence classification (classification of the whole sequence
  43. instead of per-token classification). It is the first token of the sequence when built with special tokens.
  44. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  45. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  46. token instead.
  47. pad_token (`str`, *optional*, defaults to `"<pad>"`):
  48. The token used for padding, for example when batching sequences of different lengths.
  49. mask_token (`str`, *optional*, defaults to `"<mask>"`):
  50. The token used for masking values. This is the token used when training this model with masked language
  51. modeling. This is the token which the model will try to predict.
  52. Examples:
  53. ```python
  54. >>> from transformers import MBart50Tokenizer
  55. >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
  56. >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
  57. >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
  58. >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
  59. >>> # model(**model_inputs) should work
  60. ```"""
  61. vocab_files_names = VOCAB_FILES_NAMES
  62. model_input_names = ["input_ids", "attention_mask"]
  63. model = Unigram
  64. prefix_tokens: list[int] = []
  65. suffix_tokens: list[int] = []
  66. def __init__(
  67. self,
  68. vocab: str | dict | list | None = None,
  69. _spm_precompiled_charsmap: str | None = None,
  70. src_lang=None,
  71. tgt_lang=None,
  72. eos_token="</s>",
  73. sep_token="</s>",
  74. cls_token="<s>",
  75. unk_token="<unk>",
  76. pad_token="<pad>",
  77. mask_token="<mask>",
  78. **kwargs,
  79. ):
  80. mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
  81. # Do not pass language codes via extra_special_tokens to super().__init__.
  82. # We will mark them as special AFTER backend construction to avoid re-adding tokens
  83. # when loading from pretrained files.
  84. # Always construct a tokenizer_object without referencing external tokenizer files
  85. if isinstance(vocab, list):
  86. # MBart50 uses fairseq vocab alignment matching MBart50Converter:
  87. # <s>=0, <pad>=1, </s>=2, <unk>=3, then tokens, lang codes, <mask>
  88. vocab = [(str(item[0]), float(item[1])) for item in vocab]
  89. vocab_tokens = [item[0] for item in vocab]
  90. has_language_codes = any(lang_code in vocab_tokens for lang_code in FAIRSEQ_LANGUAGE_CODES)
  91. if has_language_codes:
  92. self._vocab_scores = vocab
  93. else:
  94. # Vocab from SentencePieceExtractor is in sentencepiece format:
  95. # <unk>=0, <s>=1, </s>=2, then tokens
  96. # We need to reorder to fairseq format: <s>=0, <pad>=1, </s>=2, <unk>=3, then tokens
  97. # Reorder: fairseq expects <s>, <pad>, </s>, <unk>, then rest of vocab starting from index 3
  98. vocab_list = [
  99. (str(cls_token), 0.0), # 0: <s>
  100. (str(pad_token), 0.0), # 1: <pad>
  101. (str(eos_token), 0.0), # 2: </s>
  102. (str(unk_token), 0.0), # 3: <unk>
  103. ]
  104. # Add remaining tokens from position 3 onwards (skip <unk>, <s>, </s> from sentencepiece)
  105. vocab_list.extend(vocab[3:])
  106. # Add language codes
  107. for lang_code in FAIRSEQ_LANGUAGE_CODES:
  108. vocab_list.append((str(lang_code), 0.0))
  109. # Add mask token
  110. vocab_list.append((str(mask_token), 0.0))
  111. self._vocab_scores = vocab_list
  112. else:
  113. # Minimal fallback: small vocab with specials and language codes
  114. self._vocab_scores = [
  115. (str(cls_token), 0.0),
  116. (str(pad_token), 0.0),
  117. (str(eos_token), 0.0),
  118. (str(unk_token), 0.0),
  119. ("▁", -2.0),
  120. ]
  121. for lang_code in FAIRSEQ_LANGUAGE_CODES:
  122. self._vocab_scores.append((lang_code, 0.0))
  123. self._vocab_scores.append((str(mask_token), 0.0))
  124. # Build backend tokenizer from self._vocab_scores (both branches above set it)
  125. self._tokenizer = Tokenizer(
  126. Unigram(
  127. self._vocab_scores,
  128. unk_id=3,
  129. byte_fallback=False,
  130. )
  131. )
  132. normalizers_ = [normalizers.Replace(Regex(r" {2,}"), " ")]
  133. if _spm_precompiled_charsmap is not None:
  134. normalizers_ = [normalizers.Precompiled(_spm_precompiled_charsmap)] + normalizers_
  135. self._tokenizer.normalizer = normalizers.Sequence(normalizers_)
  136. self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
  137. self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
  138. additional_special_tokens = kwargs.pop("additional_special_tokens", []) or []
  139. additional_special_tokens.extend(FAIRSEQ_LANGUAGE_CODES)
  140. super().__init__(
  141. src_lang=src_lang,
  142. tgt_lang=tgt_lang,
  143. eos_token=eos_token,
  144. sep_token=sep_token,
  145. cls_token=cls_token,
  146. unk_token=unk_token,
  147. pad_token=pad_token,
  148. mask_token=mask_token,
  149. additional_special_tokens=additional_special_tokens,
  150. **kwargs,
  151. )
  152. self.fairseq_offset = 1
  153. # Mark language codes as extra special tokens without re-adding them to the backend.
  154. # Merge with any pre-existing extra_special_tokens (e.g., restored from config on load).
  155. try:
  156. lang_tokens = [AddedToken(code, special=True) for code in FAIRSEQ_LANGUAGE_CODES]
  157. except Exception:
  158. lang_tokens = list(FAIRSEQ_LANGUAGE_CODES)
  159. existing_extra = getattr(self, "_extra_special_tokens", []) or []
  160. # Preserve order: keep existing, append missing language codes
  161. existing_strs = {str(t) for t in existing_extra}
  162. merged_extra = list(existing_extra) + [t for t in lang_tokens if str(t) not in existing_strs]
  163. self._extra_special_tokens = merged_extra
  164. self._src_lang = src_lang if src_lang is not None else "en_XX"
  165. self.tgt_lang = tgt_lang
  166. # Build language code mappings and fairseq mappings
  167. # This will be called again in _post_init after tokenizer.json is loaded
  168. self._build_language_code_mappings()
  169. self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
  170. self.set_src_lang_special_tokens(self._src_lang)
  171. def _build_language_code_mappings(self):
  172. """Build language code to ID mappings and fairseq compatibility mappings."""
  173. self.lang_code_to_id = {
  174. lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
  175. }
  176. self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
  177. # Build fairseq token mappings for backward compatibility
  178. self.fairseq_tokens_to_ids = {
  179. "<s>": 0,
  180. "<pad>": 1,
  181. "</s>": 2,
  182. "<unk>": 3,
  183. }
  184. self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
  185. mask_token = getattr(self, "mask_token", "<mask>")
  186. self.fairseq_tokens_to_ids["<mask>"] = self.convert_tokens_to_ids(str(mask_token))
  187. self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
  188. def _post_init(self):
  189. """Called after tokenizer.json is loaded in from_pretrained."""
  190. # Rebuild language code mappings with the loaded tokenizer
  191. self._build_language_code_mappings()
  192. # Update cur_lang_code_id with the correct ID
  193. if hasattr(self, "_src_lang"):
  194. self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
  195. self.set_src_lang_special_tokens(self._src_lang)
  196. @property
  197. def src_lang(self) -> str:
  198. return self._src_lang
  199. @src_lang.setter
  200. def src_lang(self, new_src_lang: str) -> None:
  201. self._src_lang = new_src_lang
  202. self.set_src_lang_special_tokens(self._src_lang)
  203. def prepare_seq2seq_batch(
  204. self,
  205. src_texts: list[str],
  206. src_lang: str = "en_XX",
  207. tgt_texts: list[str] | None = None,
  208. tgt_lang: str = "ro_RO",
  209. **kwargs,
  210. ) -> BatchEncoding:
  211. self.src_lang = src_lang
  212. self.tgt_lang = tgt_lang
  213. return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
  214. def _switch_to_input_mode(self):
  215. return self.set_src_lang_special_tokens(self.src_lang)
  216. def _switch_to_target_mode(self):
  217. if self.tgt_lang is None:
  218. self.tgt_lang = self._src_lang
  219. return self.set_tgt_lang_special_tokens(self.tgt_lang)
  220. def set_src_lang_special_tokens(self, src_lang: str) -> None:
  221. """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
  222. self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
  223. self.prefix_tokens = [self.cur_lang_code_id]
  224. self.suffix_tokens = [self.eos_token_id]
  225. prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
  226. suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
  227. self._tokenizer.post_processor = processors.TemplateProcessing(
  228. single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
  229. pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
  230. special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
  231. )
  232. def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
  233. """Reset the special tokens to the target language setting. prefix=[tgt_lang_code] and suffix=[eos]."""
  234. self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
  235. self.prefix_tokens = [self.cur_lang_code_id]
  236. self.suffix_tokens = [self.eos_token_id]
  237. prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
  238. suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
  239. self._tokenizer.post_processor = processors.TemplateProcessing(
  240. single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
  241. pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
  242. special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
  243. )
  244. def _build_translation_inputs(
  245. self, raw_inputs, return_tensors: str, src_lang: str | None, tgt_lang: str | None, **extra_kwargs
  246. ):
  247. """Used by translation pipeline, to prepare inputs for the generate function"""
  248. if src_lang is None or tgt_lang is None:
  249. raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
  250. self.src_lang = src_lang
  251. inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
  252. tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
  253. inputs["forced_bos_token_id"] = tgt_lang_id
  254. return inputs
  255. __all__ = ["MBart50Tokenizer"]
  256. # Backward alias
  257. MBart50TokenizerFast = MBart50Tokenizer