tokenization_auto.py 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. # Copyright 2018 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Auto Tokenizer class."""
  15. import importlib
  16. import json
  17. import os
  18. import sys
  19. from collections import OrderedDict
  20. from typing import Any
  21. from transformers.utils.import_utils import is_mistral_common_available
  22. from ...configuration_utils import PreTrainedConfig
  23. from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
  24. from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
  25. from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
  26. from ...utils import (
  27. extract_commit_hash,
  28. is_g2p_en_available,
  29. is_sentencepiece_available,
  30. is_tokenizers_available,
  31. logging,
  32. )
  33. from ...utils.hub import cached_file
  34. from ..encoder_decoder import EncoderDecoderConfig
  35. from .auto_factory import _LazyAutoMapping
  36. from .configuration_auto import (
  37. CONFIG_MAPPING_NAMES,
  38. AutoConfig,
  39. config_class_to_model_type,
  40. model_type_to_module_name,
  41. replace_list_option_in_docstrings,
  42. )
  43. if is_tokenizers_available():
  44. from ...tokenization_utils_tokenizers import TokenizersBackend
  45. else:
  46. TokenizersBackend = None
  47. if is_sentencepiece_available():
  48. from ...tokenization_utils_sentencepiece import SentencePieceBackend
  49. else:
  50. SentencePieceBackend = None
  51. logger = logging.get_logger(__name__)
  52. # V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based)
  53. REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {}
  54. REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {}
  55. TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None](
  56. [
  57. ("aimv2", "CLIPTokenizer" if is_tokenizers_available() else None),
  58. ("albert", "AlbertTokenizer" if is_tokenizers_available() else None),
  59. ("align", "BertTokenizer" if is_tokenizers_available() else None),
  60. ("audioflamingo3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  61. ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
  62. ("bark", "BertTokenizer" if is_tokenizers_available() else None),
  63. ("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
  64. ("barthez", "BarthezTokenizer" if is_tokenizers_available() else None),
  65. ("bartpho", "BartphoTokenizer"),
  66. ("bert", "BertTokenizer" if is_tokenizers_available() else None),
  67. ("bert-generation", "BertGenerationTokenizer" if is_sentencepiece_available() else None),
  68. ("bert-japanese", "BertJapaneseTokenizer"),
  69. ("bertweet", "BertweetTokenizer"),
  70. ("big_bird", "BigBirdTokenizer" if is_tokenizers_available() else None),
  71. ("bigbird_pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
  72. ("biogpt", "BioGptTokenizer"),
  73. ("blenderbot", "BlenderbotTokenizer" if is_tokenizers_available() else None),
  74. ("blenderbot-small", "BlenderbotSmallTokenizer"),
  75. ("blip", "BertTokenizer" if is_tokenizers_available() else None),
  76. ("blip-2", "GPT2Tokenizer" if is_tokenizers_available() else None),
  77. ("bridgetower", "RobertaTokenizer"),
  78. ("bros", "BertTokenizer" if is_tokenizers_available() else None),
  79. ("byt5", "ByT5Tokenizer"),
  80. ("camembert", "CamembertTokenizer" if is_tokenizers_available() else None),
  81. ("canine", "CanineTokenizer"),
  82. ("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None),
  83. ("clap", "RobertaTokenizer"),
  84. ("clip", "CLIPTokenizer" if is_tokenizers_available() else None),
  85. ("clipseg", "CLIPTokenizer" if is_tokenizers_available() else None),
  86. ("clvp", "ClvpTokenizer"),
  87. ("code_llama", "CodeLlamaTokenizer" if is_tokenizers_available() else None),
  88. ("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None),
  89. ("cohere", "CohereTokenizer" if is_tokenizers_available() else None),
  90. ("cohere2", "CohereTokenizer" if is_tokenizers_available() else None),
  91. ("colqwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  92. ("convbert", "BertTokenizer" if is_tokenizers_available() else None),
  93. ("cpm", "CpmTokenizer" if is_tokenizers_available() else None),
  94. ("cpmant", "CpmAntTokenizer"),
  95. ("ctrl", "CTRLTokenizer"),
  96. ("data2vec-audio", "Wav2Vec2CTCTokenizer"),
  97. ("data2vec-text", "RobertaTokenizer"),
  98. ("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None),
  99. ("deberta", "DebertaTokenizer" if is_tokenizers_available() else None),
  100. ("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None),
  101. ("dia", "DiaTokenizer"),
  102. ("distilbert", "BertTokenizer" if is_tokenizers_available() else None),
  103. ("dpr", "DPRQuestionEncoderTokenizer" if is_tokenizers_available() else None),
  104. ("electra", "BertTokenizer" if is_tokenizers_available() else None),
  105. ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
  106. ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
  107. ("esm", "EsmTokenizer"),
  108. ("falcon_mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  109. ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
  110. ("flaubert", "FlaubertTokenizer"),
  111. ("flava", "BertTokenizer" if is_tokenizers_available() else None),
  112. ("flex_olmo", "GPT2Tokenizer" if is_tokenizers_available() else None),
  113. ("florence2", "BartTokenizer" if is_tokenizers_available() else None),
  114. ("fnet", "FNetTokenizer" if is_tokenizers_available() else None),
  115. ("fsmt", "FSMTTokenizer"),
  116. ("funnel", "FunnelTokenizer" if is_tokenizers_available() else None),
  117. ("gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
  118. ("gemma2", "GemmaTokenizer" if is_tokenizers_available() else None),
  119. ("gemma3", "GemmaTokenizer" if is_tokenizers_available() else None),
  120. ("gemma3_text", "GemmaTokenizer" if is_tokenizers_available() else None),
  121. ("gemma3n", "GemmaTokenizer" if is_tokenizers_available() else None),
  122. ("gemma3n_text", "GemmaTokenizer" if is_tokenizers_available() else None),
  123. ("git", "BertTokenizer" if is_tokenizers_available() else None),
  124. ("glm", "TokenizersBackend" if is_tokenizers_available() else None),
  125. ("glm4", "TokenizersBackend" if is_tokenizers_available() else None),
  126. ("glm4_moe", "TokenizersBackend" if is_tokenizers_available() else None),
  127. ("glm4_moe_lite", "TokenizersBackend" if is_tokenizers_available() else None),
  128. ("glm4v", "TokenizersBackend" if is_tokenizers_available() else None),
  129. ("glm4v_moe", "TokenizersBackend" if is_tokenizers_available() else None),
  130. ("glm_image", "TokenizersBackend" if is_tokenizers_available() else None),
  131. ("glmasr", "TokenizersBackend" if is_tokenizers_available() else None),
  132. ("got_ocr2", "TokenizersBackend" if is_tokenizers_available() else None),
  133. ("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None),
  134. ("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None),
  135. ("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None),
  136. ("gpt_neo", "GPT2Tokenizer" if is_tokenizers_available() else None),
  137. ("gpt_neox", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  138. ("gpt_neox_japanese", "GPTNeoXJapaneseTokenizer"),
  139. ("gptj", "GPT2Tokenizer" if is_tokenizers_available() else None),
  140. ("granite", "GPT2Tokenizer"),
  141. ("granitemoe", "GPT2Tokenizer"),
  142. ("granitemoehybrid", "GPT2Tokenizer"),
  143. ("granitemoeshared", "GPT2Tokenizer"),
  144. ("grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
  145. ("groupvit", "CLIPTokenizer" if is_tokenizers_available() else None),
  146. ("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
  147. ("hubert", "Wav2Vec2CTCTokenizer"),
  148. ("ibert", "RobertaTokenizer"),
  149. ("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
  150. ("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
  151. ("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None),
  152. ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None),
  153. ("internvl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  154. ("jais2", "GPT2Tokenizer" if is_tokenizers_available() else None),
  155. ("jina_embeddings_v3", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  156. ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  157. ("lasr_ctc", "LasrTokenizer" if is_tokenizers_available() else None),
  158. ("lasr_encoder", "LasrTokenizer" if is_tokenizers_available() else None),
  159. ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
  160. ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
  161. ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
  162. ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None),
  163. ("led", "LEDTokenizer" if is_tokenizers_available() else None),
  164. ("lighton_ocr", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
  165. ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
  166. ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
  167. ("luke", "LukeTokenizer"),
  168. ("lxmert", "LxmertTokenizer" if is_tokenizers_available() else None),
  169. ("m2m_100", "M2M100Tokenizer" if is_sentencepiece_available() else None),
  170. ("mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  171. ("mamba2", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  172. ("marian", "MarianTokenizer" if is_sentencepiece_available() else None),
  173. ("markuplm", "MarkupLMTokenizer" if is_tokenizers_available() else None),
  174. ("mbart", "MBartTokenizer" if is_tokenizers_available() else None),
  175. ("mbart50", "MBart50Tokenizer" if is_tokenizers_available() else None),
  176. ("mega", "RobertaTokenizer"),
  177. ("megatron-bert", "BertTokenizer" if is_tokenizers_available() else None),
  178. ("metaclip_2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  179. ("mgp-str", "MgpstrTokenizer"),
  180. (
  181. "ministral",
  182. "MistralCommonBackend"
  183. if is_mistral_common_available()
  184. else ("TokenizersBackend" if is_tokenizers_available() else None),
  185. ),
  186. (
  187. "ministral3",
  188. "MistralCommonBackend"
  189. if is_mistral_common_available()
  190. else ("TokenizersBackend" if is_tokenizers_available() else None),
  191. ),
  192. (
  193. "mistral",
  194. "MistralCommonBackend"
  195. if is_mistral_common_available()
  196. else ("TokenizersBackend" if is_tokenizers_available() else None),
  197. ),
  198. (
  199. "mistral3",
  200. "MistralCommonBackend"
  201. if is_mistral_common_available()
  202. else ("TokenizersBackend" if is_tokenizers_available() else None),
  203. ),
  204. (
  205. "mixtral",
  206. "MistralCommonBackend"
  207. if is_mistral_common_available()
  208. else ("TokenizersBackend" if is_tokenizers_available() else None),
  209. ),
  210. ("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None),
  211. ("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
  212. ("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None),
  213. ("mpnet", "MPNetTokenizer" if is_tokenizers_available() else None),
  214. ("mpt", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  215. ("mra", "RobertaTokenizer"),
  216. ("mt5", "T5Tokenizer" if is_tokenizers_available() else None),
  217. ("musicgen", "T5Tokenizer" if is_tokenizers_available() else None),
  218. ("musicgen_melody", "T5Tokenizer" if is_tokenizers_available() else None),
  219. ("mvp", "MvpTokenizer" if is_tokenizers_available() else None),
  220. ("myt5", "MyT5Tokenizer"),
  221. ("nezha", "BertTokenizer" if is_tokenizers_available() else None),
  222. ("nllb", "NllbTokenizer" if is_tokenizers_available() else None),
  223. ("nllb-moe", "NllbTokenizer" if is_tokenizers_available() else None),
  224. ("nomic_bert", "BertTokenizer" if is_tokenizers_available() else None),
  225. ("nougat", "NougatTokenizer" if is_tokenizers_available() else None),
  226. ("nystromformer", "AlbertTokenizer" if is_tokenizers_available() else None),
  227. ("olmo", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  228. ("olmo2", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  229. ("olmo3", "TokenizersBackend" if is_tokenizers_available() else None),
  230. ("olmo_hybrid", "TokenizersBackend" if is_tokenizers_available() else None),
  231. ("olmoe", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  232. ("omdet-turbo", "CLIPTokenizer" if is_tokenizers_available() else None),
  233. ("oneformer", "CLIPTokenizer" if is_tokenizers_available() else None),
  234. ("openai-gpt", "OpenAIGPTTokenizer" if is_tokenizers_available() else None),
  235. ("opt", "GPT2Tokenizer" if is_tokenizers_available() else None),
  236. ("ovis2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  237. ("owlv2", "CLIPTokenizer" if is_tokenizers_available() else None),
  238. ("owlvit", "CLIPTokenizer" if is_tokenizers_available() else None),
  239. ("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
  240. ("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None),
  241. ("perceiver", "PerceiverTokenizer"),
  242. ("phi", "GPT2Tokenizer" if is_tokenizers_available() else None),
  243. ("phobert", "PhobertTokenizer"),
  244. ("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None),
  245. (
  246. "pixtral",
  247. "MistralCommonBackend"
  248. if is_mistral_common_available()
  249. else ("TokenizersBackend" if is_tokenizers_available() else None),
  250. ),
  251. ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
  252. ("prophetnet", "ProphetNetTokenizer"),
  253. ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None),
  254. ("qwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  255. ("qwen2_5_omni", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  256. ("qwen2_5_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  257. ("qwen2_audio", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  258. ("qwen2_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  259. ("qwen2_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  260. ("qwen3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  261. ("qwen3_5", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
  262. ("qwen3_5_moe", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
  263. ("qwen3_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  264. ("qwen3_next", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  265. ("qwen3_omni_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  266. ("qwen3_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  267. ("qwen3_vl_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
  268. ("rag", "RagTokenizer"),
  269. ("realm", "BertTokenizer" if is_tokenizers_available() else None),
  270. ("recurrent_gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
  271. ("reformer", "ReformerTokenizer" if is_tokenizers_available() else None),
  272. ("rembert", "RemBertTokenizer" if is_tokenizers_available() else None),
  273. ("retribert", "BertTokenizer" if is_tokenizers_available() else None),
  274. ("roberta", "RobertaTokenizer"),
  275. ("roberta-prelayernorm", "RobertaTokenizer"),
  276. ("roc_bert", "RoCBertTokenizer"),
  277. ("roformer", "RoFormerTokenizer" if is_tokenizers_available() else None),
  278. ("rwkv", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  279. ("sam3", "CLIPTokenizer" if is_tokenizers_available() else None),
  280. ("sam3_video", "CLIPTokenizer" if is_tokenizers_available() else None),
  281. ("seamless_m4t", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
  282. ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
  283. ("shieldgemma2", "GemmaTokenizer" if is_tokenizers_available() else None),
  284. ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None),
  285. ("siglip2", "Siglip2Tokenizer" if is_tokenizers_available() else None),
  286. ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None),
  287. ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None),
  288. ("splinter", "SplinterTokenizer"),
  289. ("squeezebert", "BertTokenizer" if is_tokenizers_available() else None),
  290. ("stablelm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  291. ("starcoder2", "GPT2Tokenizer" if is_tokenizers_available() else None),
  292. ("switch_transformers", "T5Tokenizer" if is_tokenizers_available() else None),
  293. ("t5", "T5Tokenizer" if is_tokenizers_available() else None),
  294. ("t5gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
  295. ("tapas", "TapasTokenizer"),
  296. ("trocr", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  297. ("tvp", "BertTokenizer" if is_tokenizers_available() else None),
  298. ("udop", "UdopTokenizer" if is_tokenizers_available() else None),
  299. ("umt5", "T5Tokenizer" if is_tokenizers_available() else None),
  300. ("unispeech", "Wav2Vec2CTCTokenizer"),
  301. ("unispeech-sat", "Wav2Vec2CTCTokenizer"),
  302. ("vilt", "BertTokenizer" if is_tokenizers_available() else None),
  303. ("visual_bert", "BertTokenizer" if is_tokenizers_available() else None),
  304. ("vits", "VitsTokenizer"),
  305. (
  306. "voxtral",
  307. "MistralCommonBackend"
  308. if is_mistral_common_available()
  309. else ("TokenizersBackend" if is_tokenizers_available() else None),
  310. ),
  311. (
  312. "voxtral_realtime",
  313. "MistralCommonBackend"
  314. if is_mistral_common_available()
  315. else ("TokenizersBackend" if is_tokenizers_available() else None),
  316. ),
  317. ("wav2vec2", "Wav2Vec2CTCTokenizer"),
  318. ("wav2vec2-bert", "Wav2Vec2CTCTokenizer"),
  319. ("wav2vec2-conformer", "Wav2Vec2CTCTokenizer"),
  320. ("wav2vec2_phoneme", "Wav2Vec2PhonemeCTCTokenizer"),
  321. ("whisper", "WhisperTokenizer" if is_tokenizers_available() else None),
  322. ("xclip", "CLIPTokenizer" if is_tokenizers_available() else None),
  323. ("xglm", "XGLMTokenizer" if is_tokenizers_available() else None),
  324. ("xlm", "XLMTokenizer"),
  325. ("xlm-roberta", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  326. ("xlm-roberta-xl", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  327. ("xlnet", "XLNetTokenizer" if is_tokenizers_available() else None),
  328. ("xlstm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
  329. ("xmod", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
  330. ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
  331. ]
  332. )
  333. # Models with incorrect tokenizer_class in their Hub tokenizer_config.json files.
  334. # These models will be forced to use TokenizersBackend.
  335. MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS: set[str] = {
  336. "arctic",
  337. "chameleon",
  338. "chatlm",
  339. "deepseek_v2",
  340. "deepseek_v3",
  341. "deepseek_vl",
  342. "deepseek_vl_hybrid",
  343. "deepseek_vl_v2",
  344. "fuyu",
  345. "h2ovl_chat",
  346. "hyperclovax_vlm",
  347. "internlm2",
  348. "internvl_chat",
  349. "jamba",
  350. "janus",
  351. "llava",
  352. "llava_next",
  353. "minicpmv",
  354. "minimax_m2",
  355. "modernbert",
  356. "molmo",
  357. "molmo2",
  358. "nemotron",
  359. "nvfp4",
  360. "opencua",
  361. "openvla",
  362. "phi3",
  363. "phi3_v",
  364. "phimoe",
  365. "step3p5",
  366. "vipllava",
  367. "cohere_asr",
  368. }
  369. for model_type in MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS:
  370. if model_type not in TOKENIZER_MAPPING_NAMES:
  371. TOKENIZER_MAPPING_NAMES[model_type] = "TokenizersBackend" if is_tokenizers_available() else None
  372. TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
  373. CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
  374. def load_vocab(vocab_file):
  375. """Loads a vocabulary file into a dictionary."""
  376. with open(vocab_file, "r", encoding="utf-8") as reader:
  377. return json.load(reader)
  378. def load_merges(merges_file):
  379. """Loads a merges file into a list."""
  380. merges = []
  381. with open(merges_file, "r", encoding="utf-8") as reader:
  382. for line in reader:
  383. line = line.strip()
  384. if line and not line.startswith("#"):
  385. merges.append(tuple(line.split()))
  386. return merges
  387. def tokenizer_class_from_name(class_name: str) -> type[Any] | None:
  388. # Bloom tokenizer classes were removed but should map to the fast backend for BC
  389. if class_name in {"BloomTokenizer", "BloomTokenizerFast"}:
  390. return TokenizersBackend
  391. if class_name in REGISTERED_FAST_ALIASES:
  392. return REGISTERED_FAST_ALIASES[class_name]
  393. if class_name in REGISTERED_TOKENIZER_CLASSES:
  394. return REGISTERED_TOKENIZER_CLASSES[class_name]
  395. if class_name == "TokenizersBackend":
  396. return TokenizersBackend
  397. # V5: TOKENIZER_MAPPING_NAMES now maps to single strings, not tuples
  398. for module_name, tokenizer_class in TOKENIZER_MAPPING_NAMES.items():
  399. if tokenizer_class == class_name:
  400. module_name = model_type_to_module_name(module_name)
  401. if (
  402. module_name in ["mistral", "mistral3", "mixtral", "ministral", "ministral3", "pixtral", "voxtral"]
  403. and class_name == "MistralCommonBackend"
  404. ):
  405. module = importlib.import_module(".tokenization_mistral_common", "transformers")
  406. else:
  407. module = importlib.import_module(f".{module_name}", "transformers.models")
  408. try:
  409. result = getattr(module, class_name)
  410. # BC v5: expose XxxFast alias and tokenization_*_fast submodule for pre-v5 remote code.
  411. if (submod := getattr(result, "__module__", None)) and submod in sys.modules:
  412. base_mod = sys.modules[submod]
  413. setattr(base_mod, result.__name__ + "Fast", result)
  414. sys.modules.setdefault(submod + "_fast", base_mod)
  415. return result
  416. except AttributeError:
  417. continue
  418. for tokenizer in TOKENIZER_MAPPING._extra_content.values():
  419. if getattr(tokenizer, "__name__", None) == class_name:
  420. return tokenizer
  421. # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
  422. # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
  423. # init and we return the proper dummy to get an appropriate error message.
  424. main_module = importlib.import_module("transformers")
  425. if hasattr(main_module, class_name):
  426. return getattr(main_module, class_name)
  427. # BC v5: If a XxxFast class is not found, retry without 'Fast' for tokenizers saved pre-v5.
  428. if class_name.endswith("Fast"):
  429. return tokenizer_class_from_name(class_name[:-4])
  430. return None
  431. def get_tokenizer_config(
  432. pretrained_model_name_or_path: str | os.PathLike[str],
  433. cache_dir: str | os.PathLike[str] | None = None,
  434. force_download: bool = False,
  435. proxies: dict[str, str] | None = None,
  436. token: bool | str | None = None,
  437. revision: str | None = None,
  438. local_files_only: bool = False,
  439. subfolder: str = "",
  440. **kwargs,
  441. ) -> dict[str, Any]:
  442. """
  443. Loads the tokenizer configuration from a pretrained model tokenizer configuration.
  444. Args:
  445. pretrained_model_name_or_path (`str` or `os.PathLike`):
  446. This can be either:
  447. - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
  448. huggingface.co.
  449. - a path to a *directory* containing a configuration file saved using the
  450. [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
  451. cache_dir (`str` or `os.PathLike`, *optional*):
  452. Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
  453. cache should not be used.
  454. force_download (`bool`, *optional*, defaults to `False`):
  455. Whether or not to force to (re-)download the configuration files and override the cached versions if they
  456. exist.
  457. proxies (`dict[str, str]`, *optional*):
  458. A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
  459. 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
  460. token (`str` or *bool*, *optional*):
  461. The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
  462. when running `hf auth login` (stored in `~/.huggingface`).
  463. revision (`str`, *optional*, defaults to `"main"`):
  464. The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
  465. git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
  466. identifier allowed by git.
  467. local_files_only (`bool`, *optional*, defaults to `False`):
  468. If `True`, will only try to load the tokenizer configuration from local files.
  469. subfolder (`str`, *optional*, defaults to `""`):
  470. In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
  471. specify the folder name here.
  472. <Tip>
  473. Passing `token=True` is required when you want to use a private model.
  474. </Tip>
  475. Returns:
  476. `dict`: The configuration of the tokenizer.
  477. Examples:
  478. ```python
  479. # Download configuration from huggingface.co and cache.
  480. tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
  481. # This model does not have a tokenizer config so the result will be an empty dict.
  482. tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
  483. # Save a pretrained tokenizer locally and you can reload its config
  484. from transformers import AutoTokenizer
  485. tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
  486. tokenizer.save_pretrained("tokenizer-test")
  487. tokenizer_config = get_tokenizer_config("tokenizer-test")
  488. ```"""
  489. commit_hash = kwargs.get("_commit_hash")
  490. resolved_config_file = cached_file(
  491. pretrained_model_name_or_path,
  492. TOKENIZER_CONFIG_FILE,
  493. cache_dir=cache_dir,
  494. force_download=force_download,
  495. proxies=proxies,
  496. token=token,
  497. revision=revision,
  498. local_files_only=local_files_only,
  499. subfolder=subfolder,
  500. _raise_exceptions_for_gated_repo=False,
  501. _raise_exceptions_for_missing_entries=False,
  502. _raise_exceptions_for_connection_errors=False,
  503. _commit_hash=commit_hash,
  504. )
  505. if resolved_config_file is None:
  506. logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
  507. return {}
  508. commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
  509. with open(resolved_config_file, encoding="utf-8") as reader:
  510. result = json.load(reader)
  511. result["_commit_hash"] = commit_hash
  512. return result
  513. class AutoTokenizer:
  514. r"""
  515. This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
  516. created with the [`AutoTokenizer.from_pretrained`] class method.
  517. This class cannot be instantiated directly using `__init__()` (throws an error).
  518. """
  519. def __init__(self):
  520. raise OSError(
  521. "AutoTokenizer is designed to be instantiated "
  522. "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
  523. )
  524. @classmethod
  525. @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
  526. def from_pretrained(
  527. cls, pretrained_model_name_or_path, *inputs, **kwargs
  528. ) -> TokenizersBackend | SentencePieceBackend:
  529. r"""
  530. Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
  531. The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
  532. passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
  533. falling back to using pattern matching on `pretrained_model_name_or_path`:
  534. List options
  535. Params:
  536. pretrained_model_name_or_path (`str` or `os.PathLike`):
  537. Can be either:
  538. - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
  539. - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
  540. using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
  541. - a path to a single saved vocabulary file if and only if the tokenizer only requires a
  542. single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
  543. applicable to all derived classes)
  544. inputs (additional positional arguments, *optional*):
  545. Will be passed along to the Tokenizer `__init__()` method.
  546. config ([`PreTrainedConfig`], *optional*)
  547. The configuration object used to determine the tokenizer class to instantiate.
  548. cache_dir (`str` or `os.PathLike`, *optional*):
  549. Path to a directory in which a downloaded pretrained model configuration should be cached if the
  550. standard cache should not be used.
  551. force_download (`bool`, *optional*, defaults to `False`):
  552. Whether or not to force the (re-)download the model weights and configuration files and override the
  553. cached versions if they exist.
  554. proxies (`dict[str, str]`, *optional*):
  555. A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
  556. 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
  557. revision (`str`, *optional*, defaults to `"main"`):
  558. The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
  559. git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
  560. identifier allowed by git.
  561. subfolder (`str`, *optional*):
  562. In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
  563. facebook/rag-token-base), specify it here.
  564. tokenizer_type (`str`, *optional*):
  565. Tokenizer type to be loaded.
  566. backend (`str`, *optional*, defaults to `"tokenizers"`):
  567. Backend to use for tokenization. Valid options are:
  568. - `"tokenizers"`: Use the HuggingFace tokenizers library backend (default)
  569. - `"sentencepiece"`: Use the SentencePiece backend
  570. trust_remote_code (`bool`, *optional*, defaults to `False`):
  571. Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
  572. should only be set to `True` for repositories you trust and in which you have read the code, as it will
  573. execute code present on the Hub on your local machine.
  574. kwargs (additional keyword arguments, *optional*):
  575. Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
  576. `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
  577. `additional_special_tokens`. See parameters in the `__init__()` for more details.
  578. Examples:
  579. ```python
  580. >>> from transformers import AutoTokenizer
  581. >>> # Download vocabulary from huggingface.co and cache.
  582. >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
  583. >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
  584. >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
  585. >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
  586. >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
  587. >>> # Download vocabulary from huggingface.co and define model-specific arguments
  588. >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
  589. >>> # Explicitly use the tokenizers backend
  590. >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="tokenizers")
  591. >>> # Explicitly use the sentencepiece backend
  592. >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="sentencepiece")
  593. ```"""
  594. config = kwargs.pop("config", None)
  595. kwargs["_from_auto"] = True
  596. # V5: Always use fast tokenizers, ignore use_fast parameter
  597. _ = kwargs.pop("use_fast", None)
  598. tokenizer_type = kwargs.pop("tokenizer_type", None)
  599. trust_remote_code = kwargs.pop("trust_remote_code", None)
  600. gguf_file = kwargs.get("gguf_file")
  601. # First, let's see whether the tokenizer_type is passed so that we can leverage it
  602. if tokenizer_type is not None:
  603. tokenizer_class_name = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
  604. if tokenizer_class_name is None:
  605. raise ValueError(
  606. f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
  607. f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
  608. )
  609. tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
  610. if tokenizer_class is None:
  611. raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
  612. return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  613. if gguf_file:
  614. gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
  615. config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
  616. config = AutoConfig.for_model(**config_dict)
  617. elif config is None:
  618. try:
  619. config = AutoConfig.from_pretrained(
  620. pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
  621. )
  622. except (ValueError, OSError):
  623. config = PreTrainedConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
  624. config_model_type = config.model_type
  625. # Next, let's try to use the tokenizer_config file to get the tokenizer class.
  626. tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
  627. tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
  628. # Check for auto_map early to handle dynamic tokenizers properly
  629. tokenizer_auto_map = None
  630. if "auto_map" in tokenizer_config:
  631. if isinstance(tokenizer_config["auto_map"], (tuple, list)):
  632. # Legacy format for dynamic tokenizers
  633. tokenizer_auto_map = tokenizer_config["auto_map"]
  634. else:
  635. tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
  636. # if there is a config, we can check that the tokenizer class != than model class and can thus assume we need to use TokenizersBackend
  637. # Skip this early exit if auto_map is present (custom tokenizer with trust_remote_code)
  638. if (
  639. tokenizer_auto_map is None
  640. and tokenizer_config_class is not None
  641. and config_model_type is not None
  642. and config_model_type != ""
  643. and TOKENIZER_MAPPING_NAMES.get(config_model_type) is not None
  644. and (TOKENIZER_MAPPING_NAMES.get(config_model_type).removesuffix("Fast"))
  645. != (tokenizer_config_class.removesuffix("Fast"))
  646. ):
  647. # new model, but we ignore it unless the model type is the same
  648. if TokenizersBackend is not None:
  649. try:
  650. return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  651. except Exception as e:
  652. logger.debug(f"Failed to use TokenizersBackend: {e}")
  653. return tokenizer_class_from_name(tokenizer_config_class).from_pretrained(
  654. pretrained_model_name_or_path, *inputs, **kwargs
  655. )
  656. if "_commit_hash" in tokenizer_config:
  657. kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
  658. if tokenizer_config_class and tokenizer_config_class.endswith("Fast"):
  659. tokenizer_config_class = tokenizer_config_class[:-4]
  660. has_remote_code = tokenizer_auto_map is not None
  661. has_local_code = type(config) in TOKENIZER_MAPPING or (
  662. tokenizer_config_class is not None
  663. and (
  664. tokenizer_class_from_name(tokenizer_config_class) is not None
  665. or tokenizer_class_from_name(tokenizer_config_class + "Fast") is not None
  666. )
  667. )
  668. explicit_local_code = (
  669. has_local_code
  670. and type(config) not in TOKENIZER_MAPPING
  671. and (
  672. tokenizer_config_class is not None
  673. and not (
  674. tokenizer_class_from_name(tokenizer_config_class)
  675. or tokenizer_class_from_name(tokenizer_config_class + "Fast")
  676. ).__module__.startswith("transformers.")
  677. )
  678. )
  679. # V5: Skip remote tokenizer for custom models with incorrect hub tokenizer class
  680. if has_remote_code and config_model_type in MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS:
  681. has_remote_code = False
  682. tokenizer_auto_map = None
  683. if has_remote_code:
  684. # V5: Always prefer fast tokenizer (index 1), fallback to slow (index 0)
  685. if tokenizer_auto_map[1] is not None:
  686. class_ref = tokenizer_auto_map[1]
  687. else:
  688. class_ref = tokenizer_auto_map[0]
  689. if "--" in class_ref:
  690. upstream_repo = class_ref.split("--")[0]
  691. else:
  692. upstream_repo = None
  693. trust_remote_code = resolve_trust_remote_code(
  694. trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
  695. )
  696. if has_remote_code and trust_remote_code and not explicit_local_code:
  697. # BC v5: register *Fast aliases before remote code loads.
  698. if tokenizer_config_class:
  699. tokenizer_class_from_name(tokenizer_config_class.removesuffix("Fast"))
  700. tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
  701. _ = kwargs.pop("code_revision", None)
  702. tokenizer_class.register_for_auto_class()
  703. return tokenizer_class.from_pretrained(
  704. pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
  705. )
  706. elif tokenizer_config_class is not None:
  707. tokenizer_class_candidate = tokenizer_config_class
  708. tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
  709. if tokenizer_class is None and not tokenizer_class_candidate.endswith("Fast"):
  710. tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate + "Fast")
  711. if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
  712. tokenizer_class = TokenizersBackend
  713. # Fallback to TokenizersBackend if the class wasn't found
  714. if tokenizer_class is None:
  715. tokenizer_class = TokenizersBackend
  716. return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  717. elif getattr(config, "tokenizer_class", None):
  718. _class = config.tokenizer_class
  719. if "PreTrainedTokenizerFast" not in _class and _class.endswith("Fast"):
  720. _class = _class[:-4]
  721. tokenizer_class = tokenizer_class_from_name(_class)
  722. return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  723. # Otherwise we have to be creative.
  724. # if model is an encoder decoder, the encoder tokenizer class is used by default
  725. if isinstance(config, EncoderDecoderConfig):
  726. if type(config.decoder) is not type(config.encoder):
  727. logger.warning(
  728. f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
  729. f"config class: {config.decoder.__class__}. It is not recommended to use the "
  730. "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
  731. "specific tokenizer classes."
  732. )
  733. config = config.encoder
  734. model_type = config_class_to_model_type(type(config).__name__) or getattr(config, "model_type", None)
  735. if model_type is not None:
  736. tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend)
  737. if tokenizer_class is not None:
  738. return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  739. # Fallback: try tokenizer_class from tokenizer_config.json
  740. tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
  741. if tokenizer_config_class is not None:
  742. if tokenizer_config_class != "TokenizersBackend" and tokenizer_config_class.endswith("Fast"):
  743. tokenizer_config_class = tokenizer_config_class[:-4]
  744. tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
  745. if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
  746. tokenizer_class = tokenizer_class_from_name(tokenizer_config_class + "Fast")
  747. if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
  748. tokenizer_class = TokenizersBackend
  749. if tokenizer_class is None:
  750. tokenizer_class = TokenizersBackend
  751. return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  752. raise ValueError(
  753. f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
  754. f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING)}."
  755. )
  756. @staticmethod
  757. def register(
  758. config_class, tokenizer_class=None, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False
  759. ):
  760. """
  761. Register a new tokenizer in this mapping.
  762. Args:
  763. config_class ([`PreTrainedConfig`]):
  764. The configuration corresponding to the model to register.
  765. tokenizer_class: The tokenizer class to register (V5 - preferred parameter).
  766. slow_tokenizer_class: (Deprecated) The slow tokenizer to register.
  767. fast_tokenizer_class: (Deprecated) The fast tokenizer to register.
  768. """
  769. if tokenizer_class is None:
  770. # Legacy: prefer fast over slow
  771. if fast_tokenizer_class is not None:
  772. tokenizer_class = fast_tokenizer_class
  773. elif slow_tokenizer_class is not None:
  774. tokenizer_class = slow_tokenizer_class
  775. else:
  776. raise ValueError("You need to pass a `tokenizer_class`")
  777. for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class):
  778. if candidate is not None:
  779. REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate
  780. if slow_tokenizer_class is not None and fast_tokenizer_class is not None:
  781. REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class
  782. TOKENIZER_MAPPING.register(config_class, tokenizer_class, exist_ok=exist_ok)
# Names exported by `from <this module> import *`: the tokenizer mapping and the auto class.
__all__ = ["TOKENIZER_MAPPING", "AutoTokenizer"]