| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388 |
- # Copyright 2020 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
- see tokenization_utils.py
- """
- import copy
- import json
- import os
- from collections import defaultdict
- from collections.abc import Iterable
- from shutil import copyfile
- from typing import Any
- import tokenizers.pre_tokenizers as pre_tokenizers_fast
- from huggingface_hub import is_offline_mode
- from tokenizers import AddedToken, processors
- from tokenizers import Encoding as EncodingFast
- from tokenizers import Tokenizer as TokenizerFast
- from tokenizers.decoders import Decoder as DecoderFast
- from tokenizers.models import BPE, Unigram
- from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
- from transformers.utils.hub import cached_file
- from .convert_slow_tokenizer import SpmConverter
- from .integrations.ggml import convert_gguf_tokenizer
- from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
- from .tokenization_utils_base import (
- INIT_TOKENIZER_DOCSTRING,
- BatchEncoding,
- PreTokenizedInput,
- PreTrainedTokenizerBase,
- TextInput,
- TruncationStrategy,
- generate_merges,
- )
- from .utils import PaddingStrategy, add_end_docstrings, logging
# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers have an additional added-tokens file
ADDED_TOKENS_FILE = "added_tokens.json"
- INIT_TOKENIZER_DOCSTRING += """
- tokenizer_object ([`tokenizers.Tokenizer`]):
- A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
- tokenizers](../fast_tokenizers) for more information.
- tokenizer_file ([`str`]):
- A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
- tokenizers.
- """
# Trainer class from `tokenizers.trainers` for each model type name.
MODEL_TO_TRAINER_MAPPING = dict(
    BPE=BpeTrainer,
    Unigram=UnigramTrainer,
    WordLevel=WordLevelTrainer,
    WordPiece=WordPieceTrainer,
)

# File name used for each vocabulary-file keyword of this backend.
VOCAB_FILES_NAMES = dict(tokenizer_file=TOKENIZER_FILE, vocab_file=TIKTOKEN_VOCAB_FILE)
- @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
- class TokenizersBackend(PreTrainedTokenizerBase):
- """
- Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
- Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
- Handles all the shared methods for tokenization and special tokens, as well as methods for
- downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
- This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
- specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
- """
# File name used for each vocabulary-file keyword ("tokenizer_file", "vocab_file").
vocab_files_names = VOCAB_FILES_NAMES
# Model class of the backend; `None` here. `convert_to_native_format` reads `cls.model.__name__`
# ("Unigram", "WordLevel", "BPE", "WordPiece") to decide how to shape the extracted vocab/merges.
model = None
# Backend tokenizer object; `None` until `__init__` assigns one.
_tokenizer = None
@classmethod
def convert_to_native_format(cls, trust_remote_code=False, **kwargs):
    """
    Collect the keyword arguments needed to build a `tokenizers.Tokenizer` backend from the available
    serialization files (tokenizer.json, sentencepiece models, tekken.json, vocab/merges).

    The sources are tried in this order, and the first one that applies returns:

    1. `tokenizer_file` (tokenizer.json): either loaded whole into `tokenizer_object`, or parsed so that
       `vocab`, `merges`, `post_processor`, padding and truncation can be passed on individually.
    2. `vocab_file` ending in `tekken.json`: vocab and merges extracted with `MistralConverter`.
    3. `vocab_file` ending in `.model`: SentencePiece extraction, with a TikToken fallback on any failure.
    4. Plain `vocab_file` / `merges_file` paths, with merges generated for BPE classes when missing.

    Args:
        trust_remote_code (`bool`, *optional*, defaults to `False`):
            When `True`, an existing `tokenizer_file` is loaded directly into `tokenizer_object` even if `cls`
            defines its own `__init__`.
        **kwargs:
            Tokenizer init arguments. They are copied first; the caller's mapping is never mutated.

    Returns:
        `dict`: A copy of `kwargs` without `tokenizer_file`, extended with whatever could be extracted
        (`tokenizer_object`, `vocab`, `merges`, `post_processor`, ...).
    """
    # Preserve kwargs for possible downstream use
    local_kwargs = dict(kwargs)
    fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)

    # Case 1a: the class has no custom `__init__` (or remote code is trusted), so the serialized
    # tokenizer can be used as-is.
    if (
        fast_tokenizer_file is not None
        and os.path.isfile(fast_tokenizer_file)
        and (cls is TokenizersBackend or "__init__" not in cls.__dict__ or trust_remote_code)
    ):
        local_kwargs["tokenizer_object"] = TokenizerFast.from_file(fast_tokenizer_file)
        return local_kwargs
    # Case 1b: the class rebuilds its backend in a custom `__init__`, so hand it the pieces instead.
    elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
        # we extract vocab/merges and pass decoder/pre_tokenizer/post_processor
        # from the file so the reconstructed tokenizer matches the tokenizer.json
        with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_handle:
            tokenizer_json = json.load(tokenizer_handle)

        # Build a minimal tokenizer (empty vocab/merges) to cheaply extract post_processor,
        # padding and truncation as Rust objects — avoids parsing the full vocab via from_file.
        # This optimization applies to BPE, WordPiece, and WordLevel only:
        # - Unigram (SentencePiece) requires a non-empty vocab to initialize correctly in Rust
        #   (e.g. AlbertTokenizer, CamembertTokenizer, LlamaTokenizer, T5Tokenizer); passing an
        #   empty vocab causes "Unable to load vocab EmptyVocabulary". TODO: investigate if keeping
        #   just the UNK token is sufficient to make Unigram work with a minimal vocab.
        # - Older tokenizer.json formats (e.g. XLNetTokenizer, DistilBertTokenizer) omit the
        #   "type" field in the "model" section, so we cannot determine the model type from JSON.
        # In both cases we fall back to the original from_file path (no performance improvement).
        model_type = tokenizer_json.get("model", {}).get("type")
        if model_type not in (None, "Unigram"):
            # Shallow copies: only the top-level dict and the "model" dict are modified.
            minimal_tokenizer_json = dict(tokenizer_json)
            minimal_model = dict(tokenizer_json["model"])
            minimal_model["vocab"] = {}
            if model_type == "BPE":
                minimal_model["merges"] = []
            minimal_tokenizer_json["model"] = minimal_model
            minimal_tokenizer_json["added_tokens"] = []
            tok_from_file = TokenizerFast.from_str(json.dumps(minimal_tokenizer_json))
        else:
            tok_from_file = TokenizerFast.from_file(fast_tokenizer_file)

        local_kwargs["post_processor"] = tok_from_file.post_processor
        local_kwargs["tokenizer_padding"] = tok_from_file.padding
        local_kwargs["tokenizer_truncation"] = tok_from_file.truncation
        # Preserve truncation and padding baked into tokenizer.json so that classes
        # with a custom __init__ that rebuild the backend tokenizer from scratch
        # can still access these settings.
        if tok_from_file.truncation is not None:
            local_kwargs["_json_truncation"] = tok_from_file.truncation
        if tok_from_file.padding is not None:
            local_kwargs["_json_padding"] = tok_from_file.padding

        # Extract precompiled SentencePiece charsmap from tokenizer.json normalizer
        # when present (e.g. T5 tokenizers converted with SentencePiece >= 2.x).
        normalizer_config = tokenizer_json.get("normalizer")
        if normalizer_config:
            # Normalize the three accepted shapes (Sequence, single normalizer, list) to a list.
            if normalizer_config.get("type", None) == "Sequence":
                normalizer_config = normalizer_config["normalizers"]
            elif not isinstance(normalizer_config, list):
                normalizer_config = [normalizer_config]
            for normalizer in normalizer_config:
                if normalizer.get("type") == "Precompiled" and "precompiled_charsmap" in normalizer:
                    import base64

                    # The charsmap is stored base64-encoded in the JSON; only the first match is used.
                    local_kwargs["_spm_precompiled_charsmap"] = base64.b64decode(
                        normalizer["precompiled_charsmap"]
                    )
                    break

        # Reshape the JSON vocab according to the model class declared on `cls`.
        vocab = tokenizer_json.get("model", {}).get("vocab", None)
        if cls.model is None:
            if isinstance(vocab, list):
                vocab = list(map(tuple, vocab))  # TODO just for now
        elif cls.model.__name__ == "Unigram":
            if isinstance(vocab, list) and vocab and isinstance(vocab[0], (list, tuple)):
                vocab = [tuple(item) for item in vocab]
        elif cls.model.__name__ == "WordLevel":
            # Ids are assigned by iteration order of `vocab`.
            vocab = {token: i for i, token in enumerate(vocab)}
        elif cls.model.__name__ == "BPE" or cls.model.__name__ == "WordPiece":
            if isinstance(vocab, list):
                vocab = {token[0] if isinstance(token, list) else token: i for i, token in enumerate(vocab)}
        local_kwargs["vocab"] = vocab

        # NOTE: `model_type` is reused here for the model *class* on `cls`, not the JSON type string above.
        model_type = getattr(cls, "model", None)
        if "merges" in tokenizer_json.get("model", {}) and (model_type and model_type.__name__ == "BPE"):
            merges = tokenizer_json["model"]["merges"]
            # Merges may be serialized as "a b" strings or as [a, b] pairs; normalize to tuples.
            merges = [tuple(merge.split(" ")) if isinstance(merge, str) else tuple(merge) for merge in merges]
            local_kwargs["merges"] = merges
        return local_kwargs

    vocab_file = local_kwargs.get("vocab_file")
    merges_file = local_kwargs.get("merges_file")
    vocab = local_kwargs.get("vocab")
    merges = local_kwargs.get("merges")

    # Tekken converter (Mistral)
    if isinstance(vocab_file, str) and vocab_file.endswith("tekken.json") and os.path.isfile(vocab_file):
        from .convert_slow_tokenizer import MistralConverter

        local_kwargs["vocab"], local_kwargs["merges"] = MistralConverter(
            vocab_file=vocab_file
        ).extract_vocab_merges_from_model(vocab_file)
        return local_kwargs

    # SentencePiece model (with TikToken fallback)
    if isinstance(vocab_file, str) and os.path.isfile(vocab_file) and vocab_file.endswith(".model"):
        try:
            from .convert_slow_tokenizer import SentencePieceExtractor

            # 1. Extract vocab, merges, and spm_precompiled from the .model proto
            extractor = SentencePieceExtractor(vocab_file)
            local_kwargs = extractor.extract(cls.model, **local_kwargs)

            # 2. If a model-specific converter exists, use it.
            try:
                from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS

                converter_class = SLOW_TO_FAST_CONVERTERS.get(cls.__name__)
                if converter_class is not None and hasattr(converter_class, "convert_from_spm"):
                    local_kwargs = converter_class.convert_from_spm(**local_kwargs)
            except Exception as e:
                # Best effort: a failing converter only costs the reordering, not the extraction.
                logger.warning(
                    f"Could not reorder vocab using converter for {cls.__name__} due to {e}. Falling back to raw SentencePiece extraction."
                )

            if hasattr(cls, "convert_from_spm_model"):
                local_kwargs = cls.convert_from_spm_model(**local_kwargs)

            # 3. For non-model specific tokenizers (e.g. TokenizersBackend used
            #    for MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS), build a _tokenizer
            #    from the proto so normalizer/decoder are configured correctly.
            if "tokenizer_object" not in local_kwargs and (
                cls is TokenizersBackend or "__init__" not in cls.__dict__
            ):
                vocab = local_kwargs.pop("vocab", None)
                merges = local_kwargs.pop("merges", None)

                # Replace placeholder tokens as specified in added_tokens_decoder
                added_tokens_decoder = local_kwargs.get("added_tokens_decoder") or {}
                if vocab is not None and added_tokens_decoder:
                    id_to_token = {token_id: token for token, token_id in vocab.items()}
                    for token_id, new_token in added_tokens_decoder.items():
                        token_id = int(token_id)
                        new_token = str(new_token)
                        current_token = id_to_token.get(token_id)
                        # Rename in place: the id stays, only the token string changes.
                        if current_token and current_token != new_token and new_token not in vocab:
                            vocab[new_token] = vocab.pop(current_token)
                            id_to_token[token_id] = new_token

                tokenizer_object = SpmConverter.build_tokenizer_from_spm_proto(
                    proto=extractor.proto,
                    vocab=vocab,
                    merges=merges,
                )
                if tokenizer_object is not None:
                    local_kwargs["tokenizer_object"] = tokenizer_object

                # NOTE(review): indentation was lost in this copy of the file; the nesting of the
                # proto_spec block below is reconstructed from its comment — confirm against upstream.
                # Set bos/eos tokens from proto spec if available. This is needed when
                # building a tokenizer_object directly from a .model file because the
                # tokenizer_object does not have bos/eos set.
                proto_spec = extractor.proto.trainer_spec
                if proto_spec.bos_id >= 0:
                    local_kwargs.setdefault("bos_token", proto_spec.bos_piece or "<s>")
                if proto_spec.eos_id >= 0:
                    local_kwargs.setdefault("eos_token", proto_spec.eos_piece or "</s>")
                if proto_spec.unk_id >= 0:
                    local_kwargs.setdefault("unk_token", proto_spec.unk_piece or "<unk>")
        except Exception as e:  # TODO only catch deserialization error here!
            logger.warning(
                f"Could not extract SentencePiece model from {vocab_file} using sentencepiece library due to {e}. "
                "Falling back to TikToken extractor."
            )
            from .convert_slow_tokenizer import TikTokenConverter

            converter = TikTokenConverter(
                vocab_file=vocab_file, extra_special_tokens=local_kwargs.get("extra_special_tokens")
            )
            local_kwargs["tokenizer_object"] = converter.converted()
        return local_kwargs

    # Fallback to standard vocab/merges files if they existed!
    # Here the *paths* are passed on as `vocab` / `merges`; nothing is parsed in this method.
    if vocab is None and isinstance(vocab_file, str) and os.path.isfile(vocab_file):
        local_kwargs["vocab"] = vocab_file
        vocab = local_kwargs["vocab"]
    if merges is None and isinstance(merges_file, str) and os.path.isfile(merges_file):
        local_kwargs["merges"] = merges_file
        merges = local_kwargs["merges"]

    # Generate merges automatically when not provided for BPE tokenizers
    if merges is None and cls.model is not None and cls.model.__name__ == "BPE" and isinstance(vocab, dict):
        # Gather special tokens from kwargs to skip in merge generation
        def _iter_special_tokens(values: Iterable[Any]) -> list[str]:
            # Flatten arbitrarily nested lists/tuples into a flat list of strings, dropping `None`.
            collected: list[str] = []
            for val in values:
                if val is None:
                    continue
                if isinstance(val, (list, tuple)):
                    collected.extend(_iter_special_tokens(val))
                else:
                    collected.append(str(val))
            return collected

        special_tokens_keys = [
            "pad_token",
            "unk_token",
            "bos_token",
            "eos_token",
            "sep_token",
            "cls_token",
            "mask_token",
            "additional_special_tokens",
            "extra_special_tokens",
        ]
        skip_tokens: set[str] = set()
        for key in special_tokens_keys:
            if key in local_kwargs:
                skip_tokens.update(_iter_special_tokens([local_kwargs[key]]))

        merges = generate_merges(vocab, skip_tokens=skip_tokens)
        local_kwargs["merges"] = merges
    return local_kwargs
def __init__(self, *args, **kwargs):
    """
    Set up the backend tokenizer, then initialise the base class and register missing tokens.

    The backend is taken from the first available source: `tokenizer_object` (deep-copied),
    `tokenizer_file`, `gguf_file`, or `vocab`/`merges`. If none applies, a `_tokenizer` already
    present on the instance/class is kept.

    Args:
        *args: Accepted for signature compatibility; not used here.
        **kwargs: Tokenizer init arguments. Consumed here (not forwarded to the base class):
            `_json_truncation`, `_json_padding`, `_spm_precompiled_charsmap`, `tokenizer_object`,
            `gguf_file`, `tokenizer_file`, `tokenizer_truncation`, `tokenizer_padding`, `post_processor`.
            Everything else is forwarded to `super().__init__`.

    Raises:
        ValueError: If no backend tokenizer could be built and none was already set.
    """
    # Truncation/padding dicts extracted from tokenizer.json by convert_to_native_format
    # when a class with a custom __init__ rebuilds the backend tokenizer from scratch.
    _json_truncation = kwargs.pop("_json_truncation", None)
    _json_padding = kwargs.pop("_json_padding", None)
    # Precompiled SentencePiece charsmap is already used by model-specific tokenizers
    # (before calling super().__init__) and should not be stored in `init_kwargs` to keep the tokenizer serializable.
    kwargs.pop("_spm_precompiled_charsmap", None)

    tokenizer_object = kwargs.pop("tokenizer_object", None)
    gguf_file = kwargs.pop("gguf_file", None)
    fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
    # Note: added_tokens_decoder is NOT popped - it's passed to super().__init__() for processing
    added_tokens_decoder = kwargs.get("added_tokens_decoder", {})
    # Store add_prefix_space before super().__init__() to ensure it's not overridden
    add_prefix_space = kwargs.get("add_prefix_space", False)
    vocab_file = kwargs.get("vocab_file")

    vocab = kwargs.get("vocab")
    merges = kwargs.get("merges")

    fast_tokenizer = None
    if tokenizer_object is not None:
        # Deep copy so later mutations (padding, truncation, added tokens) don't leak to the caller's object.
        fast_tokenizer = copy.deepcopy(tokenizer_object)
    elif fast_tokenizer_file is not None and os.path.isfile(fast_tokenizer_file):
        # We have a serialization from tokenizers which let us directly build the backend
        fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
    elif gguf_file is not None:
        # We need to convert a slow tokenizer to build the backend
        gguf_path = cached_file(kwargs.get("name_or_path", ""), gguf_file, **kwargs)
        gguf_param = load_gguf_checkpoint(gguf_path)
        architecture = gguf_param["config"]["model_type"]
        tokenizer_dict = gguf_param["tokenizer"]
        tokenizer_config = gguf_param["tokenizer_config"]
        fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
        kwargs.update(tokenizer_config)
        if len(additional_kwargs) > 0:
            kwargs.update(additional_kwargs)
    elif self._tokenizer is None and vocab is not None:
        # Build from vocab/merges extracted by convert_to_native_format
        if merges is not None:
            vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
            fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
        elif isinstance(vocab, dict):
            fast_tokenizer = TokenizerFast(BPE(vocab=vocab, merges=[], fuse_unk=True, dropout=None))
        elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
            fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
    # NOTE(review): indentation was lost in this copy of the file; this branch is placed on the outer
    # chain (no source at all and no pre-set backend) — confirm against upstream.
    elif self._tokenizer is None:
        raise ValueError(
            "Couldn't instantiate the backend tokenizer from one of: \n"
            "(1) a `tokenizers` library serialization file, \n"
            "(2) a slow tokenizer instance to convert or \n"
            "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
            "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
        )

    # Only set defaults when creating TokenizersBackend from scratch
    if fast_tokenizer_file is None and tokenizer_object is None and self._tokenizer is None:
        kwargs.setdefault("bos_token", "<s>")
        kwargs.setdefault("eos_token", "</s>")

    if fast_tokenizer is not None:
        self._tokenizer = fast_tokenizer

    if self._tokenizer is None:
        raise ValueError("The backend tokenizer is not correctly initialized.")

    # Truncation settings, by priority: explicit kwarg, backend state, values read from tokenizer.json.
    _truncation = kwargs.pop("tokenizer_truncation", None) or self._tokenizer.truncation or _json_truncation
    if _truncation is not None:
        self._tokenizer.enable_truncation(**_truncation)
        kwargs.setdefault("max_length", _truncation["max_length"])
        kwargs.setdefault("truncation_side", _truncation["direction"])
        kwargs.setdefault("stride", _truncation["stride"])
        kwargs.setdefault("truncation_strategy", _truncation["strategy"])
    else:
        self._tokenizer.no_truncation()

    # Padding settings follow the same priority order as truncation.
    _padding = kwargs.pop("tokenizer_padding", None) or self._tokenizer.padding or _json_padding
    if _padding is not None:
        self._tokenizer.enable_padding(**_padding)
        kwargs.setdefault("pad_token", _padding["pad_token"])
        kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
        kwargs.setdefault("padding_side", _padding["direction"])
        kwargs.setdefault("max_length", _padding["length"])
        kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

    # Set backend to "tokenizers" if not already set
    if "backend" not in kwargs:
        kwargs["backend"] = "tokenizers"
    explicit_bos_eos_in_kwargs = "add_bos_token" in kwargs or "add_eos_token" in kwargs
    self._add_bos_token = kwargs.get("add_bos_token", False)
    self._add_eos_token = kwargs.get("add_eos_token", False)
    if post_processor := kwargs.pop("post_processor", None):  # most reliable way to get the post-processor
        self._tokenizer.post_processor = post_processor
    self._should_update_post_processor = explicit_bos_eos_in_kwargs or self._tokenizer.post_processor is None
    # We call this after having initialized the backend tokenizer because we update it.
    super().__init__(**kwargs)

    if vocab_file is not None:
        self.vocab_file = vocab_file
    # Ensure add_prefix_space is set correctly after parent init
    self.add_prefix_space = add_prefix_space
    self._tokenizer.encode_special_tokens = self.split_special_tokens

    # Fingerprint the tokens already known to the backend. `added_tokens_decoder` maps id -> AddedToken,
    # so the *values* must be hashed: iterating the dict itself yields integer ids, which can never
    # match the `repr(token)` fingerprints computed below.
    added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder.values()}
    tokens_to_add = [
        token
        for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
        if hash(repr(token)) not in added_tokens_decoder_hash
    ]
    encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
    # if some of the special tokens are not already in the tokenizer, add them
    # V5: Check both named special tokens and extra special tokens
    # Iterate over _special_tokens_map to preserve AddedToken properties (lstrip, rstrip, etc.)
    for special_token_value in self._special_tokens_map.values():
        if special_token_value is None:
            continue
        if str(special_token_value) not in encoder and special_token_value not in tokens_to_add:
            tokens_to_add.append(special_token_value)

    # Also check extra special tokens
    for token in self._extra_special_tokens:
        if str(token) not in encoder and token not in tokens_to_add:
            tokens_to_add.append(token)

    if len(tokens_to_add) > 0:
        tokens = []
        all_named_tokens = [str(t) for t in self._special_tokens_map.values() if t]
        for token in tokens_to_add:
            if isinstance(token, str):
                # Convert string to AddedToken, assuming it's special
                token = AddedToken(token, special=True)
            elif isinstance(token, AddedToken):
                # Ensure the special flag is set correctly for special tokens
                if not token.special and str(token) in all_named_tokens:
                    token.special = True
            tokens.append(token)
        if tokens:
            # These tokens are from the special tokens map
            self.add_tokens(tokens)

    try:
        vocab_size = self._tokenizer.get_vocab_size()
    except NotImplementedError:
        vocab_size = 0

    # Optionally patches mistral tokenizers with wrong regex
    if vocab_size > 100000 and getattr(self._tokenizer, "pre_tokenizer", None) is not None:
        kwargs.pop("tokenizer", None)
        self._tokenizer = self._patch_mistral_regex(
            self._tokenizer,
            self.init_kwargs.get("name_or_path", None),
            init_kwargs=self.init_kwargs,
            fix_mistral_regex=kwargs.pop("fix_mistral_regex", None),
            **kwargs,
        )

    # Re-check the post-processor: the steps above may have left the backend without one.
    self._should_update_post_processor = (
        self._should_update_post_processor or self._tokenizer.post_processor is None
    )
    if self._should_update_post_processor:
        self.update_post_processor()
- @property
- def is_fast(self) -> bool:
- return True
- @property
- def can_save_slow_tokenizer(self) -> bool:
- """
- `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
- can only be `True` if the original `"sentencepiece.model"` was not deleted.
- """
- if "vocab_file" in self.vocab_files_names and self.vocab_files_names["vocab_file"].endswith(".model"):
- if hasattr(self, "vocab_file") and self.vocab_file:
- # If the vocab file is a sentencepiece model, we can save it
- return os.path.isfile(self.vocab_file)
- return False
- else:
- return True
def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
    """
    Copy this tokenizer's `vocab_file` into `save_directory`.

    Args:
        save_directory (`str`):
            Existing directory that receives the file.
        filename_prefix (`str`, *optional*):
            When given, `"<prefix>-"` is prepended to the target file name.

    Returns:
        `tuple[str]`: One-element tuple with the target path. If `save_directory` is not a directory, an
        error is logged and `None` is returned instead.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    name_prefix = filename_prefix + "-" if filename_prefix else ""
    target_name = name_prefix + VOCAB_FILES_NAMES["vocab_file"]
    out_vocab_file = os.path.join(save_directory, target_name)

    # Skip the copy when source and target resolve to the same path.
    already_in_place = os.path.abspath(self.vocab_file) == os.path.abspath(out_vocab_file)
    if not already_in_place:
        copyfile(self.vocab_file, out_vocab_file)

    return (out_vocab_file,)
def update_post_processor(self):
    """
    Rebuild the backend post-processor as a `TemplateProcessing` from the current `bos_token` /
    `eos_token` and the `add_bos_token` / `add_eos_token` flags.

    A flag is switched off when its token is `None`.
    """
    bos = self.bos_token
    bos_token_id = self.bos_token_id
    if bos is None and self.add_bos_token:
        self.add_bos_token = False

    eos = self.eos_token
    eos_token_id = self.eos_token_id
    if eos is None and self.add_eos_token:
        self.add_eos_token = False

    # Template pieces for the first ($A) and second ($B) sequence, joined with spaces below.
    first_sequence = ["$A:0"]
    second_sequence = ["$B:1"]
    special_tokens = []

    if self.add_bos_token:
        first_sequence.insert(0, bos + ":0")
        second_sequence.insert(0, bos + ":1")
        special_tokens.append((bos, bos_token_id))
    if self.add_eos_token:
        first_sequence.append(eos + ":0")
        second_sequence.append(eos + ":1")
        special_tokens.append((eos, eos_token_id))

    single = " ".join(first_sequence)
    pair = " ".join(first_sequence + second_sequence)

    self._tokenizer.post_processor = processors.TemplateProcessing(
        single=single, pair=pair, special_tokens=special_tokens
    )
- @property
- def add_eos_token(self):
- return getattr(self, "_add_eos_token", False)
- @property
- def add_bos_token(self):
- return getattr(self, "_add_bos_token", False)
- @add_eos_token.setter
- def add_eos_token(self, value):
- object.__setattr__(self, "_add_eos_token", value)
- self.update_post_processor()
- @add_bos_token.setter
- def add_bos_token(self, value):
- object.__setattr__(self, "_add_bos_token", value)
- self.update_post_processor()
def _post_init(self):
    """
    Post-initialization hook: register every named and extra special token with the backend as a
    special token, then refresh the post-processor when required.

    `str` entries are wrapped as `AddedToken(..., special=True, normalized=False)`; `AddedToken` entries
    are passed through unchanged; anything else (including `None`) is ignored.

    Child classes that override this method should call `super()._post_init()`.
    """

    def _as_added_token(candidate):
        # Normalise one entry to an AddedToken, or None when it is neither an AddedToken nor a str.
        if isinstance(candidate, AddedToken):
            return candidate
        if isinstance(candidate, str):
            return AddedToken(candidate, special=True, normalized=False)
        return None

    # V5: named special tokens first, then extra special tokens
    sources = (self._special_tokens_map.values(), self._extra_special_tokens)
    tokens_to_add = [
        converted
        for source in sources
        for converted in map(_as_added_token, source)
        if converted is not None
    ]

    if tokens_to_add:
        # Ensure special tokens are added as such to the backend
        self.add_tokens(tokens_to_add, special_tokens=True)

    needs_refresh = getattr(self, "_should_update_post_processor", True) or self._tokenizer.post_processor is None
    if needs_refresh:
        self.update_post_processor()
- @property
- def vocab_size(self) -> int:
- """
- `int`: Size of the base vocabulary (without the added tokens).
- """
- return self._tokenizer.get_vocab_size(with_added_tokens=False)
- def get_vocab(self) -> dict[str, int]:
- return self._tokenizer.get_vocab(with_added_tokens=True)
- @property
- def vocab(self) -> dict[str, int]:
- return self.get_vocab()
- @property
- def added_tokens_encoder(self) -> dict[str, int]:
- """
- Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
- optimisation in `self._added_tokens_encoder` for the slow tokenizers.
- """
- return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
- @property
- def added_tokens_decoder(self) -> dict[int, AddedToken]:
- """
- Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
- Returns:
- `dict[str, int]`: The added tokens.
- """
- return self._tokenizer.get_added_tokens_decoder()
# BC v5: expose ``_added_tokens_encoder`` / ``_added_tokens_decoder`` attrs for custom tokenizers that expect
# them from slow tokenizers. Only supports read, not write (writes won't sync to the Rust backend; use
# add_tokens() instead).
_added_tokens_encoder = added_tokens_encoder
_added_tokens_decoder = added_tokens_decoder
- def get_added_vocab(self) -> dict[str, int]:
- """
- Returns the added tokens in the vocabulary as a dictionary of token to index.
- Returns:
- `dict[str, int]`: The added tokens.
- """
- return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
- def __bool__(self) -> bool:
- """
- Returns True, to avoid expensive `assert tokenizer` gotchas.
- """
- return True
- def __len__(self) -> int:
- """
- Size of the full vocabulary with the added tokens.
- """
- return self._tokenizer.get_vocab_size(with_added_tokens=True)
- @property
- def backend_tokenizer(self) -> TokenizerFast:
- """
- `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
- """
- return self._tokenizer
- @property
- def decoder(self) -> DecoderFast:
- """
- `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
- """
- return self._tokenizer.decoder
def _convert_encoding(
    self,
    encoding: EncodingFast,
    return_token_type_ids: bool | None = None,
    return_attention_mask: bool | None = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
) -> tuple[dict[str, Any], list[EncodingFast]]:
    """
    Turn one backend encoding into a dict of lists plus the list of encodings it was built from.

    When `return_overflowing_tokens` is set and the encoding carries overflowing encodings, each of them becomes an
    extra row, so every value of the returned dict has shape (overflows, sequence length).

    Args:
        encoding: The backend encoding to convert.
        return_token_type_ids: Whether to emit `token_type_ids`. When `None`, decided by whether
            `"token_type_ids"` is listed in `self.model_input_names`.
        return_attention_mask: Whether to emit `attention_mask`. When `None`, decided by whether
            `"attention_mask"` is listed in `self.model_input_names`.
        return_overflowing_tokens: Whether to include the overflowing encodings as additional rows.
        return_special_tokens_mask: Whether to emit `special_tokens_mask`.
        return_offsets_mapping: Whether to emit `offset_mapping`.
        return_length: Whether to emit `length` (the number of ids of each row).
        verbose: Accepted for signature compatibility; not used here.

    Returns:
        A `(encoding_dict, encodings)` tuple.
    """
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encodings = [encoding]
    if return_overflowing_tokens and encoding.overflowing is not None:
        encodings = encodings + encoding.overflowing

    # (output key, enabled, extractor), listed in the order the keys are inserted.
    fields = (
        ("input_ids", True, lambda enc: enc.ids),
        ("token_type_ids", return_token_type_ids, lambda enc: enc.type_ids),
        ("attention_mask", return_attention_mask, lambda enc: enc.attention_mask),
        ("special_tokens_mask", return_special_tokens_mask, lambda enc: enc.special_tokens_mask),
        ("offset_mapping", return_offsets_mapping, lambda enc: enc.offsets),
        ("length", return_length, lambda enc: len(enc.ids)),
    )

    encoding_dict = defaultdict(list)
    for row in encodings:
        for key, enabled, extract in fields:
            if enabled:
                encoding_dict[key].append(extract(row))

    return encoding_dict, encodings
def _convert_token_to_id_with_added_voc(self, token: str) -> int:
    """
    Look up the id of `token` in the backend tokenizer, falling back to `self.unk_token_id` when the backend
    returns `None`.
    """
    token_id = self._tokenizer.token_to_id(token)
    return self.unk_token_id if token_id is None else token_id
def _convert_id_to_token(self, index: int) -> str | None:
    """
    Return the backend tokenizer's token for `index`, which is coerced with `int()` before the lookup.
    """
    token_id = int(index)
    return self._tokenizer.id_to_token(token_id)
def _add_tokens(self, new_tokens: list[str | AddedToken], special_tokens=False) -> int:
    """
    Register `new_tokens` on the backend tokenizer and return what the backend call returns.

    `add_special_tokens` is used on the backend when `special_tokens` is truthy, `add_tokens` otherwise.
    """
    backend = self._tokenizer
    register = backend.add_special_tokens if special_tokens else backend.add_tokens
    return register(new_tokens)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """
    Number of special tokens added when encoding a sequence (or a sequence pair) with special tokens.

    The value is obtained from the backend tokenizer's `num_special_tokens_to_add`.

    Args:
        pair (`bool`, *optional*, defaults to `False`):
            Whether to compute the count for a sequence pair rather than a single sequence.

    Returns:
        `int`: Number of special tokens added to sequences.
    """
    backend = self._tokenizer
    return backend.num_special_tokens_to_add(pair)
def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
    """
    Map a token id, or a sequence of token ids, to the corresponding token(s) using the backend tokenizer.

    Args:
        ids (`int` or `list[int]`):
            The token id (or token ids) to convert. Elements of a sequence are coerced with `int()`.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            When converting a sequence, drop the ids found in `self.all_special_ids`. Ignored for a single `int`.

    Returns:
        `str` or `list[str]`: The token for a single id, or the list of tokens for a sequence.
    """
    backend = self._tokenizer
    if isinstance(ids, int):
        return backend.id_to_token(ids)

    # `all_special_ids` is read once up front rather than once per element.
    skipped = set(self.all_special_ids) if skip_special_tokens else set()
    return [backend.id_to_token(token_id) for token_id in map(int, ids) if token_id not in skipped]
def tokenize(self, text: str, pair: str | None = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
    """
    Encode `text` (and optionally `pair`) through `self._encode_plus` and return the tokens of the result.

    Extra keyword arguments are forwarded to `_encode_plus` unchanged.
    """
    encoded = self._encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs)
    return encoded.tokens()
def set_truncation_and_padding(
    self,
    padding_strategy: PaddingStrategy,
    truncation_strategy: TruncationStrategy,
    max_length: int,
    stride: int,
    pad_to_multiple_of: int | None,
    padding_side: str | None,
):
    """
    Configure truncation and padding on the backend tokenizer so that they match the requested strategies.

    The backend is only modified when its current configuration differs from the requested one: truncation or
    padding is disabled when the strategy asks for none and one is currently set, and (re-)enabled when the
    requested parameters differ from the current ones.

    Args:
        padding_strategy ([`~utils.PaddingStrategy`]):
            The kind of padding that will be applied to the input.
        truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
            The kind of truncation that will be applied to the input.
        max_length (`int`):
            The maximum size of a sequence. Used as the truncation length, and as the padding length when
            `padding_strategy` is `PaddingStrategy.MAX_LENGTH`.
        stride (`int`):
            The stride to use when handling overflow.
        pad_to_multiple_of (`int`, *optional*):
            Forwarded to the backend padding configuration as `pad_to_multiple_of`.
        padding_side (`str`, *optional*):
            The side on which padding is applied. Falls back to `self.padding_side` when `None`.
    """
    backend = self._tokenizer
    # Snapshot both settings before changing anything.
    active_truncation = backend.truncation
    active_padding = backend.padding

    # --- Truncation ---
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
        wanted_truncation = {
            "max_length": max_length,
            "stride": stride,
            "strategy": truncation_strategy.value,
            "direction": self.truncation_side,
        }
        # The backend may report more keys than the ones set here; compare only on the keys we control so
        # that unrelated keys never trigger a spurious `enable_truncation`.
        if active_truncation is None:
            comparable = None
        else:
            comparable = {name: active_truncation.get(name, None) for name in wanted_truncation}
        if comparable != wanted_truncation:
            backend.enable_truncation(**wanted_truncation)
    elif active_truncation is not None:
        backend.no_truncation()

    # --- Padding ---
    if padding_strategy != PaddingStrategy.DO_NOT_PAD:
        wanted_padding = {
            "length": max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None,
            "direction": padding_side if padding_side is not None else self.padding_side,
            "pad_id": self.pad_token_id,
            "pad_token": self.pad_token,
            "pad_type_id": self.pad_token_type_id,
            "pad_to_multiple_of": pad_to_multiple_of,
        }
        if active_padding != wanted_padding:
            backend.enable_padding(**wanted_padding)
    elif active_padding is not None:
        backend.no_padding()
def _encode_plus(
    self,
    text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
    text_pair: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: int | None = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: int | None = None,
    padding_side: str | None = None,
    return_tensors: bool | None = None,
    return_token_type_ids: bool | None = None,
    return_attention_mask: bool | None = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    split_special_tokens: bool | None = None,
    **kwargs,
) -> BatchEncoding:
    """
    Validate the inputs, encode them with the backend tokenizer and package the result as a `BatchEncoding`.

    Steps:
        1. Check that `text` / `text_pair` have a supported shape.
        2. Decide whether the input is a batch and normalise it to a list of texts or (text, pair) tuples.
        3. Apply truncation/padding settings and `split_special_tokens` to the backend tokenizer.
        4. Call the backend `encode_batch` and convert every encoding with `_convert_encoding`.
        5. Merge the per-sample dicts into one dict of lists; for a single, non-tensor, non-overflowing input the
           batch dimension is removed again.

    Raises:
        ValueError: If `text` or `text_pair` has an unsupported shape, or if the batch lengths of `text` and
            `text_pair` differ.
        TypeError: If `text` is a batch and `text_pair` is a `str`.
    """

    def _is_valid_text_input(t):
        # Accepts str, (possibly empty) sequences of str, and up to two further levels of nested sequences.
        if isinstance(t, str):
            return True
        if not isinstance(t, (list, tuple)):
            return False
        if len(t) == 0 or isinstance(t[0], str):
            return True
        first = t[0]
        if not isinstance(first, (list, tuple)):
            return False
        if len(first) == 0 or isinstance(first[0], str):
            return True
        if not isinstance(first[0], (list, tuple)):
            return False
        return len(first[0]) == 0 or isinstance(first[0][0], str)

    # --- Input validation ---
    if not _is_valid_text_input(text):
        raise ValueError(
            "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
            "or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
        )
    if text_pair is not None and not _is_valid_text_input(text_pair):
        raise ValueError(
            "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
            "or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
        )

    # --- Batch detection ---
    is_batched = isinstance(text, (list, tuple))
    if is_split_into_words:
        # A pretokenized single example is itself a list, so look one level deeper.
        is_batched = is_batched and text and isinstance(text[0], (list, tuple))

    if is_batched:
        if isinstance(text_pair, str):
            raise TypeError(
                "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
                " `text`."
            )
        if text_pair is not None and len(text) != len(text_pair):
            raise ValueError(
                f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
                f" {len(text_pair)}."
            )
        if text_pair is not None:
            batch_text_or_text_pairs = list(zip(text, text_pair))
        else:
            batch_text_or_text_pairs = text
    elif text_pair:
        # Single input with a (truthy) pair: wrap as a one-element batch.
        batch_text_or_text_pairs = [(text, text_pair)]
    else:
        batch_text_or_text_pairs = [text]

    if not isinstance(batch_text_or_text_pairs, (tuple, list)):
        raise TypeError(
            f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
        )

    # --- Backend configuration ---
    self.set_truncation_and_padding(
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_side=padding_side,
    )

    # Fall back to the instance-level default when the caller did not choose.
    if split_special_tokens is None:
        split_special_tokens = self.split_special_tokens
    if self._tokenizer.encode_special_tokens != split_special_tokens:
        self._tokenizer.encode_special_tokens = split_special_tokens

    # --- Encoding ---
    encodings = self._tokenizer.encode_batch(
        batch_text_or_text_pairs,
        add_special_tokens=add_special_tokens,
        is_pretokenized=is_split_into_words,
    )

    conversion_options = {
        "return_token_type_ids": return_token_type_ids,
        "return_attention_mask": return_attention_mask,
        "return_overflowing_tokens": return_overflowing_tokens,
        "return_special_tokens_mask": return_special_tokens_mask,
        "return_offsets_mapping": return_offsets_mapping,
        "return_length": return_length,
        "verbose": verbose,
    }
    tokens_and_encodings = [
        self._convert_encoding(encoding=encoding, **conversion_options) for encoding in encodings
    ]

    # --- list[dict] -> dict[list] ---
    sanitized_tokens = {
        key: [row for sample_dict, _ in tokens_and_encodings for row in sample_dict[key]]
        for key in tokens_and_encodings[0][0]
    }
    sanitized_encodings = [enc for _, sample_encodings in tokens_and_encodings for enc in sample_encodings]

    # With overflowing tokens a sample can span several rows; record which sample each row came from.
    if return_overflowing_tokens:
        overflow_to_sample_mapping = []
        for sample_index, (sample_dict, _) in enumerate(tokens_and_encodings):
            overflow_to_sample_mapping.extend([sample_index] * len(sample_dict["input_ids"]))
        sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

    for input_ids in sanitized_tokens["input_ids"]:
        self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)

    batched_output = BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    # Single input: remove the batch dimension (unless tensors or overflowing tokens were requested).
    if not is_batched and return_tensors is None and not return_overflowing_tokens:
        squeezed = {
            key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
            for key, value in batched_output.items()
        }
        batched_output = BatchEncoding(squeezed, batched_output.encodings)

    return batched_output
def convert_tokens_to_string(self, tokens: list[str]) -> str:
    """
    Join `tokens` into a single string.

    Uses the backend tokenizer's decoder when one is set; otherwise the tokens are joined with single spaces.
    """
    backend_decoder = self.backend_tokenizer.decoder
    if backend_decoder is None:
        return " ".join(tokens)
    return backend_decoder.decode(tokens)
def _decode(
    self,
    token_ids: int | list[int],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool | None = None,
    **kwargs,
) -> str:
    """
    Decode token ids into text with the backend tokenizer.

    Args:
        token_ids: A single id, a list of ids, or a dict holding the ids under `"input_ids"`.
        skip_special_tokens: Forwarded to the backend `decode` call.
        clean_up_tokenization_spaces: Whether to post-process the text with `self.clean_up_tokenization`.
            When `None`, `self.clean_up_tokenization_spaces` decides.
        **kwargs: `use_source_tokenizer` is accepted and discarded; anything else is ignored here.

    Returns:
        `str`: The decoded text.
    """
    # `use_source_tokenizer` is no longer used; drop it if a caller still passes it.
    kwargs.pop("use_source_tokenizer", None)

    if isinstance(token_ids, int):
        token_ids = [token_ids]
    elif isinstance(token_ids, dict):
        token_ids = token_ids["input_ids"]

    decoded = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    if clean_up_tokenization_spaces is None:
        clean_up_tokenization_spaces = self.clean_up_tokenization_spaces
    if not clean_up_tokenization_spaces:
        return decoded
    return self.clean_up_tokenization(decoded)
def _save_pretrained(
    self,
    save_directory: str | os.PathLike,
    file_names: tuple[str, ...],
    legacy_format: bool | None = None,
    filename_prefix: str | None = None,
) -> tuple[str, ...]:
    """
    Save the backend tokenizer into `save_directory` and report the written file.

    Args:
        save_directory: Directory the tokenizer file is written to.
        file_names: Files already saved by the caller; returned with the new file path appended.
        legacy_format: Accepted for signature compatibility; not used here.
        filename_prefix: Optional prefix, joined to the file name with a `-`.

    Returns:
        `tuple[str, ...]`: `file_names` followed by the path of the saved tokenizer file.
    """
    target_name = TOKENIZER_FILE
    if filename_prefix:
        target_name = filename_prefix + "-" + target_name
    tokenizer_file = os.path.join(str(save_directory), target_name)

    self.backend_tokenizer.save(tokenizer_file)

    return file_names + (tokenizer_file,)
def train_new_from_iterator(
    self,
    text_iterator,
    vocab_size,
    length=None,
    new_special_tokens=None,
    special_tokens_map=None,
    **kwargs,
):
    """
    Train a new tokenizer on a corpus, reusing the tokenization pipeline and special tokens of this one.

    Args:
        text_iterator (generator of `list[str]`):
            The training corpus, as batches of texts (for instance a list of lists of texts if everything is
            in memory).
        vocab_size (`int`):
            The size of the vocabulary of the new tokenizer.
        length (`int`, *optional*):
            The total number of sequences in the iterator; forwarded to the backend `train_from_iterator`.
        new_special_tokens (list of `str` or `AddedToken`, *optional*):
            Additional special tokens to add to the tokenizer being trained.
        special_tokens_map (`dict[str, str]`, *optional*):
            Mapping from old special token content to new special token content, used to rename special tokens.
        kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

    Returns:
        A new tokenizer of the same class as `self`, trained on `text_iterator`.

    Raises:
        ValueError: If the model type is not one of BPE, Unigram, WordLevel or WordPiece, or if a token
            referenced by the post processor is missing from the trained vocabulary.
    """
    tokenizer_json = json.loads(self._tokenizer.to_str())
    # Added tokens and the post processor both refer to token ids, which change with training: set them aside.
    added_tokens = tokenizer_json.pop("added_tokens")
    post_processor = tokenizer_json.pop("post_processor")

    # Alias into `tokenizer_json`: writes through `model_json` are visible in `tokenizer_json`.
    model_json = tokenizer_json["model"]
    model_type = model_json["type"]

    # --- Reset the vocabulary ---
    unk_token = None
    if model_type == "BPE":
        model_json["vocab"] = {}
        model_json["merges"] = []
    elif model_type == "Unigram":
        if model_json["unk_id"] is not None:
            unk_id = model_json["unk_id"]
            unk_token = model_json["vocab"][unk_id][0]
            if special_tokens_map is not None and unk_token in special_tokens_map:
                unk_token = special_tokens_map[unk_token]
            # Keep only the (possibly renamed) unknown token, at index 0.
            model_json["unk_id"] = 0
            model_json["vocab"] = [[unk_token, 0.0]]
    elif model_type in ["WordLevel", "WordPiece"]:
        model_json["vocab"] = {}
    else:
        raise ValueError(
            f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
            "only BPE, Unigram, WordLevel and WordPiece."
        )

    if (
        special_tokens_map is not None
        and "unk_token" in model_json
        and model_json["unk_token"] in special_tokens_map
    ):
        model_json["unk_token"] = special_tokens_map[model_json["unk_token"]]

    tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

    # --- Special tokens handed to the trainer ---
    special_tokens = []
    for added_token in added_tokens:
        is_special = added_token.pop("special", None)
        added_token.pop("id", None)
        # Non-special added tokens are only kept for Unigram models.
        if model_type != "Unigram" and not is_special:
            continue
        if special_tokens_map is not None and added_token["content"] in special_tokens_map:
            added_token["content"] = special_tokens_map[added_token["content"]]
        special_tokens.append(AddedToken(**added_token))

    if new_special_tokens is not None:
        special_tokens.extend(new_special_tokens)

    # --- Trainer options inherited from the current model ---
    if model_type == "BPE":
        # The trainer needs to know the continuing-subword prefix / end-of-word suffix used by the model.
        for option in ("continuing_subword_prefix", "end_of_word_suffix"):
            if option not in kwargs and model_json[option] is not None:
                kwargs[option] = model_json[option]
    if model_type == "Unigram" and unk_token is not None:
        kwargs["unk_token"] = unk_token

    pre_tokenizer_json = tokenizer_json["pre_tokenizer"]
    if pre_tokenizer_json is not None:
        uses_byte_level = pre_tokenizer_json["type"] == "ByteLevel" or (
            pre_tokenizer_json["type"] == "Sequence"
            and "pretokenizers" in pre_tokenizer_json
            and any(step["type"] == "ByteLevel" for step in pre_tokenizer_json["pretokenizers"])
        )
        if uses_byte_level:
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

    # --- Training ---
    trainer_class = MODEL_TO_TRAINER_MAPPING[model_type]
    trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
    tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

    # --- Re-attach the post processor with the token ids of the trained vocabulary ---
    if post_processor is not None:
        trained_tokenizer_json = json.loads(tokenizer.to_str())

        if "special_tokens" in post_processor:
            for entry in post_processor["special_tokens"].values():
                tokens = entry["tokens"]
                if special_tokens_map is not None:
                    tokens = [special_tokens_map.get(tok, tok) for tok in tokens]
                entry["tokens"] = tokens
                if any(tokenizer.token_to_id(tok) is None for tok in tokens):
                    raise ValueError(
                        "Attempted to set a token in the post processor that does not exist in the mapping"
                    )
                entry["ids"] = [tokenizer.token_to_id(tok) for tok in tokens]

        for slot in ["cls", "sep"]:
            if slot not in post_processor:
                continue
            slot_token, _ = post_processor[slot]
            if special_tokens_map is not None and slot_token in special_tokens_map:
                slot_token = special_tokens_map[slot_token]
            slot_id = tokenizer.token_to_id(slot_token)
            if slot_id is None:
                raise ValueError(
                    "Attempted to set a token in the post processor that does not exist in the mapping"
                )
            post_processor[slot] = [slot_token, slot_id]

        trained_tokenizer_json["post_processor"] = post_processor
        tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

    # --- Constructor arguments for the new tokenizer ---
    init_kwargs = self.init_kwargs.copy()

    # V5: map the named special tokens (pad/cls/mask/...) at the Transformers level.
    for attribute in PreTrainedTokenizerBase.SPECIAL_TOKENS_ATTRIBUTES:
        special_token = getattr(self, attribute)
        if special_token is None:
            continue
        if special_tokens_map is not None and special_token in special_tokens_map:
            special_token = special_tokens_map[special_token]

        template = self._special_tokens_map.get(attribute, None)
        if isinstance(template, AddedToken):
            # Same parameters as the existing AddedToken, only the content may differ.
            init_kwargs[attribute] = AddedToken(
                special_token,
                single_word=template.single_word,
                lstrip=template.lstrip,
                rstrip=template.rstrip,
                normalized=template.normalized,
                special=True,
            )
        else:
            init_kwargs[attribute] = special_token

    # V5: carry over the extra special tokens, plus the newly requested ones.
    extra_special_tokens = self.extra_special_tokens.copy() if self.extra_special_tokens else []
    if new_special_tokens is not None:
        extra_special_tokens.extend(new_special_tokens)
    if len(extra_special_tokens) > 0:
        init_kwargs["extra_special_tokens"] = extra_special_tokens

    # First try the standard path: hand the trained backend over as `tokenizer_object`. A class that builds its
    # own backend and passes it explicitly to `super().__init__()` raises a TypeError about duplicate values; in
    # that case instantiate without it and set `_tokenizer` afterwards.
    init_kwargs["tokenizer_object"] = tokenizer
    try:
        return self.__class__(**init_kwargs)
    except TypeError as e:
        if "multiple values for keyword argument 'tokenizer_object'" not in str(e):
            # Unrelated TypeError: let it propagate.
            raise
        init_kwargs.pop("tokenizer_object", None)
        new_tokenizer = self.__class__(**init_kwargs)
        new_tokenizer._tokenizer = tokenizer
        return new_tokenizer
@classmethod
def _patch_mistral_regex(
    cls,
    tokenizer,
    pretrained_model_name_or_path,
    token=None,
    cache_dir=None,
    local_files_only=False,
    _commit_hash=None,
    is_local=False,
    init_kwargs=None,
    fix_mistral_regex=None,
    **kwargs,
):
    """
    Patch mistral-related tokenizers that carry an incorrect pre-tokenization regex, when such a tokenizer is
    detected.

    Detection covers:
        1) Local files with a `config.json` saved next to them, whose model type is one of the mistral models
           (checked for older `transformers_version` values).
        2) Remote models on the Hub whose tags match `base_model:.*mistralai`.

    When a candidate is detected and the fix is not enabled, a warning is logged and `fix_mistral_regex` is set
    to `False` on `tokenizer`. When the fix is enabled (argument `fix_mistral_regex=True`, or a truthy
    `fix_mistral_regex` attribute on `tokenizer`), the corrected `Split` pre-tokenizer is installed.

    Args:
        tokenizer: The object to patch; its `pre_tokenizer` and `fix_mistral_regex` attributes are updated.
        pretrained_model_name_or_path: Repo id or local path the tokenizer was loaded from. Nothing is done
            when `None`.
        token, cache_dir, local_files_only, _commit_hash: Forwarded to `cached_file` to locate `config.json`.
        is_local: Whether the tokenizer was loaded from local files. Forced to `True` in offline mode.
        init_kwargs: Optional init kwargs; a `fix_mistral_regex` entry is copied onto `tokenizer`.
        fix_mistral_regex: Explicit user choice; `None` means it was not passed.

    Returns:
        The same `tokenizer` object, patched or not.
    """
    import re

    from huggingface_hub import model_info
    from packaging import version

    from transformers.utils.hub import cached_file

    def is_base_mistral(model_id: str) -> bool:
        tags = model_info(model_id).tags
        return tags is not None and re.search("base_model:.*mistralai", "".join(tags)) is not None

    if is_offline_mode():
        is_local = True

    if pretrained_model_name_or_path is None:
        return tokenizer

    # `is_base_mistral` performs a Hub request: evaluate it at most once (and never for local loads) and reuse
    # the answer below instead of querying a second time.
    remote_base_mistral = (not is_local) and is_base_mistral(pretrained_model_name_or_path)
    if not (is_local or remote_base_mistral):
        return tokenizer

    _config_file = cached_file(
        pretrained_model_name_or_path,
        "config.json",
        cache_dir=cache_dir,
        token=token,
        local_files_only=local_files_only,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=_commit_hash,
    )

    # Detected using a (local) mistral tokenizer
    mistral_config_detected = False
    if _config_file is not None:
        with open(_config_file, encoding="utf-8") as f:
            _config = json.load(f)
        transformers_version = _config.get("transformers_version")
        transformers_model_type = _config.get("model_type")

        # Detect if we can skip the mistral fix by
        #   a) having a non-mistral tokenizer
        #   b) fixed version of transformers
        if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
            if (
                is_local
                and transformers_model_type is not None
                and transformers_model_type
                not in [
                    "mistral",
                    "mistral3",
                    "voxtral",
                    "ministral",
                    "pixtral",
                ]
            ):
                return tokenizer
        elif transformers_version and version.parse(transformers_version) > version.parse("4.57.3"):
            return tokenizer

        mistral_config_detected = True

    if not (mistral_config_detected or remote_base_mistral):
        return tokenizer

    # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
    if init_kwargs and "fix_mistral_regex" in init_kwargs:
        tokenizer.fix_mistral_regex = init_kwargs["fix_mistral_regex"]

    # Only warn if the flag was not explicitly passed.
    if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
        tokenizer.fix_mistral_regex = False
        logger.warning(
            f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
            f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e."
            " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
        )
    elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
        tokenizer.fix_mistral_regex = True
        import tokenizers

        split_pretokenizer = tokenizers.pre_tokenizers.Split(
            pattern=tokenizers.Regex(
                r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
            ),
            behavior="isolated",
        )
        current_pretokenizer = tokenizer.pre_tokenizer
        if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Sequence):
            # Already a Sequence: replace its first element (the Split pattern).
            tokenizer.pre_tokenizer[0] = split_pretokenizer
        else:
            # Replace Metaspace with ByteLevel when adding Split, as Metaspace(split=False) doesn't
            # work correctly with the Split pre-tokenizer and causes spaces to be lost during encoding
            if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Metaspace):
                current_pretokenizer = tokenizers.pre_tokenizers.ByteLevel(
                    add_prefix_space=False, use_regex=False
                )
            # Not a Sequence, so create one with Split + current pretokenizer
            tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
                [
                    split_pretokenizer,
                    current_pretokenizer,
                ]
            )

    return tokenizer
- # Backward-compatible alias: allow referring to TokenizersBackend as PreTrainedTokenizerFast.
- # Both names are bound to the same class object.
- PreTrainedTokenizerFast = TokenizersBackend
|