tokenization_mistral_common.py 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685
  1. # Copyright 2025 Mistral AI and The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import re
  16. import shutil
  17. from collections.abc import Callable, Sequence
  18. from enum import Enum
  19. from pathlib import Path
  20. from typing import Any, Literal, Union, overload
  21. import numpy as np
  22. from huggingface_hub import create_repo
  23. from transformers.audio_utils import load_audio_as
  24. from transformers.image_utils import get_image_size
  25. from transformers.tokenization_utils_base import (
  26. VERY_LARGE_INTEGER,
  27. AddedToken,
  28. BatchEncoding,
  29. EncodedInput,
  30. PreTokenizedInput,
  31. PreTrainedTokenizerBase,
  32. TextInput,
  33. TruncationStrategy,
  34. )
  35. from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
  36. from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires
  37. if is_mistral_common_available():
  38. from mistral_common.protocol.instruct.request import ChatCompletionRequest, ReasoningEffort
  39. from mistral_common.protocol.instruct.validator import ValidationMode
  40. from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
  41. from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
  42. from mistral_common.tokens.tokenizers.tekken import Tekkenizer
  43. from mistral_common.tokens.tokenizers.utils import (
  44. download_tokenizer_from_hf_hub,
  45. get_one_valid_tokenizer_file,
  46. )
  47. if is_torch_available():
  48. import torch
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
  50. ENCODE_KWARGS_DOCSTRING = r"""
  51. add_special_tokens (`bool`, *optional*, defaults to `True`):
  52. Whether or not to add special tokens when encoding the sequences. This will use the underlying
  53. `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
  54. automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
  55. automatically. When Tokenizer is loading with `finetuning` mode it adds both `bos` and `eos`. Else, for "test" mode it only adds `bos`.
  56. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
  57. Activates and controls padding. Accepts the following values:
  58. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  59. sequence is provided).
  60. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  61. acceptable input length for the model if that argument is not provided.
  62. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  63. lengths).
  64. truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
  65. Activates and controls truncation. Accepts the following values:
  66. - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
  67. to the maximum acceptable input length for the model if that argument is not provided.
  68. - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
  69. greater than the model maximum admissible input size).
  70. max_length (`int`, *optional*):
  71. Controls the maximum length to use by one of the truncation/padding parameters.
  72. If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
  73. is required by one of the truncation/padding parameters. If the model has no specific maximum input
  74. length (like XLNet) truncation/padding to a maximum length will be deactivated.
  75. stride (`int`, *optional*, defaults to 0):
  76. If set to a number along with `max_length`, the overflowing tokens returned when
  77. `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
  78. returned to provide some overlap between truncated and overflowing sequences. The value of this
  79. argument defines the number of overlapping tokens.
  80. pad_to_multiple_of (`int`, *optional*):
  81. If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
  82. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
  83. `>= 7.5` (Volta).
  84. padding_side (`str`, *optional*):
  85. The side on which the model should have padding applied. Should be selected between ['right', 'left'].
  86. Default value is picked from the class attribute of the same name.
  87. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  88. If set, will return tensors instead of list of python integers. Acceptable values are:
  89. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  90. """
# Documentation fragment covering the extra `return_*` / `verbose` keyword arguments and the fields of the
# returned `BatchEncoding`.
# NOTE(review): not referenced in the part of the file visible here; presumably appended to method
# docstrings with `add_end_docstrings`, the way `ENCODE_KWARGS_DOCSTRING` is for `encode` — confirm.
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (`bool`, *optional*):
                Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
                [What are token type IDs?](../glossary#token-type-ids)
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.
                [What are attention masks?](../glossary#attention-mask)
            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
                of returning overflowing tokens.
            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                Whether or not to return special tokens mask information.
            return_length  (`bool`, *optional*, defaults to `False`):
                Whether or not to return the lengths of the encoded inputs.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
            return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
            split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
            **kwargs: passed to the `self.tokenize()` method
        Return:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model.
              [What are input IDs?](../glossary#input-ids)
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
              [What are attention masks?](../glossary#attention-mask)
            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
              `return_overflowing_tokens=True`).
            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
            - **length** -- The length of the inputs (when `return_length=True`)
"""
  127. class MistralTokenizerType(str, Enum):
  128. """Enum for the different type of tokenizer."""
  129. spm = "spm"
  130. tekken = "tekken"
  131. @overload
  132. def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
  133. @overload
  134. def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
  135. def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
  136. # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
  137. # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
  138. # Nevertheless we should remove it to ease users life.
  139. if not skip_special_tokens:
  140. return text
  141. if isinstance(text, str):
  142. return re.sub(r"^lang:[a-z]{2}", "", text)
  143. return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
  144. _MAP_SPECIAL_TOKENS = {
  145. "bos_token": SpecialTokens.bos.value,
  146. "eos_token": SpecialTokens.eos.value,
  147. "pad_token": SpecialTokens.pad.value,
  148. "unk_token": SpecialTokens.unk.value,
  149. }
  150. _VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
  151. @requires(backends=("mistral-common",))
  152. class MistralCommonBackend(PreTrainedTokenizerBase):
  153. """
  154. Class to wrap `mistral-common` tokenizers.
  155. `mistral-common` is the official tokenizer library for Mistral AI models. To use it, you need to install it with:
  156. ```bash
  157. pip install transformers[mistral-common]
  158. ```
  159. Otherwise the tokenizer falls back to the Transformers implementation of the tokenizer.
  160. For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).
  161. This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
  162. It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer and inherits from the `PreTrainedTokenizerBase` class.
  163. Here are the key behavior differences with the `PythonBackend` class:
  164. - Pair of sequences are not supported. The signature has been kept for compatibility but all arguments related to pair of sequences are ignored. The return values for pairs are returned as `None`.
  165. - The `is_split_into_words` argument is not supported.
  166. - It is not possible to add new tokens to the tokenizer. Special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
  167. If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
  168. """
  169. model_input_names: list[str] = ["input_ids", "attention_mask"]
  170. padding_side: str = "left"
  171. truncation_side: str = "right"
  172. SPECIAL_TOKENS_ATTRIBUTES = [
  173. "bos_token",
  174. "eos_token",
  175. "unk_token",
  176. "pad_token",
  177. ]
    def __init__(
        self,
        tokenizer_path: str | os.PathLike | Path,
        mode: ValidationMode = ValidationMode.test,
        model_max_length: int = VERY_LARGE_INTEGER,
        padding_side: str = "left",
        truncation_side: str = "right",
        model_input_names: list[str] | None = None,
        clean_up_tokenization_spaces: bool = False,
        **kwargs,
    ) -> None:
        """
        Constructs a `MistralCommonBackend` by loading a `MistralTokenizer` from a tokenizer file.

        Args:
            tokenizer_path (`str` or `os.PathLike` or `Path`):
                Path to the tokenizer file to load the `MistralTokenizer`.
            mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
                - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
                - `"test"` or `ValidationMode.test`: The test mode.
                It changes how the tokenizer validates the input and prepares the request to the model.
            model_max_length (`int`, *optional*, defaults to `VERY_LARGE_INTEGER`):
                The maximum length (in number of tokens) for the inputs to the transformer model.
            padding_side (`str`, *optional*, defaults to `"left"`):
                The side on which the model should have padding applied. Should be `'right'` or `'left'`.
            truncation_side (`str`, *optional*, defaults to `"right"`):
                The side on which the model should have truncation applied. Should be `'right'` or `'left'`.
            model_input_names (`list[str]`, *optional*):
                The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                `"attention_mask"`). When `None` (or empty), the class attribute of the same name is used.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not the model should clean up the spaces that were added when splitting the input text during the
                tokenization process.
            **kwargs:
                Only the keys listed in `_VALID_INIT_KWARGS` are accepted; they are forwarded to the parent
                initializer.

        Raises:
            `ValueError`: If `kwargs` contains a key that is not in `_VALID_INIT_KWARGS`.
        """
        # Fail early on unknown keyword arguments. Note that the message lists every received key,
        # not only the offending ones.
        if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")

        # Record the named constructor arguments (the extra `kwargs` are not included).
        self.init_kwargs = {
            "tokenizer_path": tokenizer_path,
            "mode": mode,
            "model_max_length": model_max_length,
            "padding_side": padding_side,
            "truncation_side": truncation_side,
            "model_input_names": model_input_names,
            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
        }
        self._tokenizer_path = Path(tokenizer_path)
        # `_get_validation_mode` is defined outside the part of the file visible here; presumably it
        # normalizes a `str` into a `ValidationMode` — confirm.
        self._mode = self._get_validation_mode(mode)

        self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
        # Tekken if the wrapped tokenizer is a `Tekkenizer`; anything else is treated as "spm".
        self._tokenizer_type = (
            MistralTokenizerType.tekken
            if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
            else MistralTokenizerType.spm
        )
        # Lazily filled by `get_vocab`.
        self._cache_get_vocab: dict[str, int] | None = None

        # Special ids/tokens are computed before calling the parent initializer.
        # NOTE(review): presumably because the parent initializer may read them — confirm before reordering.
        self._all_special_ids = self._get_all_special_ids()
        self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)

        super().__init__(
            truncation_side=truncation_side,
            padding_side=padding_side,
            model_max_length=model_max_length,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            extra_special_tokens=None,  # Not used by this backend.
            model_specific_special_tokens=None,  # Not used by this backend.
            model_input_names=model_input_names or self.model_input_names,
            **_MAP_SPECIAL_TOKENS,
            **kwargs,
        )
  255. @property
  256. def mode(self) -> ValidationMode:
  257. """
  258. `ValidationMode`: The mode used by the tokenizer. Possible values are:
  259. - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
  260. - `"test"` or `ValidationMode.test`: The test mode.
  261. It changes how the tokenizer validates the input and prepares the request to the model.
  262. """
  263. return self._mode
  264. @property
  265. def all_special_ids(self) -> list[int]:
  266. """
  267. `list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
  268. """
  269. return sorted(self._all_special_ids)
  270. @property
  271. def all_special_tokens(self) -> list[str]:
  272. """
  273. `list[str]`: A list of all unique special tokens.
  274. """
  275. return self._all_special_tokens
  276. @property
  277. def vocab_size(self) -> int:
  278. """
  279. Returns the size of the vocabulary.
  280. `int`: Size of the vocabulary.
  281. """
  282. return self.tokenizer.instruct_tokenizer.tokenizer.n_words
  283. def get_vocab(self) -> dict[str, int]:
  284. """
  285. Returns the vocabulary as a dictionary of token to index.
  286. This is a lossy conversion. There may be multiple token ids that decode to the same
  287. string due to partial UTF-8 byte sequences being converted to �.
  288. Returns:
  289. `Dict[str, int]`: The vocabulary.
  290. """
  291. if self._cache_get_vocab is None:
  292. # We reverse the order to make sure that the first token is the one to be returned when there are multiple tokens with the same string representation.
  293. vocab = self.tokenizer.instruct_tokenizer.tokenizer.vocab()
  294. self._cache_get_vocab = {token: self._piece_to_id(token, False) for token in vocab}
  295. # Order the dict.
  296. self._cache_get_vocab = dict(
  297. sorted(((k, v) for k, v in self._cache_get_vocab.items()), key=lambda x: x[1])
  298. )
  299. return self._cache_get_vocab
  300. def __len__(self):
  301. """
  302. Size of the full vocabulary with the added tokens.
  303. """
  304. return self.vocab_size
  305. @add_end_docstrings(
  306. ENCODE_KWARGS_DOCSTRING,
  307. """
  308. **kwargs: Not supported by `MistralCommonBackend.encode`.
  309. Will raise an error if used.
  310. """,
  311. """
  312. Returns:
  313. `list[int]`, `torch.Tensor`: The tokenized ids of the text.
  314. """,
  315. )
  316. def encode(
  317. self,
  318. text: TextInput | EncodedInput,
  319. text_pair: None = None,
  320. add_special_tokens: bool = True,
  321. padding: bool | str | PaddingStrategy = False,
  322. truncation: bool | str | TruncationStrategy | None = None,
  323. max_length: int | None = None,
  324. stride: int = 0,
  325. pad_to_multiple_of: int | None = None,
  326. padding_side: str | None = None,
  327. return_tensors: str | TensorType | None = None,
  328. verbose: bool = True,
  329. return_offsets_mapping: Literal[False] = False,
  330. split_special_tokens: Literal[False] = False,
  331. **kwargs,
  332. ) -> list[int]:
  333. """
  334. Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
  335. Args:
  336. text (`str` or `list[int]`):
  337. The first sequence to be encoded. This can be a string or a list of integers (tokenized string ids).
  338. text_pair (`None`, *optional*):
  339. Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
  340. """
  341. if return_offsets_mapping or split_special_tokens:
  342. raise ValueError(
  343. "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
  344. )
  345. if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
  346. raise ValueError(
  347. "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
  348. )
  349. if kwargs:
  350. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
  351. if text_pair:
  352. raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
  353. return super().encode(
  354. text=text,
  355. text_pair=text_pair,
  356. add_special_tokens=add_special_tokens,
  357. padding=padding,
  358. truncation=truncation,
  359. max_length=max_length,
  360. stride=stride,
  361. return_tensors=return_tensors,
  362. pad_to_multiple_of=pad_to_multiple_of,
  363. padding_side=padding_side,
  364. verbose=verbose,
  365. )
  366. def _decode(
  367. self,
  368. token_ids: int | list[int],
  369. skip_special_tokens: bool = False,
  370. clean_up_tokenization_spaces: bool | None = None,
  371. **kwargs,
  372. ) -> str:
  373. if kwargs:
  374. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
  375. token_ids = to_py_obj(token_ids)
  376. if isinstance(token_ids, int):
  377. token_ids = [token_ids]
  378. special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
  379. text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
  380. # Apply tokenizer-specific cleanup if available and requested
  381. clean_up_tokenization_spaces = (
  382. clean_up_tokenization_spaces
  383. if clean_up_tokenization_spaces is not None
  384. else self.clean_up_tokenization_spaces
  385. )
  386. if clean_up_tokenization_spaces:
  387. text = self.clean_up_tokenization(text)
  388. return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)
  389. def decode(
  390. self,
  391. token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
  392. skip_special_tokens: bool = False,
  393. clean_up_tokenization_spaces: bool | None = None,
  394. **kwargs,
  395. ) -> str | list[str]:
  396. """
  397. Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
  398. tokens and clean up tokenization spaces.
  399. Args:
  400. token_ids (`Union[int, list[int], list[list[int]], np.ndarray, torch.Tensor]`):
  401. A single sequence or a batch (list of sequences) of tokenized input ids. Can be obtained using the
  402. `__call__` method.
  403. skip_special_tokens (`bool`, *optional*, defaults to `False`):
  404. Whether or not to remove special tokens in the decoding.
  405. clean_up_tokenization_spaces (`bool`, *optional*):
  406. Whether or not to clean up the tokenization spaces. If `None`, will default to
  407. `self.clean_up_tokenization_spaces`.
  408. kwargs (additional keyword arguments, *optional*):
  409. Not supported by `MistralCommonBackend.decode`.
  410. Will raise an error if used.
  411. Returns:
  412. `Union[str, list[str]]`: The decoded string for a single sequence, or a list of decoded strings for a
  413. batch of sequences.
  414. """
  415. if kwargs:
  416. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
  417. return super().decode(
  418. token_ids=token_ids,
  419. skip_special_tokens=skip_special_tokens,
  420. clean_up_tokenization_spaces=clean_up_tokenization_spaces,
  421. )
  422. def batch_decode(
  423. self,
  424. sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
  425. skip_special_tokens: bool = False,
  426. clean_up_tokenization_spaces: bool | None = None,
  427. **kwargs,
  428. ) -> list[str]:
  429. """
  430. Convert a list of lists of token ids into a list of strings by calling decode.
  431. This method is provided for backwards compatibility. The `decode` method now handles batched input natively,
  432. so you can use `decode` directly instead of `batch_decode`.
  433. Args:
  434. sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
  435. List of tokenized input ids. Can be obtained using the `__call__` method.
  436. skip_special_tokens (`bool`, *optional*, defaults to `False`):
  437. Whether or not to remove special tokens in the decoding.
  438. clean_up_tokenization_spaces (`bool`, *optional*):
  439. Whether or not to clean up the tokenization spaces. If `None`, will default to
  440. `self.clean_up_tokenization_spaces`.
  441. kwargs (additional keyword arguments, *optional*):
  442. Not supported by `MistralCommonBackend.batch_decode`.
  443. Will raise an error if used.
  444. Returns:
  445. `list[str]`: The list of decoded sentences.
  446. """
  447. if kwargs:
  448. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")
  449. return super().batch_decode(
  450. sequences=sequences,
  451. skip_special_tokens=skip_special_tokens,
  452. clean_up_tokenization_spaces=clean_up_tokenization_spaces,
  453. )
  454. @overload
  455. def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
  456. @overload
  457. def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ...
  458. def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
  459. """
  460. Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
  461. added tokens.
  462. Args:
  463. ids (`int` or `list[int]`):
  464. The token id (or token ids) to convert to tokens.
  465. skip_special_tokens (`bool`, *optional*, defaults to `False`):
  466. Whether or not to remove special tokens in the decoding.
  467. Returns:
  468. `str` or `list[str]`: The decoded token(s).
  469. """
  470. if isinstance(ids, int):
  471. return_int = True
  472. ids = [ids]
  473. else:
  474. return_int = False
  475. tokens: list[str] = []
  476. for token_id in ids:
  477. if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id) and skip_special_tokens:
  478. continue
  479. tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))
  480. if return_int and tokens == []:
  481. raise ValueError(f"Invalid token id {ids[0]}.")
  482. elif return_int:
  483. return tokens[0]
  484. return tokens
  485. def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
  486. tekken_tokenizer = self.tokenizer.instruct_tokenizer.tokenizer
  487. assert isinstance(tekken_tokenizer, Tekkenizer), type(tekken_tokenizer)
  488. piece_bytes = piece.encode("utf-8")
  489. shift = tekken_tokenizer.num_special_tokens
  490. try:
  491. return shift + tekken_tokenizer._tekken_token2id_nospecial[piece_bytes]
  492. except KeyError:
  493. piece_str = piece_bytes.decode("utf-8")
  494. if piece_str in tekken_tokenizer._special_tokens_reverse_vocab:
  495. return tekken_tokenizer._special_tokens_reverse_vocab[piece_str]
  496. if warn:
  497. logger.warning("Failed to convert token %s to id, replacing with <unk>", piece_bytes)
  498. return tekken_tokenizer.unk_id
  499. def _piece_to_id(self, piece: str, warn: bool) -> int:
  500. if self._tokenizer_type == MistralTokenizerType.spm:
  501. return self.tokenizer.instruct_tokenizer.tokenizer._model.piece_to_id(piece)
  502. elif self._tokenizer_type == MistralTokenizerType.tekken:
  503. return self._tekken_piece_to_id(piece, warn)
  504. else:
  505. raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
  506. def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
  507. """
  508. Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
  509. vocabulary.
  510. Args:
  511. tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).
  512. Returns:
  513. `int` or `list[int]`: The token id or list of token ids.
  514. """
  515. if isinstance(tokens, str):
  516. one_token = True
  517. tokens = [tokens]
  518. else:
  519. one_token = False
  520. ids: list[int] = []
  521. for token in tokens:
  522. ids.append(self._piece_to_id(token, True))
  523. if one_token:
  524. return ids[0]
  525. return ids
  526. def _text_to_ids(self, text: TextInput, add_special_tokens: bool) -> list[int]:
  527. """
  528. Converts a string into a sequence of tokens ids, using the tokenizer.
  529. """
  530. add_eos = add_special_tokens and self._mode == ValidationMode.finetuning
  531. tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
  532. return tokens_ids
  533. def tokenize(
  534. self,
  535. text: TextInput,
  536. return_offsets_mapping: Literal[False] = False,
  537. split_special_tokens: Literal[False] = False,
  538. **kwargs,
  539. ) -> list[str]:
  540. """
  541. Converts a string into a sequence of tokens, using the tokenizer.
  542. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies.
  543. Args:
  544. text (`str`):
  545. The sequence to be encoded.
  546. return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
  547. split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
  548. **kwargs (additional keyword arguments):
  549. Not supported by `MistralCommonBackend.tokenize`.
  550. Will raise an error if used.
  551. Returns:
  552. `list[str]`: The list of tokens.
  553. """
  554. if return_offsets_mapping or split_special_tokens:
  555. raise ValueError(
  556. "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
  557. )
  558. if kwargs:
  559. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
  560. return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
  561. def _get_all_special_ids(self) -> set[int]:
  562. if self._tokenizer_type == MistralTokenizerType.tekken:
  563. return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
  564. elif self._tokenizer_type == MistralTokenizerType.spm:
  565. return {
  566. token_id
  567. for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
  568. if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
  569. }
  570. else:
  571. raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
  572. def get_special_tokens_mask(
  573. self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
  574. ) -> list[int]:
  575. """
  576. Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
  577. special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
  578. Args:
  579. token_ids_0 (`list[int]`): List of ids of the sequence.
  580. token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
  581. already_has_special_tokens (`bool`, *optional*, defaults to `False`):
  582. Whether or not the token list is already formatted with special tokens for the model.
  583. Returns:
  584. A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
  585. """
  586. if token_ids_1 is not None:
  587. raise ValueError(
  588. "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
  589. )
  590. if already_has_special_tokens:
  591. return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
  592. if self.mode == ValidationMode.test:
  593. # [BOS] seq0
  594. return [1] + ([0] * len(token_ids_0))
  595. else:
  596. # [BOS] seq0 [EOS]
  597. return [1] + ([0] * len(token_ids_0)) + [1]
  598. def _encode_plus( # type: ignore[override]
  599. self,
  600. text: TextInput | PreTokenizedInput | EncodedInput,
  601. text_pair: None = None,
  602. add_special_tokens: bool = True,
  603. padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
  604. truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
  605. max_length: int | None = None,
  606. stride: int = 0,
  607. is_split_into_words: bool = False,
  608. pad_to_multiple_of: int | None = None,
  609. padding_side: str | None = None,
  610. return_tensors: str | TensorType | None = None,
  611. return_token_type_ids: bool | None = None,
  612. return_attention_mask: bool | None = None,
  613. return_overflowing_tokens: bool = False,
  614. return_special_tokens_mask: bool = False,
  615. return_length: bool = False,
  616. verbose: bool = True,
  617. return_offsets_mapping: Literal[False] = False,
  618. split_special_tokens: Literal[False] = False,
  619. **kwargs,
  620. ) -> BatchEncoding:
  621. # Detect batched inputs (list of sequences)
  622. if text_pair is not None:
  623. raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
  624. if return_offsets_mapping or split_special_tokens:
  625. raise ValueError(
  626. "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
  627. )
  628. if kwargs:
  629. raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
  630. is_batched = isinstance(text, (list, tuple)) and (
  631. (not text and not is_split_into_words)
  632. or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
  633. or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
  634. )
  635. if is_batched:
  636. batch_outputs = {}
  637. one_overflowed = False
  638. for current_text in text:
  639. current_output = self._encode_plus(
  640. text=current_text,
  641. text_pair=None,
  642. add_special_tokens=add_special_tokens,
  643. padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
  644. truncation_strategy=truncation_strategy,
  645. max_length=max_length,
  646. stride=stride,
  647. is_split_into_words=is_split_into_words,
  648. pad_to_multiple_of=None, # we pad in batch afterward
  649. padding_side=None, # we pad in batch afterward
  650. return_tensors=None, # We convert the whole batch to tensors at the end
  651. return_token_type_ids=return_token_type_ids,
  652. return_attention_mask=False, # we pad in batch afterward
  653. return_overflowing_tokens=return_overflowing_tokens,
  654. return_special_tokens_mask=return_special_tokens_mask,
  655. return_length=return_length,
  656. verbose=verbose,
  657. )
  658. for key, value in current_output.items():
  659. batch_outputs.setdefault(key, []).append(value)
  660. # To ensure the list is built for each sample, we need to add this.
  661. if return_overflowing_tokens and not return_tensors:
  662. if "overflowing_tokens" not in current_output:
  663. batch_outputs.setdefault("overflowing_tokens", []).append([0])
  664. batch_outputs.setdefault("num_truncated_tokens", []).append([0])
  665. else:
  666. one_overflowed = True
  667. # Remove overflow-related keys before tensor conversion if return_tensors is set
  668. # Slow tokenizers don't support returning these as tensors
  669. if return_overflowing_tokens and (return_tensors or not one_overflowed):
  670. batch_outputs.pop("overflowing_tokens", None)
  671. batch_outputs.pop("num_truncated_tokens", None)
  672. batch_outputs = self.pad(
  673. batch_outputs,
  674. padding=padding_strategy.value,
  675. max_length=max_length,
  676. pad_to_multiple_of=pad_to_multiple_of,
  677. padding_side=padding_side,
  678. return_attention_mask=return_attention_mask,
  679. )
  680. return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  681. def get_input_ids(text):
  682. if isinstance(text, str):
  683. return self._text_to_ids(text, False)
  684. elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
  685. return text
  686. else:
  687. raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")
  688. first_ids = get_input_ids(text)
  689. return self.prepare_for_model(
  690. first_ids,
  691. pair_ids=None,
  692. add_special_tokens=add_special_tokens,
  693. padding=padding_strategy.value,
  694. truncation=truncation_strategy.value,
  695. max_length=max_length,
  696. stride=stride,
  697. pad_to_multiple_of=pad_to_multiple_of,
  698. padding_side=padding_side,
  699. return_tensors=return_tensors,
  700. prepend_batch_axis=True,
  701. return_attention_mask=return_attention_mask,
  702. return_token_type_ids=return_token_type_ids,
  703. return_overflowing_tokens=return_overflowing_tokens,
  704. return_special_tokens_mask=return_special_tokens_mask,
  705. return_length=return_length,
  706. verbose=verbose,
  707. )
  708. @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  709. def prepare_for_model(
  710. self,
  711. ids: list[int],
  712. pair_ids: None = None,
  713. add_special_tokens: bool = True,
  714. padding: bool | str | PaddingStrategy = False,
  715. truncation: bool | str | TruncationStrategy | None = None,
  716. max_length: int | None = None,
  717. stride: int = 0,
  718. pad_to_multiple_of: int | None = None,
  719. padding_side: str | None = None,
  720. return_tensors: str | TensorType | None = None,
  721. return_token_type_ids: bool | None = None,
  722. return_attention_mask: bool | None = None,
  723. return_overflowing_tokens: bool = False,
  724. return_special_tokens_mask: bool = False,
  725. return_length: bool = False,
  726. verbose: bool = True,
  727. prepend_batch_axis: bool = False,
  728. return_offsets_mapping: Literal[False] = False,
  729. split_special_tokens: Literal[False] = False,
  730. **kwargs,
  731. ) -> BatchEncoding:
  732. """
  733. Prepares a sequence of input id so that it can be used by the model. It
  734. adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
  735. manages a moving window (with user defined stride) for overflowing tokens.
  736. Args:
  737. ids (`list[int]`):
  738. Tokenized input ids of the first sequence.
  739. pair_ids (`None`, *optional*):
  740. Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
  741. """
  742. if return_offsets_mapping or split_special_tokens:
  743. raise ValueError(
  744. "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
  745. )
  746. if pair_ids is not None:
  747. raise ValueError(
  748. "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
  749. )
  750. if kwargs:
  751. raise ValueError(
  752. f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
  753. )
  754. padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
  755. padding=padding,
  756. truncation=truncation,
  757. max_length=max_length,
  758. pad_to_multiple_of=pad_to_multiple_of,
  759. verbose=verbose,
  760. **kwargs,
  761. )
  762. # Validation
  763. if (
  764. return_overflowing_tokens
  765. and truncation_strategy == TruncationStrategy.LONGEST_FIRST
  766. and pair_ids is not None
  767. ):
  768. raise ValueError(
  769. "Not possible to return overflowing tokens for pair of sequences with the "
  770. "`longest_first`. Please select another truncation strategy than `longest_first`, "
  771. "for instance `only_second` or `only_first`."
  772. )
  773. # Defaults
  774. if return_token_type_ids is None:
  775. return_token_type_ids = "token_type_ids" in self.model_input_names
  776. if return_attention_mask is None:
  777. return_attention_mask = "attention_mask" in self.model_input_names
  778. # Truncation
  779. num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
  780. total_len = len(ids) + len(pair_ids or []) + num_special
  781. overflowing_tokens = []
  782. if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
  783. ids, _, overflowing_tokens = self.truncate_sequences(
  784. ids,
  785. pair_ids=None,
  786. num_tokens_to_remove=total_len - max_length,
  787. truncation_strategy=truncation_strategy,
  788. stride=stride,
  789. )
  790. # Add special tokens
  791. if add_special_tokens:
  792. sequence = self.build_inputs_with_special_tokens(ids, None)
  793. token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
  794. else:
  795. sequence = ids
  796. token_type_ids = [0] * len(sequence)
  797. # Build output
  798. encoded_inputs = {"input_ids": sequence}
  799. if return_token_type_ids:
  800. encoded_inputs["token_type_ids"] = token_type_ids
  801. if return_special_tokens_mask:
  802. encoded_inputs["special_tokens_mask"] = (
  803. self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
  804. )
  805. if return_overflowing_tokens and not return_tensors and overflowing_tokens:
  806. encoded_inputs["overflowing_tokens"] = overflowing_tokens
  807. encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
  808. # Check sequence length and warn if needed
  809. self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
  810. # Pad
  811. if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
  812. encoded_inputs = self.pad(
  813. encoded_inputs,
  814. max_length=max_length,
  815. padding=padding_strategy.value,
  816. pad_to_multiple_of=pad_to_multiple_of,
  817. padding_side=padding_side,
  818. return_attention_mask=return_attention_mask,
  819. )
  820. if return_length:
  821. encoded_inputs["length"] = len(encoded_inputs["input_ids"])
  822. return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
  823. def truncate_sequences( # type: ignore[override]
  824. self,
  825. ids: list[int],
  826. pair_ids: None = None,
  827. num_tokens_to_remove: int = 0,
  828. truncation_strategy: str | TruncationStrategy = "longest_first",
  829. stride: int = 0,
  830. **kwargs,
  831. ) -> tuple[list[int], None, list[int]]:
  832. """
  833. Truncates a sequence pair in-place following the strategy.
  834. Args:
  835. ids (`list[int]`):
  836. Tokenized input ids. Can be obtained from a string by chaining the `tokenize` and
  837. `convert_tokens_to_ids` methods.
  838. pair_ids (`None`, *optional*):
  839. Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.truncate_sequences`.
  840. num_tokens_to_remove (`int`, *optional*, defaults to 0):
  841. Number of tokens to remove using the truncation strategy.
  842. truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
  843. The strategy to follow for truncation. Can be:
  844. - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
  845. maximum acceptable input length for the model if that argument is not provided.
  846. - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
  847. than the model maximum admissible input size).
  848. stride (`int`, *optional*, defaults to 0):
  849. If set to a positive number, the overflowing tokens returned will contain some tokens from the main
  850. sequence returned. The value of this argument defines the number of additional tokens.
  851. Returns:
  852. `Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
  853. overflowing tokens. `None` is returned to match Transformers signature.
  854. """
  855. if pair_ids:
  856. raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")
  857. if not isinstance(truncation_strategy, TruncationStrategy):
  858. truncation_strategy = TruncationStrategy(truncation_strategy)
  859. if truncation_strategy in [
  860. TruncationStrategy.ONLY_FIRST,
  861. TruncationStrategy.ONLY_SECOND,
  862. ]:
  863. raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
  864. if num_tokens_to_remove <= 0:
  865. return ids, None, []
  866. overflowing_tokens = []
  867. if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
  868. window_len = min(len(ids), stride + num_tokens_to_remove)
  869. if self.truncation_side == "left":
  870. overflowing_tokens = ids[:window_len]
  871. ids = ids[num_tokens_to_remove:]
  872. else:
  873. overflowing_tokens = ids[-window_len:]
  874. ids = ids[:-num_tokens_to_remove]
  875. return ids, None, overflowing_tokens
  876. def apply_chat_template( # type: ignore[override]
  877. self,
  878. conversation: list[dict[str, str]] | list[list[dict[str, str]]],
  879. tools: list[dict | Callable] | None = None,
  880. add_generation_prompt: bool = False,
  881. continue_final_message: bool = False,
  882. tokenize: bool = True,
  883. padding: bool | str | PaddingStrategy = False,
  884. truncation: bool = False,
  885. max_length: int | None = None,
  886. return_tensors: str | TensorType | None = None,
  887. return_dict: bool = True,
  888. reasoning_effort: ReasoningEffort | None = None,
  889. **kwargs,
  890. ) -> str | list[int] | list[str] | list[list[int]] | BatchEncoding:
  891. """
  892. Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
  893. ids.
  894. Args:
  895. conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts
  896. with "role" and "content" keys, representing the chat history so far.
  897. tools (`List[Union[Dict, Callable]]`, *optional*):
  898. A list of tools (callable functions) that will be accessible to the model. If the template does not
  899. support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
  900. giving the name, description and argument types for the tool. See our
  901. [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
  902. for more information.
  903. add_generation_prompt (`bool`, *optional*):
  904. This argument is a no-op for `MistralCommonBackend`. However, it cannot be used at the same time as `continue_final_message` to keep the API consistent.
  905. If any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
  906. continue_final_message (bool, *optional*):
  907. If this is set, the chat will be formatted so that the final
  908. message in the chat is open-ended, without any EOS tokens. The model will continue this message
  909. rather than starting a new one. This allows you to "prefill" part of
  910. the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
  911. tokenize (`bool`, defaults to `True`):
  912. Whether to tokenize the output. If `False`, the output will be a string.
  913. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
  914. Select a strategy to pad the returned sequences (according to the model's padding side and padding
  915. index) among:
  916. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  917. sequence if provided).
  918. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  919. acceptable input length for the model if that argument is not provided.
  920. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  921. lengths).
  922. truncation (`bool`, defaults to `False`):
  923. Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
  924. max_length (`int`, *optional*):
  925. Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
  926. not specified, the tokenizer's `max_length` attribute will be used as a default.
  927. return_tensors (`str` or [`~utils.TensorType`], *optional*):
  928. If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
  929. values are:
  930. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  931. return_dict (`bool`, defaults to `False`):
  932. Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
  933. If at least one conversation contains an image, its pixel values will be returned in the `pixel_values` key and image sizes in the `image_sizes` key.
  934. reasoning_effort (`ReasoningEffort`, *optional*):
  935. The reasoning effort to use for the chat completion for models that support it. Possible values are:
  936. - `ReasoningEffort.none`: The model will not reason.
  937. - `ReasoningEffort.high`: The model will use a reasoning approach.
  938. If not specified, the default reasoning effort will be used.
  939. kwargs (additional keyword arguments, *optional*):
  940. Not supported by `MistralCommonBackend.apply_chat_template`.
  941. Will raise an error if used.
  942. Returns:
  943. `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: The tokenized chat so far, including control tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
  944. """
  945. if kwargs:
  946. raise ValueError(
  947. f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.apply_chat_template`."
  948. )
  949. if not isinstance(truncation, bool):
  950. raise TypeError("`truncation` must be a boolean for `apply_chat_template` method.")
  951. if add_generation_prompt and continue_final_message:
  952. raise ValueError("Cannot use both `add_generation_prompt` and `continue_final_message`.")
  953. if isinstance(conversation, (list, tuple)) and (
  954. isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
  955. ):
  956. conversations = conversation
  957. is_batched = True
  958. else:
  959. conversations = [conversation]
  960. is_batched = False
  961. if add_generation_prompt:
  962. for conversation in conversations:
  963. last_message = conversation[-1]
  964. if last_message.get("role") == "assistant":
  965. raise ValueError(
  966. "The last message in the conversation is already an assistant message. Consider using `continue_final_message` instead."
  967. )
  968. def _maybe_adapt_message(message: dict[str, Any]) -> None:
  969. """Adapt message to `mistral-common` format and leave validation to `mistral-common`."""
  970. if not isinstance(message, dict):
  971. return message
  972. maybe_list_content: str | list[dict[str, str | dict[str, Any]]] | None = message.get("content")
  973. if not maybe_list_content or isinstance(maybe_list_content, str):
  974. return message
  975. normalized_content: list[dict[str, str | dict[str, Any]]] = []
  976. message = message.copy()
  977. for content in maybe_list_content:
  978. content_type = content.get("type", None)
  979. if not content_type:
  980. continue
  981. elif content_type == "image":
  982. maybe_url: str | None = content.get("url")
  983. maybe_path: str | None = content.get("path")
  984. maybe_base64: str | None = content.get("base64")
  985. if maybe_url:
  986. image_content = maybe_url
  987. elif maybe_path:
  988. if not maybe_path.startswith("file://"):
  989. maybe_path = Path(maybe_path).resolve().as_uri()
  990. image_content = maybe_path
  991. elif maybe_base64:
  992. if not maybe_base64.startswith("data:image"):
  993. maybe_base64 = "data:image/unk;base64," + maybe_base64
  994. image_content = maybe_base64
  995. else:
  996. raise ValueError("Image content must be specified.")
  997. normalized_content.append({"type": "image_url", "image_url": {"url": image_content}})
  998. elif content_type == "audio":
  999. maybe_url: str | None = content.get("url")
  1000. maybe_path: str | None = content.get("path")
  1001. maybe_base64: str | None = content.get("base64")
  1002. if maybe_url or maybe_path:
  1003. audio_data = load_audio_as(maybe_url or maybe_path, return_format="dict", force_mono=True)
  1004. normalized_content.append({"type": "input_audio", "input_audio": audio_data})
  1005. continue
  1006. if not maybe_base64:
  1007. raise ValueError("Audio content must be specified.")
  1008. normalized_content.append({"type": "audio_url", "audio_url": {"url": maybe_base64}})
  1009. else:
  1010. normalized_content.append(content)
  1011. message["content"] = normalized_content
  1012. return message
  1013. outputs = []
  1014. images: list[np.ndarray] = []
  1015. audios: list[np.ndarray] = []
  1016. for conversation in conversations:
  1017. messages: list[dict[str, str | list[dict[str, str | dict[str, Any]]]]] = []
  1018. for message in conversation:
  1019. message = _maybe_adapt_message(message)
  1020. messages.append(message)
  1021. chat_request = ChatCompletionRequest.from_openai(
  1022. messages=messages,
  1023. tools=tools,
  1024. continue_final_message=continue_final_message,
  1025. reasoning_effort=reasoning_effort,
  1026. )
  1027. tokenized_request = self.tokenizer.encode_chat_completion(chat_request)
  1028. if tokenize:
  1029. outputs.append(tokenized_request.tokens)
  1030. else:
  1031. outputs.append(tokenized_request.text)
  1032. images.extend(tokenized_request.images)
  1033. audios.extend([el.audio_array for el in tokenized_request.audios])
  1034. if not is_batched:
  1035. outputs = outputs[0]
  1036. if tokenize:
  1037. out = self(
  1038. outputs,
  1039. padding=padding,
  1040. truncation=truncation,
  1041. max_length=max_length,
  1042. add_special_tokens=False,
  1043. return_tensors=return_tensors,
  1044. )
  1045. if return_dict:
  1046. if images:
  1047. pixel_values: list[np.ndarray] | np.ndarray | torch.Tensor
  1048. if return_tensors == "pt":
  1049. if not is_torch_available():
  1050. raise ImportError(
  1051. "Unable to convert output to PyTorch tensors format, PyTorch is not installed."
  1052. )
  1053. pixel_values = torch.from_numpy(np.stack(images))
  1054. elif return_tensors == "np":
  1055. pixel_values = np.array(images)
  1056. elif return_tensors is None:
  1057. pixel_values = images
  1058. else:
  1059. raise ValueError(f"Unsupported return_tensors type: {return_tensors}")
  1060. out.data["pixel_values"] = pixel_values
  1061. if images:
  1062. out.data["image_sizes"] = self._get_image_sizes_for_tensor(images, return_tensors)
  1063. if audios:
  1064. if return_tensors is not None:
  1065. raise NotImplementedError(
  1066. "When passing audio content in apply_chat_template, `return_tensors` must be None since we cannot batch the audio inputs. The returned audio will be a list of numpy arrays."
  1067. )
  1068. # Transformers convention is audio for plural audio (audio does not take a "s")
  1069. out.data["audio"] = audios
  1070. return out
  1071. else:
  1072. return out["input_ids"]
  1073. else:
  1074. logger.warning(
  1075. "`MistralCommonBackend.apply_chat_template(..., tokenize=False)` is unsafe and may lead to unexpected behavior."
  1076. " Please consider using `tokenize=True` instead and don't encode the output manually."
  1077. )
  1078. return outputs
  1079. def _get_image_sizes_for_tensor(
  1080. self, images: list[np.ndarray], return_tensors: str | TensorType | None
  1081. ) -> "list[list[int]] | np.ndarray | torch.Tensor":
  1082. """
  1083. Convert image sizes to the appropriate format based on return_tensors.
  1084. Args:
  1085. images: List of image arrays
  1086. return_tensors: The tensor type to return
  1087. Returns:
  1088. Image sizes in the appropriate format
  1089. """
  1090. image_sizes = []
  1091. for image in images:
  1092. height, width = get_image_size(image)
  1093. image_sizes.append([height, width])
  1094. if return_tensors == "pt":
  1095. return torch.tensor(image_sizes, dtype=torch.long)
  1096. elif return_tensors == "np":
  1097. return np.array(image_sizes, dtype=np.int64)
  1098. else:
  1099. return image_sizes
  1100. def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
  1101. """
  1102. Build model inputs from a sequence by adding special tokens.
  1103. This method dynamically builds inputs based on the tokenizer's `mode`:
  1104. - `"test"`: seq0 [EOS]
  1105. - `"finetuning"`: [BOS] seq0
  1106. Args:
  1107. token_ids_0 (`list[int]`):
  1108. List of IDs to which the special tokens will be added.
  1109. token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
  1110. Returns:
  1111. `list[int]`: List of input IDs with the appropriate special tokens.
  1112. """
  1113. if token_ids_1 is not None:
  1114. raise ValueError(
  1115. "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
  1116. )
  1117. if self.mode == ValidationMode.test:
  1118. # [BOS] seq0
  1119. return [self.bos_token_id] + token_ids_0
  1120. else:
  1121. # [BOS] seq0 [EOS]
  1122. return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
  1123. def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
  1124. """
  1125. Create a mask of zeroes from the token ids with special tokens added.
  1126. Kept to match Transformers' implementation.
  1127. Args:
  1128. token_ids_0 (`list[int]`):
  1129. List of IDs.
  1130. token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
  1131. Returns:
  1132. `list[int]`: Token type IDs according to the configured pattern.
  1133. """
  1134. if token_ids_1 is not None:
  1135. raise ValueError(
  1136. "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
  1137. )
  1138. sequence = self.build_inputs_with_special_tokens(token_ids_0)
  1139. return [0] * len(sequence)
  def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
      """
      Return how many special tokens are added when encoding a single sequence.

      <Tip>

      The count is obtained by building the inputs for an empty sequence with
      `build_inputs_with_special_tokens` and measuring the result, so avoid calling this inside your training loop.

      </Tip>

      Args:
          pair (`Literal[False]`, *optional*):
              Must be falsy; kept to match Transformers' signature.

      Returns:
          `int`: Number of special tokens added to sequences.

      Raises:
          ValueError: If `pair` is truthy.
      """
      if not pair:
          # Anything present after building from an empty sequence is a special token.
          dummy_inputs = self.build_inputs_with_special_tokens([], None)
          return len(dummy_inputs)

      raise ValueError(
          "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
      )
  @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  def __call__(
      self,
      text: TextInput | EncodedInput | list[TextInput] | list[EncodedInput] | None = None,
      text_pair: None = None,
      text_target: None = None,
      text_pair_target: None = None,
      add_special_tokens: bool = True,
      padding: bool | str | PaddingStrategy = False,
      truncation: bool | str | TruncationStrategy | None = None,
      max_length: int | None = None,
      stride: int = 0,
      pad_to_multiple_of: int | None = None,
      padding_side: str | None = None,
      return_tensors: str | TensorType | None = None,
      return_attention_mask: bool | None = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      return_offsets_mapping: Literal[False] = False,
      split_special_tokens: Literal[False] = False,
      **kwargs,
  ) -> BatchEncoding:
      """
      Main method to tokenize and prepare for the model one sequence or a batch of sequences.

      Options that `MistralCommonBackend` does not support are rejected with a `ValueError` before the call is
      delegated to the parent class: `return_offsets_mapping`, `split_special_tokens`, the `only_first` /
      `only_second` truncation strategies, any extra keyword argument, and a truthy `text_pair`, `text_target` or
      `text_pair_target`.

      Args:
          text (`str`, `list[str]`, `list[list[str]]`, *optional*):
              The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of int
              (encoded strings).
          text_pair (`None`, *optional*):
              Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
          text_target (`None`, *optional*):
              Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
          text_pair_target (`None`, *optional*):
              Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
      """
      # The checks run in a fixed order so the first offending option decides which error is reported.
      if return_offsets_mapping or split_special_tokens:
          raise ValueError(
              "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
          )

      unsupported_truncation = (
          TruncationStrategy.ONLY_FIRST,
          TruncationStrategy.ONLY_SECOND,
          "only_first",
          "only_second",
      )
      if truncation in unsupported_truncation:
          raise ValueError(
              "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
          )

      unexpected_kwargs = list(kwargs.keys())
      if unexpected_kwargs:
          raise ValueError(f"Kwargs {unexpected_kwargs} are not supported by `MistralCommonBackend.__call__`.")

      if any((text_pair, text_target, text_pair_target)):
          raise ValueError(
              "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
          )

      # `text_pair_target`, `return_offsets_mapping` and `split_special_tokens` are validated above but not forwarded.
      forwarded = {
          "text": text,
          "text_pair": text_pair,
          "text_target": text_target,
          "add_special_tokens": add_special_tokens,
          "padding": padding,
          "truncation": truncation,
          "max_length": max_length,
          "stride": stride,
          "pad_to_multiple_of": pad_to_multiple_of,
          "padding_side": padding_side,
          "return_tensors": return_tensors,
          "return_attention_mask": return_attention_mask,
          "return_overflowing_tokens": return_overflowing_tokens,
          "return_special_tokens_mask": return_special_tokens_mask,
          "return_length": return_length,
          "verbose": verbose,
      }
      return super().__call__(**forwarded)
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name_or_path: str | os.PathLike,
      *init_inputs,
      mode: str | ValidationMode = ValidationMode.test,
      cache_dir: str | os.PathLike | None = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: str | bool | None = None,
      revision: str = "main",
      model_max_length: int = VERY_LARGE_INTEGER,
      padding_side: str = "left",
      truncation_side: str = "right",
      model_input_names: list[str] | None = None,
      clean_up_tokenization_spaces: bool = False,
      **kwargs,
  ):
      r"""
      Instantiate a `MistralCommonBackend` from a predefined tokenizer.

      If `pretrained_model_name_or_path` is an existing directory, the tokenizer file is picked among the files it
      contains; otherwise it is treated as a Hub repo id and the tokenizer file is downloaded.

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              Can be either:

              - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
              - A path to a *directory* containing the tokenizer config, for instance saved
                using the [`MistralCommonBackend.tokenization_mistral_common.save_pretrained`] method, e.g.,
                `./my_model_directory/`.
          mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
              Validation mode for the `MistralTokenizer` tokenizer. Possible values are:

              - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
              - `"test"` or `ValidationMode.test`: The test mode.

              It changes how the tokenizer validates the input and prepares the request to the model.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force the (re-)download of the vocabulary files and override the cached versions if
              they exist.
          local_files_only (`bool`, *optional*, defaults to `False`):
              Whether or not to only rely on local files and not to attempt to download any files.
          token (`str` or *bool*, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
              when running `hf auth login` (stored in `~/.huggingface`).
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.
          model_max_length (`int`, *optional*, defaults to `VERY_LARGE_INTEGER`):
              Maximum length forwarded to the tokenizer constructor. It is the length used when a maximum length is
              required by one of the truncation/padding parameters.
          padding_side (`str`, *optional*, defaults to `"left"`):
              The side on which the model should have padding applied. Should be selected between ['right', 'left'].
          truncation_side (`str`, *optional*, defaults to `"right"`):
              The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
          model_input_names (`List[str]`, *optional*):
              The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
              `"attention_mask"`). Default value is picked from the class attribute of the same name.
          clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
              Whether or not the model should clean up the spaces that were added when splitting the input text during
              the tokenization process.
          kwargs (additional keyword arguments, *optional*):
              Only a fixed set of names is tolerated (the valid init kwargs plus `trust_remote_code`,
              `_from_pipeline`, `_commit_hash`, `dtype` and `subfolder`). Tolerated values are not forwarded to the
              constructor; any other name raises an error.

      Raises:
          ValueError: If positional `init_inputs` are given, if an unsupported keyword argument is given, or if
              `mode` is invalid.
      """
      if init_inputs:
          raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

      # Handle kwargs and AutoTokenizer/AutoProcessor case
      loader_only_kwargs = {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "subfolder"}
      valid_kwargs = _VALID_INIT_KWARGS.union(loader_only_kwargs)
      if set(kwargs.keys()).difference(valid_kwargs):
          raise ValueError(
              f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
          )

      mode = cls._get_validation_mode(mode)

      if os.path.isdir(pretrained_model_name_or_path):
          # Local directory: pick the tokenizer file among the files it contains.
          candidate_files = os.listdir(pretrained_model_name_or_path)
          tokenizer_file = get_one_valid_tokenizer_file(candidate_files)
          tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
      else:
          # Anything that is not an existing directory is treated as a Hub repo id.
          tokenizer_path = download_tokenizer_from_hf_hub(
              repo_id=pretrained_model_name_or_path,
              cache_dir=cache_dir,
              token=token,
              revision=revision,
              force_download=force_download,
              local_files_only=local_files_only,
          )

      return cls(
          tokenizer_path=tokenizer_path,
          mode=mode,
          model_max_length=model_max_length,
          padding_side=padding_side,
          truncation_side=truncation_side,
          model_input_names=model_input_names,
          clean_up_tokenization_spaces=clean_up_tokenization_spaces,
      )
  1327. def save_pretrained( # type: ignore[override]
  1328. self,
  1329. save_directory: str | os.PathLike | Path,
  1330. push_to_hub: bool = False,
  1331. token: str | bool | None = None,
  1332. commit_message: str | None = None,
  1333. repo_id: str | None = None,
  1334. private: bool | None = None,
  1335. **kwargs,
  1336. ) -> tuple[str, ...]:
  1337. """
  1338. Save the full tokenizer state.
  1339. This method make sure the full tokenizer can then be re-loaded using the
  1340. [`~MistralCommonBackend.tokenization_mistral_common.from_pretrained`] class method.
  1341. Args:
  1342. save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
  1343. push_to_hub (`bool`, *optional*, defaults to `False`):
  1344. Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
  1345. repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
  1346. namespace).
  1347. token (`str` or *bool*, *optional*, defaults to `None`):
  1348. The token to use to push to the model hub. If `True`, will use the token in the `HF_TOKEN` environment
  1349. variable.
  1350. commit_message (`str`, *optional*): The commit message to use when pushing to the hub.
  1351. repo_id (`str`, *optional*): The name of the repository to which push to the Hub.
  1352. private (`bool`, *optional*): Whether the model repository is private or not.
  1353. kwargs (`Dict[str, Any]`, *optional*):
  1354. Not supported by `MistralCommonBackend.save_pretrained`.
  1355. Will raise an error if used.
  1356. Returns:
  1357. A tuple of `str`: The files saved.
  1358. """
  1359. if kwargs:
  1360. raise ValueError(
  1361. f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.save_pretrained`."
  1362. )
  1363. save_directory = Path(save_directory)
  1364. save_directory.mkdir(parents=True, exist_ok=True)
  1365. shutil.copy(self._tokenizer_path, save_directory)
  1366. if push_to_hub:
  1367. repo_id = repo_id or str(save_directory).split(os.path.sep)[-1]
  1368. repo_id = create_repo(repo_id, token=token, private=private, exist_ok=True).repo_id
  1369. files_timestamps = self._get_files_timestamps(save_directory)
  1370. self._upload_modified_files(
  1371. save_directory,
  1372. repo_id,
  1373. files_timestamps,
  1374. commit_message=commit_message,
  1375. token=token,
  1376. )
  1377. return (str(save_directory / self._tokenizer_path.name),)
  @staticmethod
  def _get_validation_mode(mode: str | ValidationMode) -> ValidationMode:
      """
      Get the validation mode from a string or a `ValidationMode`.

      Args:
          mode (`str` or `ValidationMode`):
              A `ValidationMode` member, or the name of one.

      Returns:
          `ValidationMode`: Either `ValidationMode.finetuning` or `ValidationMode.test`.

      Raises:
          ValueError: If `mode` is neither a string nor a `ValidationMode`, if the string names no member, or if
              the resolved member is not `finetuning` or `test`.
      """
      _invalid_mode_msg = (
          f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
      )
      if isinstance(mode, str):
          try:
              mode = ValidationMode[mode]
          except KeyError:
              # The message already names the offending value, so the KeyError context is suppressed.
              raise ValueError(_invalid_mode_msg) from None
      elif not isinstance(mode, ValidationMode):
          raise ValueError(_invalid_mode_msg)

      # Any other `ValidationMode` member is rejected as well.
      if mode not in (ValidationMode.finetuning, ValidationMode.test):
          raise ValueError(_invalid_mode_msg)
      return mode
  def __repr__(self) -> str:
      """Return a one-line summary of the tokenizer's main settings."""
      # MistralCommonBackend does not implement added_tokens_decoder, so we need a custom repr
      class_name = self.__class__.__name__
      fields = (
          f"name_or_path='{self.name_or_path}'",
          f"vocab_size={self.vocab_size}",
          f"model_max_length={self.model_max_length}",
          f"padding_side='{self.padding_side}'",
          f"truncation_side='{self.truncation_side}'",
          f"special_tokens={self.special_tokens_map}",
      )
      return f"{class_name}({', '.join(fields)})"
  def added_tokens_decoder(self):
      """
      `MistralCommonBackend` does not implement `added_tokens_decoder`.

      Raises:
          NotImplementedError: Always.
      """
      # NOTE(review): declared as a plain method here; if the base class exposes `added_tokens_decoder` as a
      # property, plain attribute access would not raise - confirm which form is intended.
      unsupported = "`MistralCommonBackend` does not implement `added_tokens_decoder`."
      raise NotImplementedError(unsupported)
  def add_special_tokens(
      self,
      special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
      replace_extra_special_tokens: bool = True,
  ):
      r"""
      `MistralCommonBackend` does not implement `add_special_tokens` by design.

      If you would like this behaviour to be implemented, please open an issue in the `Transformers` or
      `mistral-common` repositories to request it.

      Args:
          special_tokens_dict (`dict`): Ignored.
          replace_extra_special_tokens (`bool`, *optional*, defaults to `True`): Ignored.

      Raises:
          NotImplementedError: Always.
      """
      unsupported = "`MistralCommonBackend` does not implement `add_special_tokens`."
      raise NotImplementedError(unsupported)
  def add_tokens(  # type: ignore[override]
      self,
      special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
      replace_extra_special_tokens: bool = True,
  ):
      """
      `MistralCommonBackend` does not implement `add_tokens` by design.

      If you would like this behaviour to be implemented, please open an issue in the `Transformers` or
      `mistral-common` repositories to request it.

      Args:
          special_tokens_dict (`dict`): Ignored.
          replace_extra_special_tokens (`bool`, *optional*, defaults to `True`): Ignored.

      Raises:
          NotImplementedError: Always.
      """
      unsupported = "`MistralCommonBackend` does not implement `add_tokens`."
      raise NotImplementedError(unsupported)
  @classmethod
  def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True):  # type: ignore[override]
      """
      `MistralCommonBackend` does not implement `convert_added_tokens` by design.

      If you would like this behaviour to be implemented, please open an issue in the `Transformers` or
      `mistral-common` repositories to request it.

      Declared as a class method, matching its `cls` receiver, so that calls made on the class and calls made on
      an instance both reach the error below.

      Args:
          obj (`AddedToken` or `Any`): Ignored.
          save (`bool`, *optional*, defaults to `False`): Ignored.
          add_type_field (`bool`, *optional*, defaults to `True`): Ignored.

      Raises:
          NotImplementedError: Always.
      """
      raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
  1429. def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
  1430. """`MistralCommonBackend` does not implement `get_chat_template` by design as `mistral-common` does not use chat templates."""
  1431. raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
  1432. def save_chat_templates(
  1433. self,
  1434. save_directory: str | os.PathLike,
  1435. tokenizer_config: dict,
  1436. filename_prefix: str | None,
  1437. save_jinja_files: bool,
  1438. ):
  1439. """`MistralCommonBackend` does not implement `save_chat_templates` by design as `mistral-common` does not use chat templates."""
  1440. raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
  1441. def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
  1442. """
  1443. `MistralCommonBackend` does not implement `save_vocabulary` by design.
  1444. This is because `mistral-common` is configured by one tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
  1445. """
  1446. raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
  # Backward-compatibility alias: codebases still importing the legacy `MistralCommonTokenizer` name get the
  # `MistralCommonBackend` class itself.
  MistralCommonTokenizer = MistralCommonBackend