tokenization_utils_base.py 171 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565
  1. # base
  2. # Copyright 2020 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
user-facing encoding methods), the special-tokens mixin (hosts the special tokens logic), and BatchEncoding (wraps the
output dictionary with special methods for the fast tokenizers).
  19. """
  20. from __future__ import annotations
  21. import copy
  22. import json
  23. import os
  24. import re
  25. import warnings
  26. from collections import OrderedDict, UserDict
  27. from collections.abc import Callable, Collection, Mapping, Sequence, Sized
  28. from dataclasses import dataclass
  29. from pathlib import Path
  30. from typing import TYPE_CHECKING, Any, NamedTuple, Union
  31. import numpy as np
  32. from huggingface_hub import create_repo, is_offline_mode, list_repo_files
  33. from packaging import version
  34. from . import __version__
  35. from .dynamic_module_utils import custom_object_save
  36. from .utils import (
  37. CHAT_TEMPLATE_DIR,
  38. CHAT_TEMPLATE_FILE,
  39. ExplicitEnum,
  40. PaddingStrategy,
  41. PushToHubMixin,
  42. TensorType,
  43. add_end_docstrings,
  44. cached_file,
  45. copy_func,
  46. extract_commit_hash,
  47. is_mlx_available,
  48. is_numpy_array,
  49. is_protobuf_available,
  50. is_tokenizers_available,
  51. is_torch_available,
  52. is_torch_device,
  53. is_torch_tensor,
  54. list_repo_templates,
  55. logging,
  56. requires_backends,
  57. to_py_obj,
  58. )
  59. from .utils.chat_parsing_utils import recursive_parse
  60. from .utils.chat_template_utils import render_jinja_template
  61. from .utils.import_utils import PROTOBUF_IMPORT_ERROR
  62. if TYPE_CHECKING:
  63. if is_torch_available():
  64. import torch
  65. def import_protobuf_decode_error(error_message=""):
  66. if is_protobuf_available():
  67. from google.protobuf.message import DecodeError
  68. return DecodeError
  69. else:
  70. raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
  71. def flatten(arr: list):
  72. res = []
  73. if len(arr) > 0:
  74. for sub_arr in arr:
  75. if isinstance(arr[0], (list, tuple)):
  76. res.extend(flatten(sub_arr))
  77. else:
  78. res.append(sub_arr)
  79. return res
  80. if is_tokenizers_available() or TYPE_CHECKING:
  81. from tokenizers import Encoding as EncodingFast
  82. if is_tokenizers_available():
  83. from tokenizers import AddedToken
  84. else:
  85. @dataclass(frozen=False, eq=True)
  86. class AddedToken:
  87. """
  88. AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the
  89. way it should behave.
  90. The `normalized` will default to `not special` if it is not specified, similarly to the definition in
  91. `tokenizers`.
  92. """
  93. def __init__(
  94. self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None
  95. ):
  96. self.content = content
  97. self.single_word = single_word
  98. self.lstrip = lstrip
  99. self.rstrip = rstrip
  100. self.special = special
  101. self.normalized = normalized if normalized is not None else not special
  102. def __getstate__(self):
  103. return self.__dict__
  104. def __str__(self):
  105. return self.content
# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
TextInput = str  # a raw string
PreTokenizedInput = list[str]  # a list of strings
EncodedInput = list[int]  # a list of integers
TextInputPair = tuple[str, str]  # pair variant of `TextInput`
PreTokenizedInputPair = tuple[list[str], list[str]]  # pair variant of `PreTokenizedInput`
EncodedInputPair = tuple[list[int], list[int]]  # pair variant of `EncodedInput`

# Define type aliases for text-related non-text modalities
# ("torch.Tensor" is kept as a string because torch is only imported under TYPE_CHECKING)
AudioInput = Union[np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]]

# Slow tokenizers used to be saved in three separated files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"
# Matches file names of the form `tokenizer.<anything>.json` and captures the middle part.
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
  125. class TruncationStrategy(ExplicitEnum):
  126. """
  127. Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in
  128. an IDE.
  129. """
  130. ONLY_FIRST = "only_first"
  131. ONLY_SECOND = "only_second"
  132. LONGEST_FIRST = "longest_first"
  133. DO_NOT_TRUNCATE = "do_not_truncate"
  134. class CharSpan(NamedTuple):
  135. """
  136. Character span in the original string.
  137. Args:
  138. start (`int`): Index of the first character in the original string.
  139. end (`int`): Index of the character following the last character in the original string.
  140. """
  141. start: int
  142. end: int
  143. class TokenSpan(NamedTuple):
  144. """
  145. Token span in an encoded string (list of tokens).
  146. Args:
  147. start (`int`): Index of the first token in the span.
  148. end (`int`): Index of the token following the last token in the span.
  149. """
  150. start: int
  151. end: int
  152. class BatchEncoding(UserDict):
  153. """
  154. Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
  155. [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and
  156. [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).
  157. This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
  158. utility methods to map from word/character space to token space.
  159. Args:
  160. data (`dict`, *optional*):
  161. Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
  162. ('input_ids', 'attention_mask', etc.).
  163. encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
  164. If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
  165. space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this
  166. information.
  167. tensor_type (`Union[None, str, TensorType]`, *optional*):
  168. You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
  169. initialization.
  170. prepend_batch_axis (`bool`, *optional*, defaults to `False`):
  171. Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). Note that this
  172. parameter has an effect if the parameter `tensor_type` is set, *otherwise has no effect*.
        n_sequences (`Optional[int]`, *optional*):
            The number of sequences used to generate each sample of the batch (`1` for a single sentence, `2` for a
            pair of sentences). If not given, it is read from the first `tokenizers.Encoding` when one is provided.
  176. """
  177. def __init__(
  178. self,
  179. data: dict[str, Any] | None = None,
  180. encoding: EncodingFast | Sequence[EncodingFast] | None = None,
  181. tensor_type: None | str | TensorType = None,
  182. prepend_batch_axis: bool = False,
  183. n_sequences: int | None = None,
  184. ):
  185. super().__init__(data)
  186. # If encoding is not None, the fast tokenization is used
  187. if encoding is not None and isinstance(encoding, EncodingFast):
  188. encoding = [encoding]
  189. self._encodings = encoding
  190. if n_sequences is None and encoding is not None and encoding:
  191. n_sequences = encoding[0].n_sequences
  192. self._n_sequences = n_sequences
  193. self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
  194. @property
  195. def n_sequences(self) -> int | None:
  196. """
  197. `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
  198. [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
  199. sentences)
  200. """
  201. return self._n_sequences
  202. def __getitem__(self, item: int | str) -> Any | EncodingFast:
  203. """
  204. If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
  205. etc.).
  206. If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
  207. If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.)
  208. with the constraint of slice.
  209. """
  210. if isinstance(item, str):
  211. return self.data[item]
  212. elif self._encodings is not None:
  213. return self._encodings[item]
  214. elif isinstance(item, slice):
  215. return {key: self.data[key][item] for key in self.data}
  216. else:
  217. raise KeyError(
  218. "Invalid key. Only three types of key are available: "
  219. "(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
  220. )
  221. def __getattr__(self, item: str):
  222. try:
  223. return self.data[item]
  224. except KeyError:
  225. raise AttributeError
  226. def __getstate__(self):
  227. return {"data": self.data, "encodings": self._encodings}
  228. def __setstate__(self, state):
  229. if "data" in state:
  230. self.data = state["data"]
  231. if "encodings" in state:
  232. self._encodings = state["encodings"]
  233. # After this point:
  234. # Extended properties and methods only available for fast (Rust-based) tokenizers
  235. # provided by HuggingFace tokenizers library.
  236. @property
  237. def is_fast(self) -> bool:
  238. """
  239. TOOD: ita i will rm this `bool`: Whether or not this BatchEncoding was created by a fast tokenizer.
  240. """
  241. return self._encodings is not None
  242. @property
  243. def encodings(self) -> list[EncodingFast] | None:
  244. """
  245. `Optional[list[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if
  246. the input was tokenized through Python (i.e., not a fast) tokenizer.
  247. """
  248. return self._encodings
  249. def tokens(self, batch_index: int = 0) -> list[str]:
  250. """
  251. Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
  252. integer indices) at a given batch index (only works for the output of a fast tokenizer).
  253. Args:
  254. batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
  255. Returns:
  256. `list[str]`: The list of tokens at that index.
  257. """
  258. if not self._encodings:
  259. raise ValueError(
  260. "tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
  261. " class)."
  262. )
  263. return self._encodings[batch_index].tokens
  264. def sequence_ids(self, batch_index: int = 0) -> list[int | None]:
  265. """
  266. Return a list mapping the tokens to the id of their original sentences:
  267. - `None` for special tokens added around or between sequences,
  268. - `0` for tokens corresponding to words in the first sequence,
  269. - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
  270. encoded.
  271. Args:
  272. batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
  273. Returns:
  274. `list[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
  275. by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
  276. sequence.
  277. """
  278. if not self._encodings:
  279. raise ValueError(
  280. "sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
  281. " class)."
  282. )
  283. return self._encodings[batch_index].sequence_ids
  284. def word_ids(self, batch_index: int = 0) -> list[int | None]:
  285. """
  286. Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
  287. Args:
  288. batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
  289. Returns:
  290. `list[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
  291. tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
  292. (several tokens will be mapped to the same word index if they are parts of that word).
  293. """
  294. if not self._encodings:
  295. raise ValueError(
  296. "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
  297. " class)."
  298. )
  299. return self._encodings[batch_index].word_ids
  300. def token_to_sequence(self, batch_or_token_index: int, token_index: int | None = None) -> int:
  301. """
  302. Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
  303. for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair
  304. Can be called as:
  305. - `self.token_to_sequence(token_index)` if batch size is 1
  306. - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1
  307. This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
  308. words are defined by the user). In this case it allows to easily associate encoded tokens with provided
  309. tokenized words.
  310. Args:
  311. batch_or_token_index (`int`):
  312. Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
  313. the token in the sequence.
  314. token_index (`int`, *optional*):
  315. If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
  316. sequence.
  317. Returns:
  318. `int`: Index of the word in the input sequence.
  319. """
  320. if not self._encodings:
  321. raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
  322. if token_index is not None:
  323. batch_index = batch_or_token_index
  324. else:
  325. batch_index = 0
  326. token_index = batch_or_token_index
  327. if batch_index < 0:
  328. batch_index = self._batch_size + batch_index
  329. if token_index < 0:
  330. token_index = self._seq_len + token_index
  331. return self._encodings[batch_index].token_to_sequence(token_index)
  332. def token_to_word(self, batch_or_token_index: int, token_index: int | None = None) -> int:
  333. """
  334. Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.
  335. Can be called as:
  336. - `self.token_to_word(token_index)` if batch size is 1
  337. - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1
  338. This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
  339. words are defined by the user). In this case it allows to easily associate encoded tokens with provided
  340. tokenized words.
  341. Args:
  342. batch_or_token_index (`int`):
  343. Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
  344. the token in the sequence.
  345. token_index (`int`, *optional*):
  346. If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
  347. sequence.
  348. Returns:
  349. `int`: Index of the word in the input sequence.
  350. """
  351. if not self._encodings:
  352. raise ValueError("token_to_word() is not available when using Python based tokenizers")
  353. if token_index is not None:
  354. batch_index = batch_or_token_index
  355. else:
  356. batch_index = 0
  357. token_index = batch_or_token_index
  358. if batch_index < 0:
  359. batch_index = self._batch_size + batch_index
  360. if token_index < 0:
  361. token_index = self._seq_len + token_index
  362. return self._encodings[batch_index].token_to_word(token_index)
  363. def word_to_tokens(
  364. self, batch_or_word_index: int, word_index: int | None = None, sequence_index: int = 0
  365. ) -> TokenSpan | None:
  366. """
  367. Get the encoded token span corresponding to a word in a sequence of the batch.
  368. Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:
  369. - **start** -- Index of the first token.
  370. - **end** -- Index of the token following the last token.
  371. Can be called as:
  372. - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
  373. - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
  374. 1
  375. This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
  376. are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
  377. words.
  378. Args:
  379. batch_or_word_index (`int`):
  380. Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
  381. the word in the sequence.
  382. word_index (`int`, *optional*):
  383. If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
  384. sequence.
  385. sequence_index (`int`, *optional*, defaults to 0):
  386. If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
  387. or 1) the provided word index belongs to.
  388. Returns:
  389. ([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns
  390. `None` if no tokens correspond to the word. This can happen especially when the token is a special token
  391. that has been used to format the tokenization. For example when we add a class token at the very beginning
  392. of the tokenization.
  393. """
  394. if not self._encodings:
  395. raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
  396. if word_index is not None:
  397. batch_index = batch_or_word_index
  398. else:
  399. batch_index = 0
  400. word_index = batch_or_word_index
  401. if batch_index < 0:
  402. batch_index = self._batch_size + batch_index
  403. if word_index < 0:
  404. word_index = self._seq_len + word_index
  405. span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
  406. return TokenSpan(*span) if span is not None else None
  407. def token_to_chars(self, batch_or_token_index: int, token_index: int | None = None) -> CharSpan | None:
  408. """
  409. Get the character span corresponding to an encoded token in a sequence of the batch.
  410. Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:
  411. - **start** -- Index of the first character in the original string associated to the token.
  412. - **end** -- Index of the character following the last character in the original string associated to the
  413. token.
  414. Can be called as:
  415. - `self.token_to_chars(token_index)` if batch size is 1
  416. - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1
  417. Args:
  418. batch_or_token_index (`int`):
  419. Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
  420. the token in the sequence.
  421. token_index (`int`, *optional*):
  422. If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
  423. the sequence.
  424. Returns:
  425. [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token
  426. (e.g. <s>, </s>) doesn't correspond to any chars in the origin string.
  427. """
  428. if not self._encodings:
  429. raise ValueError("token_to_chars() is not available when using Python based tokenizers")
  430. if token_index is not None:
  431. batch_index = batch_or_token_index
  432. else:
  433. batch_index = 0
  434. token_index = batch_or_token_index
  435. span_indices = self._encodings[batch_index].token_to_chars(token_index)
  436. return CharSpan(*span_indices) if span_indices is not None else None
  437. def char_to_token(self, batch_or_char_index: int, char_index: int | None = None, sequence_index: int = 0) -> int:
  438. """
  439. Get the index of the token in the encoded output comprising a character in the original string for a sequence
  440. of the batch.
  441. Can be called as:
  442. - `self.char_to_token(char_index)` if batch size is 1
  443. - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1
  444. This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
  445. are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
  446. words.
  447. Args:
  448. batch_or_char_index (`int`):
  449. Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
  450. the word in the sequence
  451. char_index (`int`, *optional*):
  452. If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
  453. sequence.
  454. sequence_index (`int`, *optional*, defaults to 0):
  455. If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
  456. or 1) the provided character index belongs to.
  457. Returns:
  458. `int`: Index of the token, or None if the char index refers to a whitespace only token and whitespace is
  459. trimmed with `trim_offsets=True`.
  460. """
  461. if not self._encodings:
  462. raise ValueError("char_to_token() is not available when using Python based tokenizers")
  463. if char_index is not None:
  464. batch_index = batch_or_char_index
  465. else:
  466. batch_index = 0
  467. char_index = batch_or_char_index
  468. return self._encodings[batch_index].char_to_token(char_index, sequence_index)
  469. def word_to_chars(
  470. self, batch_or_word_index: int, word_index: int | None = None, sequence_index: int = 0
  471. ) -> CharSpan:
  472. """
  473. Get the character span in the original string corresponding to given word in a sequence of the batch.
  474. Character spans are returned as a CharSpan NamedTuple with:
  475. - start: index of the first character in the original string
  476. - end: index of the character following the last character in the original string
  477. Can be called as:
  478. - `self.word_to_chars(word_index)` if batch size is 1
  479. - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1
  480. Args:
  481. batch_or_word_index (`int`):
  482. Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
  483. the word in the sequence
  484. word_index (`int`, *optional*):
  485. If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
  486. sequence.
  487. sequence_index (`int`, *optional*, defaults to 0):
  488. If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
  489. or 1) the provided word index belongs to.
  490. Returns:
  491. `CharSpan` or `list[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan
  492. are NamedTuple with:
  493. - start: index of the first character associated to the token in the original string
  494. - end: index of the character following the last character associated to the token in the original
  495. string
  496. """
  497. if not self._encodings:
  498. raise ValueError("word_to_chars() is not available when using Python based tokenizers")
  499. if word_index is not None:
  500. batch_index = batch_or_word_index
  501. else:
  502. batch_index = 0
  503. word_index = batch_or_word_index
  504. return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
  505. def char_to_word(self, batch_or_char_index: int, char_index: int | None = None, sequence_index: int = 0) -> int:
  506. """
  507. Get the word in the original string corresponding to a character in the original string of a sequence of the
  508. batch.
  509. Can be called as:
  510. - `self.char_to_word(char_index)` if batch size is 1
  511. - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1
  512. This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
  513. are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
  514. words.
  515. Args:
  516. batch_or_char_index (`int`):
  517. Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
  518. the character in the original string.
  519. char_index (`int`, *optional*):
  520. If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the
  521. original string.
  522. sequence_index (`int`, *optional*, defaults to 0):
  523. If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
  524. or 1) the provided character index belongs to.
  525. Returns:
  526. `int` or `list[int]`: Index or indices of the associated encoded token(s).
  527. """
  528. if not self._encodings:
  529. raise ValueError("char_to_word() is not available when using Python based tokenizers")
  530. if char_index is not None:
  531. batch_index = batch_or_char_index
  532. else:
  533. batch_index = 0
  534. char_index = batch_or_char_index
  535. return self._encodings[batch_index].char_to_word(char_index, sequence_index)
  536. def convert_to_tensors(self, tensor_type: str | TensorType | None = None, prepend_batch_axis: bool = False):
  537. """
  538. Convert the inner content to tensors.
  539. Args:
  540. tensor_type (`str` or [`~utils.TensorType`], *optional*):
  541. The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
  542. `None`, no modification is done.
  543. prepend_batch_axis (`int`, *optional*, defaults to `False`):
  544. Whether or not to add the batch dimension during the conversion.
  545. """
  546. if tensor_type is None:
  547. return self
  548. # Convert to TensorType
  549. if not isinstance(tensor_type, TensorType):
  550. tensor_type = TensorType(tensor_type)
  551. if tensor_type == TensorType.PYTORCH:
  552. if not is_torch_available():
  553. raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
  554. import torch
  555. def as_tensor(value, dtype=None):
  556. if isinstance(value, list) and len(value) > 0 and isinstance(value[0], np.ndarray):
  557. return torch.from_numpy(np.array(value))
  558. if len(flatten(value)) == 0 and dtype is None:
  559. dtype = torch.int64
  560. return torch.tensor(value, dtype=dtype)
  561. is_tensor = torch.is_tensor
  562. elif tensor_type == TensorType.MLX:
  563. if not is_mlx_available():
  564. raise ImportError("Unable to convert output to MLX tensors format, MLX is not installed.")
  565. import mlx.core as mx
  566. def as_tensor(value, dtype=None):
  567. if len(flatten(value)) == 0 and dtype is None:
  568. dtype = mx.int32
  569. return mx.array(value, dtype=dtype)
  570. def is_tensor(obj):
  571. return isinstance(obj, mx.array)
  572. else:
  573. def as_tensor(value, dtype=None):
  574. if (
  575. isinstance(value, (list, tuple))
  576. and len(value) > 0
  577. and isinstance(value[0], (list, tuple, np.ndarray))
  578. ):
  579. value_lens = [len(val) for val in value]
  580. if len(set(value_lens)) > 1 and dtype is None:
  581. # we have a ragged list so handle explicitly
  582. value = as_tensor([np.asarray(val) for val in value], dtype=object)
  583. if len(flatten(value)) == 0 and dtype is None:
  584. dtype = np.int64
  585. return np.asarray(value, dtype=dtype)
  586. is_tensor = is_numpy_array
  587. # Do the tensor conversion in batch
  588. for key, value in self.items():
  589. try:
  590. if prepend_batch_axis:
  591. value = [value]
  592. if not is_tensor(value):
  593. tensor = as_tensor(value)
  594. # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
  595. # # at-least2d
  596. # if tensor.ndim > 2:
  597. # tensor = tensor.squeeze(0)
  598. # elif tensor.ndim < 2:
  599. # tensor = tensor[None, :]
  600. self[key] = tensor
  601. except Exception as e:
  602. if key == "overflowing_tokens":
  603. raise ValueError(
  604. "Unable to create tensor returning overflowing tokens of different lengths. "
  605. "Please see if a fast version of this tokenizer is available to have this feature available."
  606. ) from e
  607. raise ValueError(
  608. "Unable to create tensor, you should probably activate truncation and/or padding with"
  609. " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
  610. f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
  611. " expected)."
  612. ) from e
  613. return self
  614. def to(self, device: str | torch.device, *, non_blocking: bool = False) -> BatchEncoding:
  615. """
  616. Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only).
  617. Args:
  618. device (`str` or `torch.device`): The device to put the tensors on.
  619. non_blocking (`bool`): Whether to perform the copy asynchronously.
  620. Returns:
  621. [`BatchEncoding`]: The same instance after modification.
  622. """
  623. requires_backends(self, ["torch"])
  624. # This check catches things like APEX blindly calling "to" on all inputs to a module
  625. # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
  626. # into a HalfTensor
  627. if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
  628. self.data = {
  629. k: v.to(device=device, non_blocking=non_blocking) if hasattr(v, "to") and callable(v.to) else v
  630. for k, v in self.data.items()
  631. }
  632. else:
  633. logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
  634. return self
# Documentation fragment describing the encoding keyword arguments: special tokens, padding, truncation,
# `max_length`, `stride`, pre-tokenized input, `pad_to_multiple_of`, `padding_side` and `return_tensors`.
# NOTE(review): presumably spliced into method docstrings by a docstring decorator -- confirm at the use sites.
ENCODE_KWARGS_DOCSTRING = r"""
add_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to add special tokens when encoding the sequences. This will use the underlying
`PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
automatically.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Activates and controls padding. Accepts the following values:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
Activates and controls truncation. Accepts the following values:
- `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will
truncate token by token, removing a token from the longest sequence in the pair if a pair of
sequences (or a batch of pairs) is provided.
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
max_length (`int`, *optional*):
Controls the maximum length to use by one of the truncation/padding parameters.
If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.
stride (`int`, *optional*, defaults to 0):
If set to a number along with `max_length`, the overflowing tokens returned when
`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
returned to provide some overlap between truncated and overflowing sequences. The value of this
argument defines the number of overlapping tokens.
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
which it will tokenize. This is useful for NER or token classification.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
"""
# Documentation fragment describing the optional `return_*` switches and `verbose`, followed by the
# description of the fields of the returned `BatchEncoding`.
# NOTE(review): presumably spliced into method docstrings by a docstring decorator -- confirm at the use sites.
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
return_token_type_ids (`bool`, *optional*):
Whether to return token type IDs. If left to the default, will return the token type IDs according to
the specific tokenizer's default, defined by the `return_outputs` attribute.
[What are token type IDs?](../glossary#token-type-ids)
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute.
[What are attention masks?](../glossary#attention-mask)
return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
of returning overflowing tokens.
return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
Whether or not to return special tokens mask information.
return_offsets_mapping (`bool`, *optional*, defaults to `False`):
Whether or not to return `(char_start, char_end)` for each token.
This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
Python's tokenizer, this method will raise `NotImplementedError`.
return_length (`bool`, *optional*, defaults to `False`):
Whether or not to return the lengths of the encoded inputs.
verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
**kwargs: passed to the `self.tokenize()` method
Return:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model.
[What are input IDs?](../glossary#input-ids)
- **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
if *"token_type_ids"* is in `self.model_input_names`).
[What are token type IDs?](../glossary#token-type-ids)
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
[What are attention masks?](../glossary#attention-mask)
- **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
`return_overflowing_tokens=True`).
- **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
`return_overflowing_tokens=True`).
- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
- **length** -- The length of the inputs (when `return_length=True`)
"""
# Documentation fragment listing the class attributes and constructor arguments of tokenizers. It is passed to
# the `add_end_docstrings` decorator applied to `PreTrainedTokenizerBase`.
# NOTE(review): presumably that decorator appends this text to the class docstring -- confirm in its definition.
INIT_TOKENIZER_DOCSTRING = r"""
Class attributes (overridden by derived classes)
- **vocab_files_names** (`dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
vocabulary file required by the model, and as associated values, the filename for saving the associated file
(string).
- **pretrained_vocab_files_map** (`dict[str, dict[str, str]]`) -- A dictionary of dictionaries, with the
high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
associated pretrained vocabulary file.
- **model_input_names** (`list[str]`) -- A list of inputs expected in the forward pass of the model.
- **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
Should be `'right'` or `'left'`.
- **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
applied. Should be `'right'` or `'left'`.
Args:
model_max_length (`int`, *optional*):
The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
default to VERY_LARGE_INTEGER (`int(1e30)`).
padding_side (`str`, *optional*):
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
truncation_side (`str`, *optional*):
The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
chat_template (`str`, *optional*):
A Jinja template string that will be used to format lists of chat messages. See
https://huggingface.co/docs/transformers/chat_templating for a full description.
model_input_names (`list[string]`, *optional*):
The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
`"attention_mask"`). Default value is picked from the class attribute of the same name.
bos_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the beginning of a sentence.
eos_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the end of a sentence.
unk_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing an out-of-vocabulary token.
sep_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token separating two different sentences in the same input (used by BERT for instance).
pad_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
attention mechanisms or loss computation.
cls_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing the class of the input (used by BERT for instance).
mask_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token representing a masked token (used by masked-language modeling pretraining objectives, like
BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
extra_special_tokens (list of `str` or `tokenizers.AddedToken`, *optional*):
A list of extra model-specific special tokens. Add them here to ensure they are skipped when decoding with
`skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end
of the vocabulary.
split_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the special tokens should be split during the tokenization process. Passing will affect the
internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
`<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>`]. Otherwise, if
`split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will be give `['<','s', '>']`.
"""
  789. @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
  790. class PreTrainedTokenizerBase(PushToHubMixin):
  791. """
  792. Base class for all tokenizer backends.
  793. """
  794. vocab_files_names: dict[str, str] = {}
  795. pretrained_vocab_files_map: dict[str, dict[str, str]] = {}
  796. _auto_class: str | None = None
  797. # first name has to correspond to main model input name
  798. # to make sure `tokenizer.pad(...)` works correctly
  799. model_input_names: list[str] = ["input_ids", "attention_mask"]
  800. padding_side: str = "right"
  801. truncation_side: str = "right"
  802. slow_tokenizer_class = None
  803. # Special tokens support (moved from SpecialTokensMixin)
  804. # V5: Clean separation of named special tokens from extra special tokens
  805. SPECIAL_TOKENS_ATTRIBUTES = [
  806. "bos_token",
  807. "eos_token",
  808. "unk_token",
  809. "sep_token",
  810. "pad_token",
  811. "cls_token",
  812. "mask_token",
  813. ]
  814. def __init__(self, **kwargs):
  815. self.init_inputs = ()
  816. for key in kwargs:
  817. if hasattr(self, key) and callable(getattr(self, key)):
  818. raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")
  819. # V5: Convert deprecated additional_special_tokens to extra_special_tokens before storing init_kwargs
  820. if "additional_special_tokens" in kwargs and "extra_special_tokens" not in kwargs:
  821. kwargs["extra_special_tokens"] = kwargs.pop("additional_special_tokens")
  822. self.init_kwargs = copy.deepcopy(kwargs)
  823. self.name_or_path = kwargs.pop("name_or_path", "")
  824. self._processor_class = kwargs.pop("processor_class", None)
  825. self._pad_token_type_id = 0
  826. self.verbose = kwargs.pop("verbose", False)
  827. # V5: Separate storage for named special tokens and extra special tokens
  828. self._special_tokens_map = dict.fromkeys(self.SPECIAL_TOKENS_ATTRIBUTES)
  829. self._extra_special_tokens = [] # List of extra model-specific special tokens
  830. # V5: track both explicit and auto-detected model-specific tokens
  831. explicit_model_specific_tokens = kwargs.pop("model_specific_special_tokens", None)
  832. if explicit_model_specific_tokens is None:
  833. explicit_model_specific_tokens = {}
  834. elif not isinstance(explicit_model_specific_tokens, dict):
  835. raise TypeError("model_specific_special_tokens must be a dictionary of token name to token value")
  836. auto_model_specific_tokens = {}
  837. # Directly set hidden values to allow init with tokens not yet in vocab
  838. for key in list(kwargs.keys()):
  839. if key in self.SPECIAL_TOKENS_ATTRIBUTES:
  840. value = kwargs.pop(key)
  841. if value is None:
  842. continue
  843. if isinstance(value, (str, AddedToken)):
  844. self._special_tokens_map[key] = value
  845. else:
  846. raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
  847. elif key == "extra_special_tokens":
  848. value = kwargs.pop(key)
  849. if value is None:
  850. continue
  851. if isinstance(value, dict):
  852. self._set_model_specific_special_tokens(special_tokens=value)
  853. elif isinstance(value, (list, tuple)):
  854. self._extra_special_tokens = list(value)
  855. else:
  856. raise TypeError("extra_special_tokens must be a list/tuple of tokens or a dict of named tokens")
  857. elif (
  858. key.endswith("_token")
  859. and key not in self.SPECIAL_TOKENS_ATTRIBUTES
  860. and isinstance(kwargs[key], (str, AddedToken))
  861. ):
  862. value = kwargs.pop(key)
  863. if value is None:
  864. continue
  865. auto_model_specific_tokens[key] = value
  866. # For backward compatibility we fallback to set model_max_length from max_len if provided
  867. model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
  868. self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
  869. self.padding_side = kwargs.pop("padding_side", self.padding_side)
  870. if self.padding_side not in ["right", "left"]:
  871. raise ValueError(
  872. f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
  873. )
  874. self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
  875. if self.truncation_side not in ["right", "left"]:
  876. raise ValueError(
  877. f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
  878. )
  879. self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
  880. # By default, clean up tokenization spaces for both fast and slow tokenizers
  881. self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
  882. # By default, do not split special tokens for both fast and slow tokenizers
  883. self.split_special_tokens = kwargs.pop("split_special_tokens", False)
  884. self._in_target_context_manager = False
  885. self.chat_template = kwargs.pop("chat_template", None)
  886. if isinstance(self.chat_template, (list, tuple)):
  887. # Chat templates are stored as lists of dicts with fixed key names,
  888. # we reconstruct that into a single dict while loading them.
  889. self.chat_template = {template["name"]: template["template"] for template in self.chat_template}
  890. self.response_schema = kwargs.pop("response_schema", None)
  891. model_specific_tokens = {**auto_model_specific_tokens, **explicit_model_specific_tokens}
  892. if model_specific_tokens:
  893. self._set_model_specific_special_tokens(special_tokens=model_specific_tokens)
  894. self.deprecation_warnings = {}
  895. # Backend information (V5: tracking which backend and files were used)
  896. self.backend = kwargs.pop("backend", None)
  897. self.files_loaded = kwargs.pop("files_loaded", [])
  898. def _set_processor_class(self, processor_class: str):
  899. """Sets processor class so it can be serialized in `tokenizer_config.json`."""
  900. self._processor_class = processor_class
  901. # ---- Special tokens API (moved from SpecialTokensMixin) ----
  def add_special_tokens(
      self,
      special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
      replace_extra_special_tokens=True,
  ) -> int:
      """
      Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
      special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
      current vocabulary).

      When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
      model so that its embedding matrix matches the tokenizer.

      In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

      Using `add_special_tokens` will ensure your special tokens can be used in several ways:

      - Special tokens can be skipped when decoding using `skip_special_tokens = True`.
      - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`.
      - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
        makes it easy to develop model-agnostic training and fine-tuning scripts.

      When possible, special tokens are already registered for provided pretrained models (for instance
      [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
      `'</s>'`).

      Args:
          special_tokens_dict (dictionary *str* to *str*, `tokenizers.AddedToken`, or `Sequence[Union[str, AddedToken]]`):
              Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
              `sep_token`, `pad_token`, `cls_token`, `mask_token`, `extra_special_tokens`]. The legacy key
              `additional_special_tokens` is accepted and treated as `extra_special_tokens` (it is ignored when
              `extra_special_tokens` is also present). The dictionary passed in is not modified.

              Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
              assign the index of the `unk_token` to them).
          replace_extra_special_tokens (`bool`, *optional*, defaults to `True`):
              If `True`, the existing list of extra special tokens will be replaced by the list provided in
              `special_tokens_dict`. Otherwise, `extra_special_tokens` will be extended. In the former
              case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
              as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
              `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
              `extra_special_tokens` are still added tokens, and will not be split by the model.

      Returns:
          `int`: Number of tokens added to the vocabulary.

      Raises:
          `ValueError`: If a key is not an allowed special token attribute, if the value for
              `extra_special_tokens` is not a list/tuple of `str` or `AddedToken`, or if the value for any other key
              is not a `str` or an `AddedToken`.

      Examples:

      ```python
      # Let's see how to add a new classification token to GPT-2
      tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
      model = GPT2Model.from_pretrained("openai-community/gpt2")

      special_tokens_dict = {"cls_token": "<CLS>"}

      num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
      print("We have added", num_added_toks, "tokens")
      # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
      model.resize_token_embeddings(len(tokenizer))

      assert tokenizer.cls_token == "<CLS>"
      ```"""
      if not special_tokens_dict:
          return 0

      # V5: Allowed keys are SPECIAL_TOKENS_ATTRIBUTES + "extra_special_tokens"
      # Work on a shallow copy so the legacy-key conversion below never touches the caller's dictionary.
      special_tokens_dict = dict(special_tokens_dict)
      # Backward compatibility: convert "additional_special_tokens" to "extra_special_tokens"
      if "additional_special_tokens" in special_tokens_dict:
          special_tokens_dict.setdefault(
              "extra_special_tokens", special_tokens_dict.pop("additional_special_tokens")
          )

      allowed_keys = set(self.SPECIAL_TOKENS_ATTRIBUTES) | {"extra_special_tokens"}
      tokens_to_add = []
      for key, value in special_tokens_dict.items():
          if key not in allowed_keys:
              raise ValueError(f"Key {key} is not a valid special token. Valid keys are: {allowed_keys}")

          if self.verbose:
              logger.info(f"Assigning {value} to the {key} key of the tokenizer")

          if key == "extra_special_tokens":
              if not isinstance(value, (list, tuple)) or not all(isinstance(t, (str, AddedToken)) for t in value):
                  raise ValueError(f"Tokens {value} for key {key} should all be str or AddedToken instances")

              if replace_extra_special_tokens:
                  candidates = value
              else:
                  # `self.extra_special_tokens` builds a fresh list on every access, so take one snapshot and
                  # use a set for the membership test instead of re-reading it for each candidate token.
                  already_registered = set(self.extra_special_tokens)
                  candidates = [t for t in value if str(t) not in already_registered]

              new_tokens = [
                  AddedToken(t, rstrip=False, lstrip=False, normalized=False, special=True)
                  if isinstance(t, str)
                  else t
                  for t in candidates
              ]

              # An empty replacement list leaves the current extra special tokens untouched.
              if replace_extra_special_tokens and new_tokens:
                  self._extra_special_tokens = list(new_tokens)
              else:
                  self._extra_special_tokens.extend(new_tokens)
              tokens_to_add.extend(new_tokens)
          else:
              if not isinstance(value, (str, AddedToken)):
                  raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance")
              if isinstance(value, str):
                  value = AddedToken(value, rstrip=False, lstrip=False, normalized=False, special=True)
              setattr(self, key, value)
              tokens_to_add.append(value)

      return self.add_tokens(tokens_to_add, special_tokens=True)
  def add_tokens(
      self, new_tokens: str | AddedToken | Sequence[str | AddedToken], special_tokens: bool = False
  ) -> int:
      """
      Add a list of new tokens. If the new tokens are not in the vocabulary, they are added to the end. Added tokens and
      tokens from the vocabulary of the tokenization algorithm are therefore not treated in the same way.

      Args:
          new_tokens (`str`, `tokenizers.AddedToken` or a sequence of *str* or `tokenizers.AddedToken`):
              Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
              token to let you personalize its behavior: whether this token should only match against a single word,
              whether this token should strip all potential whitespaces on the left side, whether this token should
              strip all potential whitespaces on the right side, etc. A value that is neither a list nor a tuple is
              wrapped in a one-element list before being forwarded.
          special_tokens (`bool`, *optional*, defaults to `False`):
              Specifies if the token is special. This mostly changes the normalization behavior
              See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.

      Returns:
          `int`: Number of tokens added to the vocabulary (`0` when `new_tokens` is empty or `None`).

      Examples:

      ```python
      # Let's see how to increase the vocabulary of Bert model and tokenizer
      tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
      model = BertModel.from_pretrained("google-bert/bert-base-uncased")

      num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
      print("We have added", num_added_toks, "tokens")
      # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
      model.resize_token_embeddings(len(tokenizer))
      ```"""
      # TODO: remove this from here! PreTrainedTokenizerBase should be agnostic of AddedToken.
      if not new_tokens:
          return 0

      token_batch = new_tokens if isinstance(new_tokens, (list, tuple)) else [new_tokens]
      return self._add_tokens(token_batch, special_tokens=special_tokens)
  def _add_tokens(self, new_tokens: list[str] | list[AddedToken], special_tokens: bool = False) -> int:
      """
      Hook that `add_tokens` delegates to. This base implementation always raises `NotImplementedError`.

      Args:
          new_tokens: The tokens forwarded by `add_tokens`.
          special_tokens: The `special_tokens` flag forwarded by `add_tokens`.
      """
      raise NotImplementedError
  @property
  def pad_token_type_id(self) -> int:
      """`int`: The value stored in `_pad_token_type_id`."""
      return self._pad_token_type_id
  def __setattr__(self, key, value):
      """
      Route assignments to special-token attributes into the tokenizer's token storage.

      - Names listed in `SPECIAL_TOKENS_ATTRIBUTES` (with or without an `_id`/`_ids` suffix) are stored in
        `_special_tokens_map`. For the suffixed form a non-`None` value is first passed through
        `convert_ids_to_tokens`.
      - `extra_special_tokens` / `extra_special_tokens_ids` replace `_extra_special_tokens` (`None` resets it to an
        empty list).
      - Every other name is assigned through the parent class.

      Raises:
          `ValueError`: If a named special token is set to something other than `None`, a `str` or an
              `AddedToken`, or if `extra_special_tokens` is set to something other than `None`, a list or a tuple.
      """
      # Handle _id/_ids suffix (eg. bos_token_id -> bos_token)
      if key.endswith(("_id", "_ids")):
          base_key = key.removesuffix("_ids").removesuffix("_id")
      else:
          base_key = key
      given_as_id = key != base_key

      # Named special tokens (bos_token, eos_token, etc.)
      if base_key in self.SPECIAL_TOKENS_ATTRIBUTES:
          if given_as_id and value is not None:
              value = self.convert_ids_to_tokens(value)
          if not (value is None or isinstance(value, (str, AddedToken))):
              raise ValueError(f"Cannot set a non-string value as the {base_key}")
          self._special_tokens_map[base_key] = value
          return

      # Extra special tokens: model-specific special tokens without standard names (eg. <mask_1>)
      if base_key == "extra_special_tokens":
          is_sequence = isinstance(value, (list, tuple))
          if given_as_id and value is not None and is_sequence:
              value = [self.convert_ids_to_tokens(token_id) for token_id in value]
          elif not is_sequence and value is not None:
              raise ValueError(f"extra_special_tokens must be a list or tuple, got {type(value)}")
          self._extra_special_tokens = list(value) if value is not None else []
          return

      super().__setattr__(key, value)
  def __getattr__(self, key):
      """
      Resolve special-token attributes that are not stored as regular instance attributes.

      - Names listed in `SPECIAL_TOKENS_ATTRIBUTES` return the token as a string, or its id (via
        `convert_tokens_to_ids`) when requested with an `_id`/`_ids` suffix. An unset token yields `None`, with an
        error logged when `verbose` is truthy.
      - `extra_special_tokens` returns the extra tokens as strings; `extra_special_tokens_ids` returns their ids.
      - Any other name is looked up in the namespaces of the classes in the MRO.

      Raises:
          `AttributeError`: If the name is neither a special-token attribute nor found on any class in the MRO.
      """
      # Handle _id/_ids suffix (eg. bos_token_id -> bos_token)
      if key.endswith(("_id", "_ids")):
          base_key = key.removesuffix("_ids").removesuffix("_id")
      else:
          base_key = key
      wants_ids = key != base_key

      # Named special tokens (bos_token, eos_token, etc.)
      if base_key in self.SPECIAL_TOKENS_ATTRIBUTES:
          # Go through __dict__ directly: `_special_tokens_map` may not be initialized yet (e.g. during fast
          # tokenizer __init__) and a plain attribute access would re-enter __getattr__.
          token_value = self.__dict__.get("_special_tokens_map", {}).get(base_key)
          if token_value is None:
              if self.verbose:
                  logger.error(f"Using {key}, but it is not set yet.")
              return None
          if wants_ids:
              return self.convert_tokens_to_ids(str(token_value))
          return str(token_value)

      # Extra special tokens
      if base_key == "extra_special_tokens":
          tokens = [str(tok) for tok in self.__dict__.get("_extra_special_tokens", [])]
          if wants_ids:
              return self.convert_tokens_to_ids(tokens)
          return tokens

      if key in self.__dict__:
          return super().__getattr__(key)

      # Also check the class hierarchy (handles class-level defaults, e.g. in dynamically loaded remote code
      # where __getattr__ may be called before the instance attribute is set). The raw class-namespace entry is
      # returned as stored.
      for klass in type(self).__mro__:
          namespace = vars(klass)
          if key in namespace:
              return namespace[key]
      raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
  def get_special_tokens_mask(
      self, token_ids_0: list[int], token_ids_1: list[int] | None = None, already_has_special_tokens: bool = False
  ) -> list[int]:
      """
      Build a special-tokens mask for a sequence that is already formatted with special tokens.

      Data collators call this with `already_has_special_tokens=True` to build a mask over an already-formatted
      sequence. In that case, the mask is computed by checking membership in `all_special_ids`. The base class does
      not handle non-formatted sequences; concrete tokenizer classes should override this method for their specific
      formatting rules.

      Args:
          token_ids_0: List of IDs for the (possibly already formatted) sequence.
          token_ids_1: Unused when `already_has_special_tokens=True`. Must be None in that case.
          already_has_special_tokens: Whether the sequence is already formatted with special tokens.

      Returns:
          A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

      Raises:
          `NotImplementedError`: If `already_has_special_tokens` is `False`.
          `ValueError`: If `already_has_special_tokens` is `True` and `token_ids_1` is provided.
      """
      if not already_has_special_tokens:
          # Default base implementation for non-formatted sequences is not provided here.
          raise NotImplementedError(
              f"{self.__class__.__name__} does not implement get_special_tokens_mask for non-formatted sequences"
          )

      if token_ids_1 is not None:
          raise ValueError(
              "You should not supply a second sequence if the provided sequence of ids is already formatted "
              "with special tokens for the model."
          )

      special_ids = set(self.all_special_ids)
      mask = []
      for token_id in token_ids_0:
          mask.append(1 if int(token_id) in special_ids else 0)
      return mask
  @property
  def special_tokens_map(self) -> dict[str, str]:
      """
      `dict[str, str]`: A flat dictionary mapping named special token attributes to their string values.

      Only the names listed in `SPECIAL_TOKENS_ATTRIBUTES` whose value is set (not `None`) are included, in that
      order; `extra_special_tokens` are not part of it. Values are converted with `str`, so the result has no mixed
      types.

      Returns:
          A dictionary with keys like 'bos_token', 'eos_token', etc., and string values.

      **V5 Change**: This now returns only named tokens. Use `extra_special_tokens` for the additional tokens.
      """
      named_tokens = {}
      for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
          token = self._special_tokens_map.get(attr)
          if token is not None:
              named_tokens[attr] = str(token)
      return named_tokens
  1116. # Note: extra_special_tokens and extra_special_tokens_ids are handled by __getattr__ and __setattr__
  1117. # We don't define them as @property to keep the implementation simpler
  @property
  def all_special_tokens(self) -> list[str]:
      """
      `list[str]`: A list of all unique special tokens (named + extra) as strings.

      Named special tokens (bos_token, eos_token, etc.) come first, in `SPECIAL_TOKENS_ATTRIBUTES` order, followed by
      the extra special tokens. Duplicates keep their first position. Tokens of `tokenizers.AddedToken` type are
      converted to string.
      """
      named_values = (self._special_tokens_map.get(attr) for attr in self.SPECIAL_TOKENS_ATTRIBUTES)
      ordered = [str(value) for value in named_values if value is not None]
      ordered.extend(str(token) for token in self._extra_special_tokens)
      # dict keys preserve insertion order, which gives an order-preserving de-duplication.
      return list(dict.fromkeys(ordered))
  @property
  def all_special_ids(self) -> list[int]:
      """
      `list[int]`: The ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.), obtained by passing
      `all_special_tokens` to `convert_tokens_to_ids`.
      """
      special_tokens = self.all_special_tokens
      return self.convert_tokens_to_ids(special_tokens)
  def _set_model_specific_special_tokens(self, special_tokens: dict[str, str | AddedToken]):
      """
      Adds new model-specific special tokens (e.g., for multimodal models).

      These tokens are added to the named special tokens map and will be saved in tokenizer config.
      For example: if the model tokenizer is multimodal, we can support special image or audio tokens.

      All values are validated before anything is modified, so a failing call leaves the tokenizer unchanged. Names
      that are already present in `SPECIAL_TOKENS_ATTRIBUTES` are not appended a second time; their value in the
      special tokens map is simply overwritten.

      Args:
          special_tokens: Dictionary of {token_name: token_value}

      Raises:
          `TypeError`: If any value is neither a `str` nor an `AddedToken`.
      """
      # Validate first so that an invalid entry cannot leave the attribute list and the map half-updated.
      for key, value in special_tokens.items():
          if not isinstance(value, (str, AddedToken)):
              raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")

      known_names = set(self.SPECIAL_TOKENS_ATTRIBUTES)
      new_names = [name for name in special_tokens if name not in known_names]
      # Build a new list instead of extending in place: the current list may be shared (e.g. defined at class
      # level), and rebinding keeps that shared list untouched.
      self.SPECIAL_TOKENS_ATTRIBUTES = self.SPECIAL_TOKENS_ATTRIBUTES + new_names
      self._special_tokens_map.update(special_tokens)
  @property
  def added_tokens_decoder(self) -> dict[int, AddedToken]:
      """`dict[int, AddedToken]`: Not provided by this base implementation, which always raises `NotImplementedError`."""
      raise NotImplementedError
  def __repr__(self) -> str:
      """
      Return a one-object summary: class name, `name_or_path`, `vocab_size`, `model_max_length`, padding and
      truncation sides, the named special tokens and one tab-indented line per entry of `added_tokens_decoder`.
      """
      decoder_entries = [f"{index}: {token.__repr__()}," for index, token in self.added_tokens_decoder.items()]
      decoder_block = "\n\t".join(decoder_entries)
      if decoder_block:
          # Put the entries on their own lines between the braces.
          decoder_block = f"\n\t{decoder_block}\n"

      fields = [
          f"name_or_path='{self.name_or_path}'",
          f"vocab_size={self.vocab_size}",
          f"model_max_length={self.model_max_length}",
          f"padding_side='{self.padding_side}'",
          f"truncation_side='{self.truncation_side}'",
          f"special_tokens={self.special_tokens_map}",
          "added_tokens_decoder={" + decoder_block + "}",
      ]
      return f"{self.__class__.__name__}({', '.join(fields)})"
  def __len__(self) -> int:
      """Not provided by this base implementation, which always raises `NotImplementedError`."""
      raise NotImplementedError
  @property
  def vocab_size(self) -> int:
      """
      `int`: Size of the base vocabulary (without the added tokens).

      This base implementation always raises `NotImplementedError`.
      """
      raise NotImplementedError
  def get_vocab(self) -> dict[str, int]:
      """
      Returns the vocabulary as a dictionary of token to index.

      `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
      vocab.

      This base implementation always raises `NotImplementedError`.

      Returns:
          `dict[str, int]`: The vocabulary.
      """
      raise NotImplementedError
  def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
      """
      Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
      vocabulary.

      Each token is looked up with `_convert_token_to_id_with_added_voc`.

      Args:
          tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).

      Returns:
          `int` or `list[int]`: The token id or list of token ids.
      """
      lookup = self._convert_token_to_id_with_added_voc
      if not isinstance(tokens, str):
          return list(map(lookup, tokens))
      return lookup(tokens)
  def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool = False) -> str | list[str]:
      """
      Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
      added tokens.

      This base implementation always raises `NotImplementedError`.

      Args:
          ids (`int` or `list[int]`):
              The token id (or token ids) to convert to tokens.
          skip_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to remove special tokens in the decoding.

      Returns:
          `str` or `list[str]`: The decoded token(s).
      """
      raise NotImplementedError
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name_or_path: str | os.PathLike,
      *init_inputs,
      cache_dir: str | os.PathLike | None = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: str | bool | None = None,
      revision: str = "main",
      trust_remote_code=False,
      **kwargs,
  ):
      r"""
      Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
      tokenizer.

      This method only works out which files to use (single file, GGUF file, local directory or Hub repo), resolves
      each of them with `cached_file`, and then hands the resolved paths to `cls._from_pretrained`, whose result is
      returned.

      Args:
          pretrained_model_name_or_path (`str` or `os.PathLike`):
              Can be either:

              - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
              - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,
                `./my_model_directory/`.
              - (**Deprecated**, not applicable to all derived classes) a path to a single saved vocabulary
                file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                `./my_model_directory/vocab.txt`.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
              exist.
          proxies (`dict[str, str]`, *optional*):
              A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
              'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
          token (`str` or *bool*, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
              when running `hf auth login` (stored in `~/.huggingface`).
          local_files_only (`bool`, *optional*, defaults to `False`):
              Whether or not to only rely on local files and not to attempt to download any files.
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.
          subfolder (`str`, *optional*):
              In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
              facebook/rag-token-base), specify it here.
          inputs (additional positional arguments, *optional*):
              Will be passed along to the Tokenizer `__init__` method.
          trust_remote_code (`bool`, *optional*, defaults to `False`):
              Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
              should only be set to `True` for repositories you trust and in which you have read the code, as it will
              execute code present on the Hub on your local machine.
          kwargs (additional keyword arguments, *optional*):
              Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
              `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
              `extra_special_tokens`. See parameters in the `__init__` for more details.

      Raises:
          `ValueError`: If a path to a single file is given for a tokenizer that declares more than one vocabulary
              file, unless a `gguf_file` is passed or `tokenizer_file` is one of the declared files.
          `OSError`: If resolving one of the tokenizer files fails.

      <Tip>

      Passing `token=True` is required when you want to use a private model.

      </Tip>

      Examples:

      ```python
      # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
      # Download vocabulary from huggingface.co and cache.
      tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

      # Download vocabulary from huggingface.co (user-uploaded) and cache.
      tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

      # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
      tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")

      # If the tokenizer uses a single vocabulary file, you can point directly to this file
      tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")

      # You can link tokens to special vocabulary when instantiating
      tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", unk_token="<unk>")
      # You should be sure '<unk>' is in the vocabulary when doing that.
      # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
      assert tokenizer.unk_token == "<unk>"
      ```"""
      # These entries are removed from kwargs, so they are not forwarded to `_from_pretrained` via **kwargs.
      proxies = kwargs.pop("proxies", None)
      subfolder = kwargs.pop("subfolder", None)
      from_pipeline = kwargs.pop("_from_pipeline", None)
      from_auto_class = kwargs.pop("_from_auto", False)
      commit_hash = kwargs.pop("_commit_hash", None)
      # `gguf_file` is only read (`get`, not `pop`), so it stays in kwargs and is still forwarded below.
      gguf_file = kwargs.get("gguf_file")

      user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class}
      if from_pipeline is not None:
          user_agent["using_pipeline"] = from_pipeline

      if is_offline_mode() and not local_files_only:
          logger.info("Offline mode: forcing local_files_only=True")
          local_files_only = True

      pretrained_model_name_or_path = str(pretrained_model_name_or_path)
      # Maps a file id (the keyword later handed to the tokenizer) to a file name or path to resolve.
      vocab_files = {}
      additional_files_names = {}
      init_configuration = {}

      is_local = os.path.isdir(pretrained_model_name_or_path)
      # Set only when a single file path was given; that entry is then used as-is instead of being resolved.
      single_file_id = None
      if os.path.isfile(pretrained_model_name_or_path):
          # For legacy support: allow single-file loading if:
          # 1. Only one vocab file is required, OR
          # 2. It's a fast tokenizer with tokenizer_file (which is optional), OR
          # 3. It's a GGUF file
          vocab_files_count = len(cls.vocab_files_names)
          has_optional_tokenizer_file = vocab_files_count > 1 and "tokenizer_file" in cls.vocab_files_names
          if vocab_files_count > 1 and not gguf_file and not has_optional_tokenizer_file:
              raise ValueError(
                  f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
                  "supported for this tokenizer. Use a model identifier or the path to a directory instead."
              )
          file_id = "vocab_file"
          if pretrained_model_name_or_path.endswith("tokenizer.json"):
              file_id = "tokenizer_file"
          vocab_files[file_id] = pretrained_model_name_or_path
          single_file_id = file_id
      else:
          if gguf_file:
              vocab_files["vocab_file"] = gguf_file
          else:
              # At this point pretrained_model_name_or_path is either a directory or a model identifier name
              additional_files_names = {
                  "added_tokens_file": ADDED_TOKENS_FILE,  # kept only for legacy
                  "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,  # kept only for legacy
                  "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                  # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
                  "tokenizer_file": FULL_TOKENIZER_FILE,
                  "chat_template_file": CHAT_TEMPLATE_FILE,
              }
              # Entries from `additional_files_names` win over class-declared files with the same id.
              vocab_files = {**cls.vocab_files_names, **additional_files_names}

              # Check for versioned tokenizer files
              if "tokenizer_file" in vocab_files:
                  fast_tokenizer_file = FULL_TOKENIZER_FILE
                  # A missing tokenizer config is not an error here: `cached_file` is asked not to raise for
                  # missing entries and the `None` result is handled below.
                  resolved_config_file = cached_file(
                      pretrained_model_name_or_path,
                      TOKENIZER_CONFIG_FILE,
                      cache_dir=cache_dir,
                      force_download=force_download,
                      proxies=proxies,
                      token=token,
                      revision=revision,
                      local_files_only=local_files_only,
                      subfolder=subfolder,
                      user_agent=user_agent,
                      _raise_exceptions_for_missing_entries=False,
                      _commit_hash=commit_hash,
                  )
                  if resolved_config_file is not None:
                      with open(resolved_config_file, encoding="utf-8") as reader:
                          tokenizer_config = json.load(reader)
                          if "fast_tokenizer_files" in tokenizer_config:
                              fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
                  commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
                  vocab_files["tokenizer_file"] = fast_tokenizer_file

              # This block looks for any extra chat template files
              if is_local:
                  template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
                  if template_dir.is_dir():
                      for template_file in template_dir.glob("*.jinja"):
                          template_name = template_file.name.removesuffix(".jinja")
                          vocab_files[f"chat_template_{template_name}"] = f"{CHAT_TEMPLATE_DIR}/{template_file.name}"
              else:
                  for template in list_repo_templates(
                      pretrained_model_name_or_path,
                      local_files_only=local_files_only,
                      revision=revision,
                      cache_dir=cache_dir,
                      token=token,
                  ):
                      template = template.removesuffix(".jinja")
                      vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

      # Best-effort listing of the files available in the repo/directory; any failure of the remote listing is
      # swallowed and treated as "no files known".
      remote_files = []
      if not is_local and not local_files_only:
          try:
              remote_files = list_repo_files(pretrained_model_name_or_path)
          except Exception:
              remote_files = []
      elif pretrained_model_name_or_path and os.path.isdir(pretrained_model_name_or_path):
          remote_files = os.listdir(pretrained_model_name_or_path)

      # NOTE(review): the tokenizer file name is used here as an (unescaped) regex pattern, searched in the file
      # names concatenated without a separator.
      if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
          # mistral tokenizer names are different, but we can still convert them if
          # mistral common is not there
          # NOTE(review): the joined `VOCAB_FILES_NAMES` keys are appended without a leading `|`, so the first key
          # would be glued onto `tiktoken\.model` — confirm this is intended.
          other_pattern = r"tekken\.json|tokenizer\.model\.*|tiktoken\.model" + "|".join(
              getattr(cls, "VOCAB_FILES_NAMES", {}).keys()
          )
          if match := re.search(other_pattern, "\n".join(remote_files)):
              if "spm_file" in vocab_files:
                  vocab_files["spm_file"] = match.group()
              else:
                  vocab_files["vocab_file"] = match.group()

      # Resolve every file id to a local path (or None).
      resolved_vocab_files = {}
      for file_id, file_path in vocab_files.items():
          if file_path is None:
              resolved_vocab_files[file_id] = None
          elif single_file_id == file_id:
              # The single file given by the user is used directly; nothing is recorded if it is not a file.
              if os.path.isfile(file_path):
                  resolved_vocab_files[file_id] = file_path
          else:
              try:
                  resolved_vocab_files[file_id] = cached_file(
                      pretrained_model_name_or_path,
                      file_path,
                      cache_dir=cache_dir,
                      force_download=force_download,
                      proxies=proxies,
                      local_files_only=local_files_only,
                      token=token,
                      user_agent=user_agent,
                      revision=revision,
                      subfolder=subfolder,
                      _raise_exceptions_for_missing_entries=False,
                      _commit_hash=commit_hash,
                  )
              except OSError:
                  # Re-raise any error raised by cached_file in order to get a helpful error message
                  raise
              except Exception:
                  # For any other exception, we throw a generic error.
                  raise OSError(
                      f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
                      "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
                      f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
                      f"containing all relevant files for a {cls.__name__} tokenizer."
                  )
              commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash)

      # NOTE(review): this loop only `continue`s and has no effect; it looks like a leftover — confirm before
      # removing.
      for file_id, file_path in vocab_files.items():
          if file_id not in resolved_vocab_files:
              continue

      return cls._from_pretrained(
          resolved_vocab_files,
          pretrained_model_name_or_path,
          init_configuration,
          *init_inputs,
          token=token,
          cache_dir=cache_dir,
          local_files_only=local_files_only,
          _commit_hash=commit_hash,
          _is_local=is_local,
          trust_remote_code=trust_remote_code,
          **kwargs,
      )
  1455. @classmethod
  1456. def _from_pretrained(
  1457. cls,
  1458. resolved_vocab_files,
  1459. pretrained_model_name_or_path,
  1460. init_configuration,
  1461. *init_inputs,
  1462. token=None,
  1463. cache_dir=None,
  1464. local_files_only=False,
  1465. _commit_hash=None,
  1466. _is_local=False,
  1467. trust_remote_code=False,
  1468. **kwargs,
  1469. ):
  1470. # Prepare tokenizer initialization kwargs
  1471. # Did we saved some inputs and kwargs to reload ?
  1472. tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
  1473. if tokenizer_config_file is not None:
  1474. with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
  1475. init_kwargs = json.load(tokenizer_config_handle)
  1476. # used in the past to check if the tokenizer class matches the class in the repo
  1477. init_kwargs.pop("tokenizer_class", None)
  1478. saved_init_inputs = init_kwargs.pop("init_inputs", ())
  1479. if not init_inputs:
  1480. init_inputs = saved_init_inputs
  1481. else:
  1482. init_kwargs = init_configuration
  1483. if resolved_vocab_files.get("tokenizer_file", None) is not None:
  1484. init_kwargs.pop("add_bos_token", None)
  1485. init_kwargs.pop("add_eos_token", None)
  1486. # If independent chat template file(s) exist, they take priority over template entries in the tokenizer config
  1487. chat_templates = {}
  1488. chat_template_file = resolved_vocab_files.pop("chat_template_file", None)
  1489. extra_chat_templates = [key for key in resolved_vocab_files if key.startswith("chat_template_")]
  1490. if chat_template_file is not None:
  1491. with open(chat_template_file, encoding="utf-8") as chat_template_handle:
  1492. chat_templates["default"] = chat_template_handle.read()
  1493. for extra_chat_template in extra_chat_templates:
  1494. template_file = resolved_vocab_files.pop(extra_chat_template, None)
  1495. if template_file is None:
  1496. continue # I think this should never happen, but just in case
  1497. template_name = extra_chat_template.removeprefix("chat_template_")
  1498. with open(template_file) as chat_template_handle:
  1499. chat_templates[template_name] = chat_template_handle.read()
  1500. if len(chat_templates) == 1 and "default" in chat_templates:
  1501. init_kwargs["chat_template"] = chat_templates["default"]
  1502. elif chat_templates:
  1503. init_kwargs["chat_template"] = chat_templates
  1504. if not _is_local:
  1505. if "auto_map" in init_kwargs:
  1506. # For backward compatibility with odl format.
  1507. if isinstance(init_kwargs["auto_map"], (tuple, list)):
  1508. init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
  1509. # Update with newly provided kwargs
  1510. init_kwargs.update(kwargs)
  1511. # V5: Convert deprecated additional_special_tokens to extra_special_tokens
  1512. if "additional_special_tokens" in init_kwargs:
  1513. init_kwargs.setdefault("extra_special_tokens", init_kwargs.pop("additional_special_tokens"))
  1514. # V5: Collect model-specific tokens (custom *_token keys not in standard attributes)
  1515. default_attrs = set(cls.SPECIAL_TOKENS_ATTRIBUTES)
  1516. model_specific_tokens = {
  1517. key: init_kwargs.pop(key)
  1518. for key in list(init_kwargs.keys())
  1519. if key not in default_attrs and key.endswith("_token") and isinstance(init_kwargs[key], (str, AddedToken))
  1520. }
  1521. # If extra_special_tokens is a dict, merge it into model_specific_tokens
  1522. if isinstance(init_kwargs.get("extra_special_tokens"), dict):
  1523. model_specific_tokens.update(init_kwargs.pop("extra_special_tokens"))
  1524. if model_specific_tokens:
  1525. init_kwargs["model_specific_special_tokens"] = model_specific_tokens
  1526. # Merge resolved_vocab_files arguments in init_kwargs.
  1527. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
  1528. special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
  1529. for args_name, file_path in resolved_vocab_files.items():
  1530. if args_name not in init_kwargs or init_kwargs[args_name] is None:
  1531. init_kwargs[args_name] = file_path
  1532. tokenizer_file = resolved_vocab_files.get("tokenizer_file", None)
  1533. init_kwargs["name_or_path"] = pretrained_model_name_or_path
  1534. init_kwargs["is_local"] = _is_local
  1535. #### Handle tokenizer serialization of added and special tokens
  1536. added_tokens_decoder: dict[int, AddedToken] = {}
  1537. added_tokens_map: dict[str, AddedToken] = {}
  1538. # if we have info on the slow added tokens
  1539. if "added_tokens_decoder" in init_kwargs:
  1540. for idx, token in init_kwargs["added_tokens_decoder"].items():
  1541. if isinstance(token, dict):
  1542. token = AddedToken(**token)
  1543. if isinstance(token, AddedToken):
  1544. added_tokens_decoder[int(idx)] = token
  1545. added_tokens_map[str(token)] = token
  1546. else:
  1547. raise TypeError(
  1548. f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
  1549. )
  1550. else:
  1551. # Legacy: read special_tokens_map.json and merge into init_kwargs
  1552. if special_tokens_map_file is not None:
  1553. with open(special_tokens_map_file, encoding="utf-8") as f:
  1554. special_tokens_map = json.load(f)
  1555. for key, value in special_tokens_map.items():
  1556. if key in kwargs and kwargs[key]:
  1557. continue # User-provided kwargs take precedence
  1558. if isinstance(value, dict) and key != "extra_special_tokens":
  1559. value.pop("special", None)
  1560. value = AddedToken(**value, special=True)
  1561. elif key == "extra_special_tokens" and isinstance(value, list):
  1562. # Merge list tokens, converting dicts to AddedToken
  1563. existing = list(init_kwargs.get("extra_special_tokens") or [])
  1564. for tok in value:
  1565. tok = AddedToken(**tok, special=True) if isinstance(tok, dict) else tok
  1566. if tok not in existing:
  1567. existing.append(tok)
  1568. value = existing
  1569. init_kwargs[key] = value
  1570. # Convert dict extra_special_tokens to model_specific_special_tokens
  1571. if isinstance(init_kwargs.get("extra_special_tokens"), dict):
  1572. init_kwargs.setdefault("model_specific_special_tokens", {}).update(
  1573. init_kwargs.pop("extra_special_tokens")
  1574. )
  1575. # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
  1576. # this is for legacy purpose. We don't add the tokens after init for efficiency.
  1577. if added_tokens_file is not None:
  1578. # V5: Check both named and extra special tokens
  1579. special_tokens = {str(init_kwargs[k]) for k in cls.SPECIAL_TOKENS_ATTRIBUTES if init_kwargs.get(k)}
  1580. special_tokens.update(str(t) for t in (init_kwargs.get("extra_special_tokens") or []))
  1581. with open(added_tokens_file, encoding="utf-8") as f:
  1582. added_tok_encoder = json.load(f)
  1583. for str_token, index in added_tok_encoder.items():
  1584. is_special = str_token in special_tokens
  1585. added_tokens_decoder[index] = AddedToken(
  1586. str_token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
  1587. )
  1588. added_tokens_map[str_token] = added_tokens_decoder[index]
  1589. # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
  1590. # if `tokenizer_config.json` is `None`
  1591. if tokenizer_file is not None:
  1592. # This is for slow so can be done before
  1593. with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
  1594. tokenizer_file_handle = json.load(tokenizer_file_handle)
  1595. added_tokens = tokenizer_file_handle.pop("added_tokens")
  1596. for serialized_tokens in added_tokens:
  1597. idx = serialized_tokens.pop("id")
  1598. added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
  1599. added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx]
  1600. # end legacy
  1601. # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
  1602. # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
  1603. init_kwargs["added_tokens_decoder"] = added_tokens_decoder
  1604. init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
  1605. # V5: Map special tokens from added_tokens_map (named tokens only)
  1606. for key in cls.SPECIAL_TOKENS_ATTRIBUTES:
  1607. if key in init_kwargs and added_tokens_map != {} and init_kwargs[key] is not None:
  1608. init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
  1609. # From pretrained with the legacy fixes
  1610. # for `tokenizers` based tokenizer, we actually want to have vocab and merges pre-extracted from whatever inputs
  1611. # for `none` (PythonBackend) based tokenizer, we also want the vocab file / merge files not extracted.
  1612. # for `sentencepiece` based tokenizer, we pass the sentencepiece model file directly.
  1613. init_kwargs = cls.convert_to_native_format(**init_kwargs)
  1614. try:
  1615. tokenizer = cls(*init_inputs, **init_kwargs)
  1616. except import_protobuf_decode_error():
  1617. raise RuntimeError(
  1618. "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
  1619. "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
  1620. )
  1621. except RuntimeError as e:
  1622. if "sentencepiece_processor.cc" in str(e):
  1623. raise RuntimeError(
  1624. "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
  1625. "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).",
  1626. ) from e
  1627. else:
  1628. raise e
  1629. except OSError:
  1630. raise OSError(
  1631. "Unable to load vocabulary from file. "
  1632. "Please check that the provided vocabulary is accessible and not corrupted."
  1633. )
  1634. return tokenizer
  @classmethod
  def convert_to_native_format(cls, **kwargs):
      """
      Hook applied to the assembled init kwargs right before the tokenizer is instantiated.

      The base implementation is a pass-through: it hands back the keyword arguments it received, unchanged.
      Presumably backend-specific subclasses override it to pre-process vocabulary inputs (to be confirmed
      against those subclasses).

      Args:
          kwargs: The keyword arguments that are about to be passed to the tokenizer constructor.

      Returns:
          `dict`: The keyword arguments to use for instantiation.
      """
      native_kwargs = kwargs
      return native_kwargs
  @classmethod
  def convert_added_tokens(cls, obj: AddedToken | Any, save=False, add_type_field=True):
      """
      Recursively convert between `AddedToken` objects and their serialized dictionary form.

      - A dict tagged with `"__type": "AddedToken"` is turned into an `AddedToken` built from its other entries.
      - When `save` is truthy, an `AddedToken` is turned into its `__getstate__()` dict.
      - Lists and tuples are converted element-wise (the result is always a list); other dicts are converted
        value-wise.
      - Anything else is returned untouched.

      The input containers are never modified: new lists/dicts are returned.

      Args:
          obj: The object (possibly nested) to convert.
          save (`bool`, *optional*, defaults to `False`):
              Whether `AddedToken` instances should be serialized to dicts.
          add_type_field (`bool`, *optional*, defaults to `True`):
              Only used when serializing. If `True`, the dict is tagged with `"__type": "AddedToken"`; otherwise
              the `"special"` entry is dropped instead.

      Returns:
          The converted object.
      """
      if isinstance(obj, dict) and obj.get("__type") == "AddedToken":
          # Build the constructor arguments from a filtered copy rather than popping "__type" from the caller's
          # dict: mutating the input would make a second conversion of the same dict miss the tag.
          token_kwargs = {key: value for key, value in obj.items() if key != "__type"}
          return AddedToken(**token_kwargs)
      if save and isinstance(obj, AddedToken):
          state = obj.__getstate__()
          if add_type_field:
              state["__type"] = "AddedToken"
          else:
              # Don't save "special" for previous tokenizers (tolerate a state that has no such entry)
              state.pop("special", None)
          return state
      if isinstance(obj, (list, tuple)):
          return [cls.convert_added_tokens(item, save=save, add_type_field=add_type_field) for item in obj]
      if isinstance(obj, dict):
          return {
              key: cls.convert_added_tokens(value, save=save, add_type_field=add_type_field)
              for key, value in obj.items()
          }
      return obj
  def save_pretrained(
      self,
      save_directory: str | os.PathLike,
      legacy_format: bool | None = None,
      filename_prefix: str | None = None,
      push_to_hub: bool = False,
      **kwargs,
  ) -> tuple[str, ...]:
      """
      Save the full tokenizer state.

      This method makes sure the full tokenizer can then be re-loaded using the
      [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.

      Warning: This won't save modifications you may have applied to the tokenizer after the instantiation (for
      instance, modifying `tokenizer.do_lower_case` after creation).

      Args:
          save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
          legacy_format (`bool`, *optional*):
              Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
              format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
              added_tokens files.

              If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
              "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
              loaded in the corresponding "slow" tokenizer.

              If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
              error is raised.
          filename_prefix (`str`, *optional*):
              A prefix to add to the names of the files saved by the tokenizer.
          push_to_hub (`bool`, *optional*, defaults to `False`):
              Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
              repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
              namespace).
          kwargs (`dict[str, Any]`, *optional*):
              Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
              `save_jinja_files` (defaults to `True`) is consumed here and handed to `save_chat_templates`.

      Returns:
          A tuple of `str`: The files saved. If `save_directory` is an existing file, an error is logged and
          `None` is returned without saving anything.
      """
      if os.path.isfile(save_directory):
          logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
          return

      os.makedirs(save_directory, exist_ok=True)

      # Consume this option before `kwargs` is forwarded to `create_repo`, so it cannot leak into that call.
      save_jinja_files = kwargs.pop("save_jinja_files", True)

      if push_to_hub:
          commit_message = kwargs.pop("commit_message", None)
          # `save_directory` may be an `os.PathLike`, which has no `.split`: go through `str` first.
          repo_id = kwargs.pop("repo_id", str(save_directory).split(os.path.sep)[-1])
          repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
          # Snapshot taken before writing, used at the end to upload only what this call modified.
          files_timestamps = self._get_files_timestamps(save_directory)

      tokenizer_config_file = os.path.join(
          save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
      )

      tokenizer_config = copy.deepcopy(self.init_kwargs)
      tokenizer_config.pop("add_bos_token", None)
      tokenizer_config.pop("add_eos_token", None)

      # Refresh every saved init kwarg (plus `model_max_length`) with the current attribute value, when the
      # tokenizer exposes an attribute of that name.
      target_keys = set(self.init_kwargs.keys())
      target_keys.discard("add_bos_token")
      target_keys.discard("add_eos_token")
      target_keys.update(["model_max_length"])

      for k in target_keys:
          if hasattr(self, k):
              tokenizer_config[k] = getattr(self, k)

      # Let's make sure we properly save the special tokens
      # V5: Save both named tokens and extra tokens
      tokenizer_config.update(self.special_tokens_map)
      if self._extra_special_tokens:
          tokenizer_config["extra_special_tokens"] = self.extra_special_tokens

      tokenizer_config, saved_raw_chat_template_files = self.save_chat_templates(
          save_directory, tokenizer_config, filename_prefix, save_jinja_files
      )

      if getattr(self, "response_schema", None) is not None:
          tokenizer_config["response_schema"] = self.response_schema

      if len(self.init_inputs) > 0:
          tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
      # Vocabulary file paths are not part of the saved config.
      for file_id in self.vocab_files_names:
          tokenizer_config.pop(file_id, None)

      # Serialize AddedToken objects into plain dicts (tagged with "__type") so the config can be dumped to JSON
      tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)

      # Process added tokens separately: allows previous versions to ignore it!
      added_tokens = {}
      for key, value in self.added_tokens_decoder.items():
          added_tokens[key] = value.__getstate__()
      tokenizer_config["added_tokens_decoder"] = added_tokens

      # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
      tokenizer_class = self.__class__.__name__

      # tokenizers backend don't need to save added_tokens_decoder and additional_special_tokens
      if any(base.__name__ == "TokenizersBackend" for base in self.__class__.__mro__):
          tokenizer_config.pop("added_tokens_decoder", None)
          tokenizer_config.pop("additional_special_tokens", None)

      # Remove the Fast at the end if we can save the slow tokenizer
      if tokenizer_class.endswith("Fast") and getattr(self, "can_save_slow_tokenizer", False):
          tokenizer_class = tokenizer_class[:-4]
      tokenizer_config["tokenizer_class"] = tokenizer_class
      if getattr(self, "_auto_map", None) is not None:
          tokenizer_config["auto_map"] = self._auto_map
      if getattr(self, "_processor_class", None) is not None:
          tokenizer_config["processor_class"] = self._processor_class

      tokenizer_config.pop("files_loaded", None)
      # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
      # loaded from the Hub.
      if self._auto_class is not None:
          custom_object_save(self, save_directory, config=tokenizer_config)

      # remove private information
      if "name_or_path" in tokenizer_config:
          tokenizer_config.pop("name_or_path")
          tokenizer_config.pop("special_tokens_map_file", None)
          tokenizer_config.pop("tokenizer_file", None)
      if "device_map" in tokenizer_config:
          tokenizer_config.pop("device_map")
      if "slow_tokenizer_class" in tokenizer_config:
          tokenizer_config.pop("slow_tokenizer_class")

      with open(tokenizer_config_file, "w", encoding="utf-8") as f:
          out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
          f.write(out_str)
      logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

      # The config file and the raw chat template files come first in the reported list of saved files
      file_names = (tokenizer_config_file, *saved_raw_chat_template_files)

      save_files = self._save_pretrained(
          save_directory=save_directory,
          file_names=file_names,
          legacy_format=legacy_format,
          filename_prefix=filename_prefix,
      )

      if push_to_hub:
          self._upload_modified_files(
              save_directory,
              repo_id,
              files_timestamps,
              commit_message=commit_message,
              token=kwargs.get("token"),
          )

      return save_files
  def _save_pretrained(
      self,
      save_directory: str | os.PathLike,
      file_names: tuple[str, ...],
      legacy_format: bool | None = None,
      filename_prefix: str | None = None,
  ) -> tuple[str, ...]:
      """
      Write the legacy ("slow" tokenizer) files: the vocabulary and, if any, the added tokens.

      Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
      specific [`~tokenization_utils_tokenizers.PreTrainedTokenizerFast._save_pretrained`]

      Args:
          save_directory (`str` or `os.PathLike`): Directory to write into.
          file_names (`tuple[str, ...]`): Files already saved by the caller; they are put first in the result.
          legacy_format (`bool`, *optional*): Must not be `False` here, only the legacy format is supported.
          filename_prefix (`str`, *optional*): Prefix (followed by `-`) for the added tokens file name.

      Returns:
          `tuple[str, ...]`: `file_names`, then the files reported by `save_vocabulary`, then the added tokens file
          path. That last path is always included, even when no added tokens file was written.

      Raises:
          ValueError: If `legacy_format` is `False`.
      """
      if legacy_format is False:
          raise ValueError(
              "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
          )

      save_directory = str(save_directory)
      name_prefix = filename_prefix + "-" if filename_prefix else ""
      added_tokens_file = os.path.join(save_directory, name_prefix + ADDED_TOKENS_FILE)

      # the new get_added_vocab() also returns special tokens and tokens that have an index < vocab_size,
      # so keep only the entries that sit beyond the base vocabulary
      added_vocab = {}
      for token, token_index in self.added_tokens_encoder.items():
          if token_index >= self.vocab_size:
              added_vocab[token] = token_index

      if added_vocab:
          with open(added_tokens_file, "w", encoding="utf-8") as added_tokens_handle:
              serialized = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
              added_tokens_handle.write(serialized)
              logger.info(f"added tokens file saved in {added_tokens_file}")

      vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
      saved_files = file_names + vocab_files + (added_tokens_file,)
      return saved_files
  def clean_up_tokenization(self, text: str) -> str:
      """
      Remove the stray spaces that tokenization leaves before punctuation marks and English contractions.

      This method is mostly for remote code support.

      Args:
          text (`str`): The text to clean up.

      Returns:
          `str`: The cleaned-up text.
      """
      # Applied one after another, each on the result of the previous one, so the order is significant.
      substitutions = (
          (" .", "."),
          (" ?", "?"),
          (" !", "!"),
          (" ,", ","),
          (" ' ", "'"),
          (" n't", "n't"),
          (" 'm", "'m"),
          (" 's", "'s"),
          (" 've", "'ve"),
          (" 're", "'re"),
      )
      for spaced, compact in substitutions:
          text = text.replace(spaced, compact)
      return text
  def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
      """
      Write the tokenizer's vocabulary files to a directory.

      Only the vocabulary is concerned: the configuration and the special token mappings are not written by this
      method. Use [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

      The base class provides no implementation; concrete tokenizers must override this method.

      Args:
          save_directory (`str`):
              The directory in which to save the vocabulary.
          filename_prefix (`str`, *optional*):
              An optional prefix to add to the names of the saved files.

      Returns:
          `tuple(str)`: Paths to the files saved.

      Raises:
          NotImplementedError: Always, in this base implementation.
      """
      raise NotImplementedError()
  def tokenize(self, text: str, pair: str | None = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
      """
      Split a string into its sequence of tokens, with unknown tokens replaced by the `unk_token`.

      The base class provides no implementation; concrete tokenizers must override this method.

      Args:
          text (`str`):
              The sequence to be encoded.
          pair (`str`, *optional*):
              A second sequence to be encoded with the first.
          add_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to add the special tokens associated with the corresponding model.
          kwargs (additional keyword arguments, *optional*):
              Will be passed to the underlying model specific encode method. See details in
              [`~PreTrainedTokenizerBase.__call__`]

      Returns:
          `list[str]`: The list of tokens.

      Raises:
          NotImplementedError: Always, in this base implementation.
      """
      raise NotImplementedError()
  @add_end_docstrings(
      ENCODE_KWARGS_DOCSTRING,
      """
          **kwargs: Passed along to the `.tokenize()` method.
      """,
      """
      Returns:
          `list[int]`, `torch.Tensor`, or `np.ndarray`: The tokenized ids of the text.
      """,
  )
  def encode(
      self,
      text: TextInput | PreTokenizedInput | EncodedInput,
      text_pair: TextInput | PreTokenizedInput | EncodedInput | None = None,
      add_special_tokens: bool = True,
      padding: bool | str | PaddingStrategy = False,
      truncation: bool | str | TruncationStrategy | None = None,
      max_length: int | None = None,
      stride: int = 0,
      padding_side: str | None = None,
      return_tensors: str | TensorType | None = None,
      **kwargs,
  ) -> list[int]:
      """
      Encode a sequence (or a pair of sequences) and return only its token ids.

      The padding/truncation arguments are first resolved into strategies, the work is then delegated to
      `_encode_plus`, and the `"input_ids"` entry of its result is returned.

      Args:
          text (`str`, `list[str]` or `list[int]`):
              The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
              `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
              method).
          text_pair (`str`, `list[str]` or `list[int]`, *optional*):
              Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
              the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
              method).
      """
      resolved = self._get_padding_truncation_strategies(
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          **kwargs,
      )
      padding_strategy, truncation_strategy, max_length, remaining_kwargs = resolved
      kwargs.update(remaining_kwargs)

      return self._encode_plus(
          text,
          text_pair=text_pair,
          add_special_tokens=add_special_tokens,
          padding_strategy=padding_strategy,
          truncation_strategy=truncation_strategy,
          max_length=max_length,
          stride=stride,
          padding_side=padding_side,
          return_tensors=return_tensors,
          **kwargs,
      )["input_ids"]
  def num_special_tokens_to_add(self, pair: bool = False) -> int:
      """
      Number of special tokens added around a single sequence (`pair=False`) or a pair of sequences (`pair=True`).

      `max_len_single_sentence` and `max_len_sentences_pair` subtract this value from `model_max_length`.
      The base class provides no implementation; concrete tokenizers must override this method.

      Args:
          pair (`bool`, *optional*, defaults to `False`):
              Whether the count is requested for a pair of sequences rather than a single one.

      Returns:
          `int`: The number of special tokens.

      Raises:
          NotImplementedError: Always, in this base implementation.
      """
      raise NotImplementedError()
  @property
  def max_len_single_sentence(self) -> int:
      """
      `int`: The maximum length of a sentence that can be fed to the model.

      Computed as `model_max_length` minus the number of special tokens added to a single sequence.
      """
      return self.model_max_length - self.num_special_tokens_to_add(pair=False)

  @max_len_single_sentence.setter
  def max_len_single_sentence(self, value) -> None:
      """
      Backward-compatibility setter: the value is computed automatically and cannot really be changed.

      Assigning exactly the computed value is accepted (with a one-time deprecation warning when `verbose` is
      set); assigning anything else raises a `ValueError`.
      """
      # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
      expected = self.model_max_length - self.num_special_tokens_to_add(pair=False)
      if value != expected:
          raise ValueError(
              "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
          )
      # `verbose` only decides whether the deprecation is reported; it must not turn a valid assignment
      # into an error.
      if self.verbose and not self.deprecation_warnings.get("max_len_single_sentence", False):
          logger.warning(
              "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
          )
          self.deprecation_warnings["max_len_single_sentence"] = True
  @property
  def max_len_sentences_pair(self) -> int:
      """
      `int`: The maximum combined length of a pair of sentences that can be fed to the model.

      Computed as `model_max_length` minus the number of special tokens added to a pair of sequences.
      """
      return self.model_max_length - self.num_special_tokens_to_add(pair=True)

  @max_len_sentences_pair.setter
  def max_len_sentences_pair(self, value) -> None:
      """
      Backward-compatibility setter: the value is computed automatically and cannot really be changed.

      Assigning exactly the computed value is accepted (with a one-time deprecation warning when `verbose` is
      set); assigning anything else raises a `ValueError`.
      """
      # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
      expected = self.model_max_length - self.num_special_tokens_to_add(pair=True)
      if value != expected:
          raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
      # `verbose` only decides whether the deprecation is reported; it must not turn a valid assignment
      # into an error.
      if self.verbose and not self.deprecation_warnings.get("max_len_sentences_pair", False):
          logger.warning(
              "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
          )
          self.deprecation_warnings["max_len_sentences_pair"] = True
  def _get_padding_truncation_strategies(
      self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
  ):
      """
      Resolve the user-facing `padding` / `truncation` / `max_length` arguments into concrete strategies.

      Args:
          padding: `False` (no padding), `True` (pad to the longest sequence), a `PaddingStrategy`, or a value
              accepted by the `PaddingStrategy` constructor.
          truncation: `None`/`False` (no truncation), `True` (longest first), a `TruncationStrategy`, or a value
              accepted by the `TruncationStrategy` constructor.
          max_length: Requested maximum length. When `None` and needed, `self.model_max_length` is used, unless
              it exceeds `LARGE_INTEGER`, in which case the corresponding strategy is switched off.
          pad_to_multiple_of: Only used to validate that `max_length` is a multiple of it when both padding and
              truncation are active.
          verbose: Whether to warn when `max_length` is given with `padding=True` and no truncation.
          kwargs: Extra keyword arguments, returned as-is.

      Returns:
          A 4-tuple `(padding_strategy, truncation_strategy, max_length, kwargs)`.

      Raises:
          ValueError: If padding is requested but the tokenizer has no usable padding token, or if padding and
              truncation are both active and `max_length` is not a multiple of `pad_to_multiple_of`.
      """
      # Backward compatibility for previous behavior:
      # a `max_length` given on its own activates truncation to that length
      if padding is False and truncation is None and max_length is not None:
          truncation = "longest_first"

      # Padding strategy
      if padding is False:
          padding_strategy = PaddingStrategy.DO_NOT_PAD
      elif padding is True:
          if (
              verbose
              and max_length is not None
              and (truncation is None or truncation is False or truncation == "do_not_truncate")
          ):
              warnings.warn(
                  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                  "To pad to max length, use `padding='max_length'`."
              )
          # Default to pad to the longest sequence in the batch
          padding_strategy = PaddingStrategy.LONGEST
      elif isinstance(padding, PaddingStrategy):
          padding_strategy = padding
      else:
          padding_strategy = PaddingStrategy(padding)

      # Truncation strategy
      if truncation is False or truncation is None:
          truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
      elif truncation is True:
          # Default to truncate the longest sequences in pairs of inputs
          truncation_strategy = TruncationStrategy.LONGEST_FIRST
      elif isinstance(truncation, TruncationStrategy):
          truncation_strategy = truncation
      else:
          truncation_strategy = TruncationStrategy(truncation)

      # Fall back on the model maximum length when none was given; a model without a real limit
      # (length above LARGE_INTEGER) disables the strategy that would have needed one
      if max_length is None:
          if padding_strategy == PaddingStrategy.MAX_LENGTH:
              if self.model_max_length > LARGE_INTEGER:
                  padding_strategy = PaddingStrategy.DO_NOT_PAD
              else:
                  max_length = self.model_max_length

          if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
              if self.model_max_length > LARGE_INTEGER:
                  truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
              else:
                  max_length = self.model_max_length

      will_pad = padding_strategy != PaddingStrategy.DO_NOT_PAD
      will_truncate = truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE

      # Padding needs a padding token
      if will_pad and (self.pad_token is None or self.pad_token_id < 0):
          raise ValueError(
              "Asking to pad but the tokenizer does not have a padding token. "
              "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
              "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
          )

      # With both active, the truncation length has to be a multiple of pad_to_multiple_of
      if (
          will_truncate
          and will_pad
          and pad_to_multiple_of is not None
          and max_length is not None
          and (max_length % pad_to_multiple_of != 0)
      ):
          raise ValueError(
              "Truncation and padding are both activated but "
              f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
          )

      return padding_strategy, truncation_strategy, max_length, kwargs
  @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  def __call__(
      self,
      text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
      text_pair: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
      text_target: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
      text_pair_target: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
      add_special_tokens: bool = True,
      padding: bool | str | PaddingStrategy = False,
      truncation: bool | str | TruncationStrategy | None = None,
      max_length: int | None = None,
      stride: int = 0,
      is_split_into_words: bool = False,
      pad_to_multiple_of: int | None = None,
      padding_side: str | None = None,
      return_tensors: str | TensorType | None = None,
      return_token_type_ids: bool | None = None,
      return_attention_mask: bool | None = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_offsets_mapping: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      tokenizer_kwargs: dict[str, Any] | None = None,
      **kwargs,
  ) -> BatchEncoding:
      """
      Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
      sequences.

      At least one of `text` and `text_target` must be given. When both are given, the `input_ids` of the target
      encoding are stored under the `"labels"` key of the returned encoding. A `max_target_length` keyword
      argument, if provided, replaces `max_length` for the target encoding.

      Args:
          text (`str`, `list[str]`, `list[list[str]]`, *optional*):
              The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
              (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
              `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
          text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
              The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
              (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
              `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
          text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
              The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
              list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
              you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
          text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
              The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
              list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
              you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
          tokenizer_kwargs (`dict[str, Any]`, *optional*):
              Additional kwargs to pass to the tokenizer. They are merged on top of the named parameters of this
              method (an entry here overrides the named parameter of the same name), and the remaining `**kwargs`
              are merged last.
      """
      # Gather every option in a single dict, to avoid duplicating the argument list for the input and target calls
      all_kwargs = {
          "add_special_tokens": add_special_tokens,
          "padding": padding,
          "truncation": truncation,
          "max_length": max_length,
          "stride": stride,
          "is_split_into_words": is_split_into_words,
          "pad_to_multiple_of": pad_to_multiple_of,
          "padding_side": padding_side,
          "return_tensors": return_tensors,
          "return_token_type_ids": return_token_type_ids,
          "return_attention_mask": return_attention_mask,
          "return_overflowing_tokens": return_overflowing_tokens,
          "return_special_tokens_mask": return_special_tokens_mask,
          "return_offsets_mapping": return_offsets_mapping,
          "return_length": return_length,
          "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens),
          "verbose": verbose,
      }

      max_target_length = kwargs.pop("max_target_length", None)

      # Merge order: named parameters, then tokenizer_kwargs, then the other kwargs (later entries win)
      if tokenizer_kwargs is not None:
          all_kwargs.update(tokenizer_kwargs)
      all_kwargs.update(kwargs)

      if text is None and text_target is None:
          raise ValueError("You need to specify either `text` or `text_target`.")

      # padding / truncation / max_length are removed from `all_kwargs`: they are passed on in resolved form
      padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
          padding=all_kwargs.pop("padding", False),
          truncation=all_kwargs.pop("truncation", None),
          max_length=all_kwargs.pop("max_length", None),
          pad_to_multiple_of=all_kwargs.get("pad_to_multiple_of"),
          verbose=all_kwargs.get("verbose", True),
          **kwargs,
      )

      try:
          if text is not None:
              # The context manager will send the inputs as normal texts and not text_target, but we shouldn't
              # change the input mode in this case.
              if not self._in_target_context_manager and hasattr(self, "_switch_to_input_mode"):
                  self._switch_to_input_mode()
              encodings = self._encode_plus(
                  text=text,
                  text_pair=text_pair,
                  padding_strategy=padding_strategy,
                  truncation_strategy=truncation_strategy,
                  max_length=max_length,
                  **all_kwargs,
              )
          if text_target is not None:
              if hasattr(self, "_switch_to_target_mode"):
                  self._switch_to_target_mode()
              target_encodings = self._encode_plus(
                  text=text_target,
                  text_pair=text_pair_target,
                  padding_strategy=padding_strategy,
                  truncation_strategy=truncation_strategy,
                  max_length=max_target_length if max_target_length is not None else max_length,
                  **all_kwargs,
              )
      finally:
          # Leave back tokenizer in input mode, also when encoding raised: otherwise a failure while encoding
          # the targets would leave the tokenizer stuck in target mode for the following calls.
          if hasattr(self, "_switch_to_input_mode"):
              self._switch_to_input_mode()

      if text_target is None:
          return encodings
      elif text is None:
          return target_encodings
      else:
          encodings["labels"] = target_encodings["input_ids"]
          return encodings
  def _encode_plus(
      self,
      text: TextInput | PreTokenizedInput | EncodedInput,
      text_pair: TextInput | PreTokenizedInput | EncodedInput | None = None,
      add_special_tokens: bool = True,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
      max_length: int | None = None,
      stride: int = 0,
      is_split_into_words: bool = False,
      pad_to_multiple_of: int | None = None,
      padding_side: str | None = None,
      return_tensors: str | TensorType | None = None,
      return_token_type_ids: bool | None = None,
      return_attention_mask: bool | None = None,
      return_overflowing_tokens: bool = False,
      return_special_tokens_mask: bool = False,
      return_offsets_mapping: bool = False,
      return_length: bool = False,
      verbose: bool = True,
      split_special_tokens: bool = False,
      **kwargs,
  ) -> BatchEncoding:
      """
      Encoding hook that receives already-resolved padding and truncation strategies.

      The encoding entry point of this class calls this method with `text`/`text_pair`, the resolved
      `padding_strategy`, `truncation_strategy` and `max_length`, plus the remaining encoding options as keyword
      arguments. This base implementation only raises; concrete tokenizers are expected to provide the actual
      encoding logic.

      Raises:
          NotImplementedError: Always, in this base implementation.
      """
      raise NotImplementedError
  def pad(
      self,
      encoded_inputs: BatchEncoding
      | list[BatchEncoding]
      | dict[str, EncodedInput]
      | dict[str, list[EncodedInput]]
      | list[dict[str, EncodedInput]],
      padding: bool | str | PaddingStrategy = True,
      max_length: int | None = None,
      pad_to_multiple_of: int | None = None,
      padding_side: str | None = None,
      return_attention_mask: bool | None = None,
      return_tensors: str | TensorType | None = None,
      verbose: bool = True,
  ) -> BatchEncoding:
      """
      Pad one encoded example, or a batch of encoded examples, either up to a given length or up to the length of
      the longest sequence in the batch.

      The padding side and the ids used for padding are defined at the tokenizer level (`self.padding_side`,
      `self.pad_token_id` and `self.pad_token_type_id`).

      With a fast tokenizer, calling the tokenizer directly (`__call__`) is faster than encoding the text first and
      then calling `pad` to obtain a padded encoding.

      <Tip>

      If `encoded_inputs` holds NumPy arrays or PyTorch tensors, the result uses the same tensor type unless a
      different one is requested through `return_tensors`. For PyTorch tensors, the device the tensors were on is
      lost.

      </Tip>

      Args:
          encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`):
              Tokenized inputs. Either a single example ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch
              (list of [`BatchEncoding`], `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`), so the
              method can be used during preprocessing as well as a PyTorch Dataloader collate function.

              Tensors (NumPy arrays or PyTorch tensors) are accepted in place of `list[int]`; see the tip above
              for the resulting return type.
          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
              Padding strategy, one of:

              - `True` or `'longest'` (default): pad to the longest sequence in the batch (no padding when a
                single sequence is provided).
              - `'max_length'`: pad to `max_length`, or to the maximum acceptable input length for the model when
                `max_length` is not provided.
              - `False` or `'do_not_pad'`: no padding (the batch may contain sequences of different lengths).
          max_length (`int`, *optional*):
              Maximum length of the returned list and optionally padding length (see above).
          pad_to_multiple_of (`int`, *optional*):
              If set, the padded length is rounded up to a multiple of this value. This is especially useful to
              enable the use of Tensor Cores on NVIDIA hardware.
          padding_side (`str`, *optional*):
              Side on which padding is applied, `'right'` or `'left'`. Defaults to the tokenizer attribute of the
              same name.
          return_attention_mask (`bool`, *optional*):
              Whether to return the attention mask. When left to `None`, the tokenizer's own default is used.

              [What are attention masks?](../glossary#attention-mask)
          return_tensors (`str` or [`~utils.TensorType`], *optional*):
              If set, return tensors instead of lists of python integers. Acceptable values are:

              - `'pt'`: Return PyTorch `torch.Tensor` objects.
              - `'np'`: Return Numpy `np.ndarray` objects.
          verbose (`bool`, *optional*, defaults to `True`):
              Whether or not to print more information and warnings.

      Returns:
          [`BatchEncoding`]: The padded inputs. When the main model input is `None` or empty, the (collated)
          `encoded_inputs` object is returned as is, with an empty `attention_mask` added if
          `return_attention_mask` is truthy.

      Raises:
          ValueError: If the main model input (`self.model_input_names[0]`) is missing from `encoded_inputs`, or
              if its elements are neither python objects, NumPy arrays nor PyTorch tensors.
          AssertionError: If the entries of a batch do not all have the same batch size.
      """
      # A list of per-example mappings (what a Dataloader hands to a collate_fn) is first turned into a single
      # mapping of lists.
      if (
          isinstance(encoded_inputs, (list, tuple))
          and len(encoded_inputs) > 0
          and isinstance(encoded_inputs[0], Mapping)
      ):
          examples = encoded_inputs
          encoded_inputs = {}
          # `.keys()` is called explicitly for compatibility with TensorDict and other Mapping subclasses
          for key in examples[0].keys():
              encoded_inputs[key] = [example[key] for example in examples]

      # The model's main input (usually `input_ids`) must be part of what we are asked to pad
      if self.model_input_names[0] not in encoded_inputs:
          raise ValueError(
              "You should supply an encoding or a list of encodings to this method "
              f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
          )

      required_input = encoded_inputs[self.model_input_names[0]]

      # Nothing to pad: hand the inputs back untouched (apart from an optional empty attention mask)
      if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
          if return_attention_mask:
              encoded_inputs["attention_mask"] = []
          return encoded_inputs

      # Tensors/arrays are converted to python objects for padding and rebuilt at the end when `return_tensors`
      # is not given. The device of PyTorch tensors is lost along the way.
      first_element = required_input[0]
      if isinstance(first_element, (list, tuple)):
          # Sequences can be empty in edge cases: look at the first item of the first non-empty sequence.
          # If every sequence is empty, `first_element` stays an (empty) list/tuple and there is nothing to do.
          first_element = next((item[0] for item in required_input if len(item) != 0), first_element)

      if not isinstance(first_element, (int, list, tuple)):
          if is_torch_tensor(first_element):
              inferred_tensor_type = "pt"
          elif isinstance(first_element, np.ndarray):
              inferred_tensor_type = "np"
          else:
              raise ValueError(
                  f"type of {first_element} unknown: {type(first_element)}. "
                  "Should be one of a python, numpy, or pytorch object."
              )
          if return_tensors is None:
              return_tensors = inferred_tensor_type

          for name, raw_value in encoded_inputs.items():
              encoded_inputs[name] = to_py_obj(raw_value)

      # Resolve `padding` into a PaddingStrategy (and possibly a default `max_length`)
      padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
          padding=padding, max_length=max_length, verbose=verbose
      )

      required_input = encoded_inputs[self.model_input_names[0]]

      # Single example: the main input is a flat sequence rather than a sequence of sequences
      if required_input and not isinstance(required_input[0], (list, tuple)):
          padded_example = self._pad(
              encoded_inputs,
              max_length=max_length,
              padding_strategy=padding_strategy,
              pad_to_multiple_of=pad_to_multiple_of,
              padding_side=padding_side,
              return_attention_mask=return_attention_mask,
          )
          return BatchEncoding(padded_example, tensor_type=return_tensors)

      batch_size = len(required_input)
      assert all(len(v) == batch_size for v in encoded_inputs.values()), (
          "Some items in the output dictionary have a different batch size than others."
      )

      # "Longest" is resolved once for the whole batch, then every example is padded to that fixed length
      if padding_strategy == PaddingStrategy.LONGEST:
          max_length = max(len(sequence) for sequence in required_input)
          padding_strategy = PaddingStrategy.MAX_LENGTH

      batch_outputs = {}
      for index in range(batch_size):
          example = {name: column[index] for name, column in encoded_inputs.items()}
          padded_example = self._pad(
              example,
              max_length=max_length,
              padding_strategy=padding_strategy,
              pad_to_multiple_of=pad_to_multiple_of,
              padding_side=padding_side,
              return_attention_mask=return_attention_mask,
          )
          for name, padded_value in padded_example.items():
              batch_outputs.setdefault(name, []).append(padded_value)

      return BatchEncoding(batch_outputs, tensor_type=return_tensors)
  def _pad(
      self,
      encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
      max_length: int | None = None,
      padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
      pad_to_multiple_of: int | None = None,
      padding_side: str | None = None,
      return_attention_mask: bool | None = None,
  ) -> dict:
      """
      Pad a single encoded example (on the left or on the right) up to a target length.

      `encoded_inputs` is modified in place and also returned. This method never truncates: when the example is
      already longer than the target length, nothing is added.

      Args:
          encoded_inputs:
              Dictionary of tokenized inputs for one example. It must contain the main model input
              (`self.model_input_names[0]`).
          max_length:
              Target length of the padded sequences. Ignored (replaced by the example's own length) when
              `padding_strategy` is `PaddingStrategy.LONGEST`.
          padding_strategy:
              PaddingStrategy to use for padding.

              - PaddingStrategy.LONGEST: use the length of this example as the target length
              - PaddingStrategy.MAX_LENGTH: pad to `max_length`
              - PaddingStrategy.DO_NOT_PAD: do not pad (default)
          pad_to_multiple_of:
              (optional) If set, the target length is rounded up to a multiple of this value.
          padding_side:
              The side on which padding is applied, `'right'` or `'left'`. Defaults to `self.padding_side`.
          return_attention_mask:
              (optional) Whether to create/pad the attention mask. When `None`, it is returned only if
              `"attention_mask"` is one of `self.model_input_names`.

      Returns:
          `dict`: The same `encoded_inputs` object, padded.

      Raises:
          ValueError: If padding is needed and the padding side is neither `'right'` nor `'left'`.
      """
      # Load from model defaults
      if return_attention_mask is None:
          return_attention_mask = "attention_mask" in self.model_input_names

      main_input_name = self.model_input_names[0]
      required_input = encoded_inputs[main_input_name]

      if padding_strategy == PaddingStrategy.LONGEST:
          max_length = len(required_input)

      # Round the target length up to the next multiple of `pad_to_multiple_of`
      if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
          max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

      needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

      # Initialize attention mask if not present.
      if return_attention_mask and "attention_mask" not in encoded_inputs:
          encoded_inputs["attention_mask"] = [1] * len(required_input)

      if not needs_to_be_padded:
          return encoded_inputs

      # A negative difference (example longer than the target) yields empty fillers, i.e. no change.
      difference = max_length - len(required_input)
      padding_side = padding_side if padding_side is not None else self.padding_side
      if padding_side not in ("right", "left"):
          raise ValueError(f"Invalid padding side: {padding_side}")

      def _extend(values, fill_value):
          # Add `difference` copies of `fill_value` on the selected side of `values`.
          filler = [fill_value] * difference
          return values + filler if padding_side == "right" else filler + values

      if return_attention_mask:
          encoded_inputs["attention_mask"] = _extend(encoded_inputs["attention_mask"], 0)
      if "token_type_ids" in encoded_inputs:
          encoded_inputs["token_type_ids"] = _extend(encoded_inputs["token_type_ids"], self.pad_token_type_id)
      if "special_tokens_mask" in encoded_inputs:
          # Padding positions are flagged as special tokens
          encoded_inputs["special_tokens_mask"] = _extend(encoded_inputs["special_tokens_mask"], 1)
      encoded_inputs[main_input_name] = _extend(required_input, self.pad_token_id)

      return encoded_inputs
  def convert_tokens_to_string(self, tokens: list[str]) -> str:
      """
      Join a sequence of tokens back into one string.

      `" ".join(tokens)` is the simplest way to do so, but implementations usually also want to remove sub-word
      tokenization artifacts while joining. This base implementation only raises.

      Args:
          tokens (`list[str]`): The tokens to join into a string.

      Returns:
          `str`: The joined tokens.

      Raises:
          NotImplementedError: Always, in this base implementation.
      """
      raise NotImplementedError
  def decode(
      self,
      token_ids: int | list[int] | list[list[int]] | np.ndarray | torch.Tensor,
      skip_special_tokens: bool = False,
      **kwargs,
  ) -> str | list[str]:
      """
      Converts a sequence of ids into a string, or a list of sequences into a list of strings,
      using the tokenizer and vocabulary with options to remove special tokens and clean up
      tokenization spaces.

      Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

      Args:
          token_ids (`Union[int, list[int], list[list[int]], np.ndarray, torch.Tensor]`):
              A single sequence or a batch (list of sequences) of tokenized input ids. Can be obtained using the
              `__call__` method.
          skip_special_tokens (`bool`, *optional*, defaults to `False`):
              Whether or not to remove special tokens in the decoding.
          kwargs (additional keyword arguments, *optional*):
              Will be passed to the underlying model specific decode method. This includes
              `clean_up_tokenization_spaces`: when it is not given, the default of the underlying `_decode`
              method applies, for single sequences and for batches alike.

      Returns:
          `Union[str, list[str]]`: The decoded string for a single sequence, or a list of decoded strings for a
          batch of sequences.
      """
      # Convert inputs to python lists
      token_ids = to_py_obj(token_ids)

      # A non-empty list/tuple whose first item is itself a list/tuple is treated as a batch
      is_batched = (
          isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple))
      )
      if is_batched:
          # Forward kwargs untouched so that an omitted `clean_up_tokenization_spaces` is resolved by `_decode`
          # exactly as in the single-sequence path below (no hard-coded `False` for batches).
          return [
              self._decode(
                  token_ids=seq,
                  skip_special_tokens=skip_special_tokens,
                  **kwargs,
              )
              for seq in token_ids
          ]

      return self._decode(
          token_ids=token_ids,
          skip_special_tokens=skip_special_tokens,
          **kwargs,
      )
  2441. def batch_decode(
  2442. self,
  2443. sequences: list[int] | list[list[int]] | np.ndarray | torch.Tensor,
  2444. skip_special_tokens: bool = False,
  2445. clean_up_tokenization_spaces: bool | None = None,
  2446. **kwargs,
  2447. ) -> list[str]:
  2448. """
  2449. Convert a list of lists of token ids into a list of strings by calling decode.
  2450. This method is provided for backwards compatibility. The `decode` method now handles batched input natively,
  2451. so you can use `decode` directly instead of `batch_decode`.
  2452. Args:
  2453. sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
  2454. List of tokenized input ids. Can be obtained using the `__call__` method.
  2455. skip_special_tokens (`bool`, *optional*, defaults to `False`):
  2456. Whether or not to remove special tokens in the decoding.
  2457. clean_up_tokenization_spaces (`bool`, *optional*):
  2458. Whether or not to clean up the tokenization spaces. If `None`, will default to
  2459. `self.clean_up_tokenization_spaces`.
  2460. kwargs (additional keyword arguments, *optional*):
  2461. Will be passed to the underlying model specific decode method.
  2462. Returns:
  2463. `list[str]`: The list of decoded sentences.
  2464. """
  2465. # Forward to decode() which now handles batched input natively
  2466. result = self.decode(
  2467. token_ids=sequences,
  2468. skip_special_tokens=skip_special_tokens,
  2469. clean_up_tokenization_spaces=clean_up_tokenization_spaces,
  2470. **kwargs,
  2471. )
  2472. # Ensure we always return a list for backwards compatibility
  2473. if isinstance(result, str):
  2474. return [result]
  2475. return result
  2476. def _decode(
  2477. self,
  2478. token_ids: int | list[int],
  2479. skip_special_tokens: bool = False,
  2480. clean_up_tokenization_spaces: bool | None = None,
  2481. **kwargs,
  2482. ) -> str:
  2483. raise NotImplementedError
  2484. def _eventual_warn_about_too_long_sequence(self, ids: list[int], max_length: int | None, verbose: bool):
  2485. """
  2486. Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
  2487. corresponding model
  2488. Args:
  2489. ids (`list[str]`): The ids produced by the tokenization
  2490. max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
  2491. verbose (`bool`): Whether or not to print more information and warnings.
  2492. """
  2493. if max_length is None and len(ids) > self.model_max_length and verbose and self.model_max_length != 0:
  2494. if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
  2495. logger.warning(
  2496. "Token indices sequence length is longer than the specified maximum sequence length "
  2497. f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
  2498. "will result in indexing errors"
  2499. )
  2500. self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
  @classmethod
  def register_for_auto_class(cls, auto_class="AutoTokenizer"):
      """
      Record on this class the auto class it should be associated with.

      This should only be used for custom tokenizers, as the ones in the library are already mapped with
      `AutoTokenizer`.

      Args:
          auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
              The auto class to register this new tokenizer with. A class is reduced to its `__name__`.

      Raises:
          ValueError: If `transformers.models.auto` has no attribute named after `auto_class`.
      """
      # Work with the class name from here on
      auto_class = auto_class if isinstance(auto_class, str) else auto_class.__name__

      # Local import: the auto module is only needed for this validation
      import transformers.models.auto as auto_module

      if hasattr(auto_module, auto_class):
          cls._auto_class = auto_class
      else:
          raise ValueError(f"{auto_class} is not a valid auto class.")
  def apply_chat_template(
      self,
      conversation: list[dict[str, str]] | list[list[dict[str, str]]],
      tools: list[dict | Callable] | None = None,
      documents: list[dict[str, str]] | None = None,
      chat_template: str | None = None,
      add_generation_prompt: bool = False,
      continue_final_message: bool = False,
      tokenize: bool = True,
      padding: bool | str | PaddingStrategy = False,
      truncation: bool = False,
      max_length: int | None = None,
      return_tensors: str | TensorType | None = None,
      return_dict: bool = True,
      return_assistant_tokens_mask: bool = False,
      tokenizer_kwargs: dict[str, Any] | None = None,
      **kwargs,
  ) -> str | list[int] | list[str] | list[list[int]] | BatchEncoding:
      """
      Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
      ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
      determine the format and control tokens to use when converting.

      Args:
          conversation (Union[list[dict[str, str]], list[list[dict[str, str]]]]): A list of dicts
              with "role" and "content" keys, representing the chat history so far. A list (or tuple) whose first
              element is itself a list/tuple, or an object with a `messages` attribute, is treated as a batch of
              conversations.
          tools (`list[Union[Dict, Callable]]`, *optional*):
              A list of tools (callable functions) that will be accessible to the model. If the template does not
              support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
              giving the name, description and argument types for the tool. See our
              [tool use guide](https://huggingface.co/docs/transformers/en/chat_extras#passing-tools)
              for more information.
          documents (`list[dict[str, str]]`, *optional*):
              A list of dicts representing documents that will be accessible to the model if it is performing RAG
              (retrieval-augmented generation). If the template does not support RAG, this argument will have no
              effect. We recommend that each document should be a dict containing "title" and "text" keys.
          chat_template (`str`, *optional*):
              A Jinja template to use for this conversion. It is usually not necessary to pass anything to this
              argument, as the model's template will be used by default.
          add_generation_prompt (bool, *optional*):
              If this is set, a prompt with the token(s) that indicate the start of an assistant message will be
              appended to the formatted output. This is useful when you want to generate a response from the
              model. Note that this argument will be passed to the chat template, and so it must be supported in
              the template for this argument to have any effect.
          continue_final_message (bool, *optional*):
              If this is set, the chat will be formatted so that the final message in the chat is open-ended,
              without any EOS tokens. The model will continue this message rather than starting a new one. This
              allows you to "prefill" part of the model's response for it. Cannot be used at the same time as
              `add_generation_prompt` or `return_assistant_tokens_mask`.
          tokenize (`bool`, defaults to `True`):
              Whether to tokenize the output. If `False`, the output will be a string.
          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
              Select a strategy to pad the returned sequences (according to the model's padding side and padding
              index) among:

              - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence is provided).
              - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
              - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                lengths).
          truncation (`bool`, defaults to `False`):
              Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
          max_length (`int`, *optional*):
              Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
              not specified, the tokenizer's `max_length` attribute will be used as a default.
          return_tensors (`str` or [`~utils.TensorType`], *optional*):
              If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
              values are:

              - `'pt'`: Return PyTorch `torch.Tensor` objects.
              - `'np'`: Return NumPy `np.ndarray` objects.
          return_dict (`bool`, defaults to `True`):
              Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
          return_assistant_tokens_mask (`bool`, defaults to `False`):
              Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
              the mask will contain 1. For user and system tokens, the mask will contain 0.
              This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
              Requires `return_dict=True` and `tokenize=True`.
          tokenizer_kwargs (`dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer.
          **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template.

      Returns:
          `Union[list[int], Dict]`: A list of token ids representing the tokenized chat so far, including control tokens. This
          output is ready to pass to the model, either directly or via methods like `generate()`. If `return_dict` is
          set, will return a dict of tokenizer outputs instead. If `tokenize` is `False`, the rendered chat string
          (or list of strings for a batch) is returned.

      Raises:
          ValueError: If `return_assistant_tokens_mask` is set without both `return_dict` and `tokenize`, or if
              `continue_final_message` is combined with `add_generation_prompt` or `return_assistant_tokens_mask`.
      """
      if not tokenize:
          return_dict = False  # dicts are only returned by the tokenizer anyway

      if return_assistant_tokens_mask and not (return_dict and tokenize):
          raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`")

      if tokenizer_kwargs is None:
          tokenizer_kwargs = {}

      chat_template = self.get_chat_template(chat_template, tools)

      # Normalize to a list of conversations and remember whether the caller passed a batch
      if isinstance(conversation, (list, tuple)) and (
          isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
      ):
          conversations = conversation
          is_batched = True
      else:
          conversations = [conversation]
          is_batched = False

      if continue_final_message:
          if add_generation_prompt:
              raise ValueError(
                  "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
              )
          if return_assistant_tokens_mask:
              raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")

      template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
      rendered_chat, generation_indices = render_jinja_template(
          conversations=conversations,
          tools=tools,
          documents=documents,
          chat_template=chat_template,
          return_assistant_tokens_mask=return_assistant_tokens_mask,
          continue_final_message=continue_final_message,
          add_generation_prompt=add_generation_prompt,
          **template_kwargs,
      )

      if not is_batched:
          rendered_chat = rendered_chat[0]

      if not tokenize:
          return rendered_chat

      # The template already contains the control tokens, hence `add_special_tokens=False`
      out = self(
          rendered_chat,
          padding=padding,
          truncation=truncation,
          max_length=max_length,
          add_special_tokens=False,
          return_tensors=return_tensors,
          **tokenizer_kwargs,
      )
      if not return_dict:
          return out["input_ids"]

      if return_assistant_tokens_mask:
          assistant_masks = []
          # Un-batched outputs without tensors are flat: wrap them so both cases are iterated the same way
          if is_batched or return_tensors:
              input_ids = out["input_ids"]
          else:
              input_ids = [out["input_ids"]]

          for i, sequence_ids in enumerate(input_ids):
              sequence_length = len(sequence_ids)
              current_mask = [0] * sequence_length
              for assistant_start_char, assistant_end_char in generation_indices[i]:
                  start_token = out.char_to_token(i, assistant_start_char)
                  end_token = out.char_to_token(i, assistant_end_char - 1)
                  if start_token is None:
                      # start_token is out of bounds maybe due to truncation.
                      break
                  # `end_token` is None when the end of the span has no token (e.g. truncated away): mask up to
                  # the end of the sequence. Token index 0 is a valid end, so compare against None explicitly
                  # instead of relying on truthiness.
                  mask_end = end_token + 1 if end_token is not None else sequence_length
                  for token_id in range(start_token, mask_end):
                      current_mask[token_id] = 1
              assistant_masks.append(current_mask)

          if not is_batched and not return_tensors:
              assistant_masks = assistant_masks[0]

          out["assistant_masks"] = assistant_masks

          # The mask was added as python lists: convert it along with the rest if tensors were requested
          if return_tensors:
              out.convert_to_tensors(tensor_type=return_tensors)

      return out
  2671. def encode_message_with_chat_template(
  2672. self,
  2673. message: dict[str, str],
  2674. conversation_history: list[dict[str, str]] | None = None,
  2675. **kwargs,
  2676. ) -> list[int]:
  2677. """
  2678. Tokenize a single message. This method is a convenience wrapper around `apply_chat_template` that allows you
  2679. to tokenize messages one by one. This is useful for things like token-by-token streaming.
  2680. This method is not guaranteed to be perfect. For some models, it may be impossible to robustly tokenize
  2681. single messages. For example, if the chat template adds tokens after each message, but also has a prefix that
  2682. is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token.
  2683. In these cases, this method will do its best to find the correct tokenization, but it may not be perfect.
  2684. **Note:** This method does not support `add_generation_prompt`. If you want to add a generation prompt,
  2685. you should do it separately after tokenizing the conversation.
  2686. Args:
  2687. message (`dict`):
  2688. A dictionary with "role" and "content" keys, representing the message to tokenize.
  2689. conversation_history (`list[dict]`, *optional*):
  2690. A list of dicts with "role" and "content" keys, representing the chat history so far. If you are
  2691. tokenizing messages one by one, you should pass the previous messages in the conversation here.
  2692. **kwargs:
  2693. Additional kwargs to pass to the `apply_chat_template` method.
  2694. Returns:
  2695. `list[int]`: A list of token ids representing the tokenized message.
  2696. """
  2697. if "add_generation_prompt" in kwargs:
  2698. raise ValueError(
  2699. "`encode_message_with_chat_template` does not support `add_generation_prompt`. Please add the generation prompt "
  2700. "separately."
  2701. )
  2702. if conversation_history is None or len(conversation_history) == 0:
  2703. return self.apply_chat_template(
  2704. [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
  2705. )
  2706. conversation = conversation_history + [message]
  2707. tokens = self.apply_chat_template(
  2708. conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
  2709. )
  2710. prefix_tokens = self.apply_chat_template(
  2711. conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
  2712. )
  2713. # It's possible that the prefix tokens are not a prefix of the full list of tokens.
  2714. # For example, if the prefix is `<s>User: Hi` and the full conversation is `<s>User: Hi</s><s>Assistant: Hello`.
  2715. # In this case, we can't simply find the prefix, so we have to do something a bit more subtle.
  2716. # We look for the first place where the tokens differ, and that's our split point.
  2717. # This is not perfect, but it's the best we can do without a token-level API.
  2718. # To make this more robust, we could do a diff and find the longest common subsequence, but this is
  2719. # a good first approximation.
  2720. # This is particularly important for models like Llama3 that have changed their chat template to include
  2721. # EOS tokens after user messages.
  2722. min_len = min(len(prefix_tokens), len(tokens))
  2723. for i in range(min_len):
  2724. if prefix_tokens[i] != tokens[i]:
  2725. return tokens[i:]
  2726. return tokens[min_len:]
  2727. def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
  2728. """
  2729. Retrieve the chat template string used for tokenizing chat messages. This template is used
  2730. internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
  2731. template for better generation tracking.
  2732. Args:
  2733. chat_template (`str`, *optional*):
  2734. A Jinja template or the name of a template to use for this conversion.
  2735. It is usually not necessary to pass anything to this argument,
  2736. as the model's template will be used by default.
  2737. tools (`list[Dict]`, *optional*):
  2738. A list of tools (callable functions) that will be accessible to the model. If the template does not
  2739. support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
  2740. giving the name, description and argument types for the tool. See our
  2741. [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
  2742. for more information.
  2743. Returns:
  2744. `str`: The chat template string.
  2745. """
  2746. # First, handle the cases when the model has a dict of multiple templates
  2747. if isinstance(self.chat_template, dict):
  2748. template_dict = self.chat_template
  2749. if chat_template is not None and chat_template in template_dict:
  2750. # The user can pass the name of a template to the chat template argument instead of an entire template
  2751. chat_template = template_dict[chat_template]
  2752. elif chat_template is None:
  2753. if tools is not None and "tool_use" in template_dict:
  2754. chat_template = template_dict["tool_use"]
  2755. elif "default" in template_dict:
  2756. chat_template = template_dict["default"]
  2757. else:
  2758. raise ValueError(
  2759. "This model has multiple chat templates with no default specified! Please either pass a chat "
  2760. "template or the name of the template you wish to use to the `chat_template` argument. Available "
  2761. f"template names are {sorted(template_dict.keys())}."
  2762. )
  2763. elif chat_template is None:
  2764. # These are the cases when the model has a single template
  2765. # priority: `chat_template` argument > `tokenizer.chat_template`
  2766. if self.chat_template is not None:
  2767. chat_template = self.chat_template
  2768. else:
  2769. raise ValueError(
  2770. "Cannot use chat template functions because tokenizer.chat_template is not set and no template "
  2771. "argument was passed! For information about writing templates and setting the "
  2772. "tokenizer.chat_template attribute, please see the documentation at "
  2773. "https://huggingface.co/docs/transformers/main/en/chat_templating"
  2774. )
  2775. return chat_template
  2776. def save_chat_templates(
  2777. self,
  2778. save_directory: str | os.PathLike,
  2779. tokenizer_config: dict,
  2780. filename_prefix: str | None,
  2781. save_jinja_files: bool,
  2782. ):
  2783. """
  2784. Writes chat templates out to the save directory if we're using the new format, and removes them from
  2785. the tokenizer config if present. If we're using the legacy format, it doesn't write any files, and instead
  2786. writes the templates to the tokenizer config in the correct format.
  2787. """
  2788. chat_template_file = os.path.join(
  2789. save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
  2790. )
  2791. chat_template_dir = os.path.join(
  2792. save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_DIR
  2793. )
  2794. saved_raw_chat_template_files = []
  2795. if save_jinja_files and isinstance(self.chat_template, str):
  2796. # New format for single templates is to save them as chat_template.jinja
  2797. with open(chat_template_file, "w", encoding="utf-8") as f:
  2798. f.write(self.chat_template)
  2799. logger.info(f"chat template saved in {chat_template_file}")
  2800. saved_raw_chat_template_files.append(chat_template_file)
  2801. if "chat_template" in tokenizer_config:
  2802. tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too
  2803. elif save_jinja_files and isinstance(self.chat_template, dict):
  2804. # New format for multiple templates is to save the default as chat_template.jinja
  2805. # and the other templates in the chat_templates/ directory
  2806. for template_name, template in self.chat_template.items():
  2807. if template_name == "default":
  2808. with open(chat_template_file, "w", encoding="utf-8") as f:
  2809. f.write(self.chat_template["default"])
  2810. logger.info(f"chat template saved in {chat_template_file}")
  2811. saved_raw_chat_template_files.append(chat_template_file)
  2812. else:
  2813. Path(chat_template_dir).mkdir(exist_ok=True)
  2814. template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja")
  2815. with open(template_filepath, "w", encoding="utf-8") as f:
  2816. f.write(template)
  2817. logger.info(f"chat template saved in {template_filepath}")
  2818. saved_raw_chat_template_files.append(template_filepath)
  2819. if "chat_template" in tokenizer_config:
  2820. tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too
  2821. elif isinstance(self.chat_template, dict):
  2822. # Legacy format for multiple templates:
  2823. # chat template dicts are saved to the config as lists of dicts with fixed key names.
  2824. tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
  2825. elif self.chat_template is not None:
  2826. # Legacy format for single templates: Just make them a key in tokenizer_config.json
  2827. tokenizer_config["chat_template"] = self.chat_template
  2828. return tokenizer_config, saved_raw_chat_template_files
  2829. def parse_response(
  2830. self,
  2831. response: str | list[str | int | list[int]] | np.ndarray | torch.Tensor,
  2832. schema: list | dict | None = None,
  2833. ):
  2834. """
  2835. Converts an output string created by generating text from a model into a parsed message dictionary.
  2836. This method is intended for use with chat models, and will read the tokenizer's `response_schema` attribute to
  2837. control parsing, although this can be overridden by passing a `response_schema` argument directly.
  2838. Args:
  2839. response (`str`):
  2840. The output string generated by the model. This can be either a decoded string or list of strings,
  2841. or token IDs as a list/array.
  2842. schema (`Union[list, dict]`, *optional*):
  2843. A response schema that indicates the expected output format and how parsing should be performed.
  2844. If not provided, the tokenizer's `response_schema` attribute will be used.
  2845. """
  2846. batched = (
  2847. (isinstance(response, list) and not isinstance(response[0], int))
  2848. or getattr(response, "ndim", 0) > 1 # For torch/numpy tensors
  2849. )
  2850. if schema is None:
  2851. if getattr(self, "response_schema", None) is None:
  2852. raise AttributeError("This tokenizer does not have a `response_schema` for parsing chat responses!")
  2853. schema = self.response_schema
  2854. if batched:
  2855. if not (isinstance(response, list) and isinstance(response[0], str)):
  2856. response = self.batch_decode(response)
  2857. return [recursive_parse(single_response, schema) for single_response in response]
  2858. else:
  2859. if not isinstance(response, str):
  2860. response = self.decode(response)
  2861. return recursive_parse(response, schema)
  2862. def get_fast_tokenizer_file(tokenization_files: list[str]) -> str:
  2863. """
  2864. Get the tokenization file to use for this version of transformers.
  2865. Args:
  2866. tokenization_files (`list[str]`): The list of available configuration files.
  2867. Returns:
  2868. `str`: The tokenization file to use.
  2869. """
  2870. tokenizer_files_map = {}
  2871. for file_name in tokenization_files:
  2872. search = _re_tokenizer_file.search(file_name)
  2873. if search is not None:
  2874. v = search.groups()[0]
  2875. tokenizer_files_map[v] = file_name
  2876. available_versions = sorted(tokenizer_files_map.keys())
  2877. # Defaults to FULL_TOKENIZER_FILE and then try to look at some newer versions.
  2878. tokenizer_file = FULL_TOKENIZER_FILE
  2879. transformers_version = version.parse(__version__)
  2880. for v in available_versions:
  2881. if version.parse(v) <= transformers_version:
  2882. tokenizer_file = tokenizer_files_map[v]
  2883. else:
  2884. # No point going further since the versions are sorted.
  2885. break
  2886. return tokenizer_file
  2887. # Shared helper to locate a SentencePiece model file for a repo/path
  2888. def find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
  2889. """
  2890. Find any .model file (SentencePiece model) in the model directory or Hub repo.
  2891. Tries known filenames first ("tokenizer.model", "spm.model"), then scans local dir,
  2892. and as a last resort lists files on the Hub to find any .model.
  2893. Returns the filename (str) relative to the repo root or directory if found, else None.
  2894. """
  2895. from .utils.hub import has_file
  2896. # Try common names first
  2897. for candidate in ("tokenizer.model", "spm.model"):
  2898. try:
  2899. if has_file(
  2900. pretrained_model_name_or_path,
  2901. candidate,
  2902. revision=kwargs.get("revision"),
  2903. token=kwargs.get("token"),
  2904. cache_dir=kwargs.get("cache_dir"),
  2905. local_files_only=kwargs.get("local_files_only", False),
  2906. ):
  2907. return candidate
  2908. except Exception:
  2909. # TODO: tighten to OSError / ProxyError
  2910. continue
  2911. subfolder = kwargs.get("subfolder", "")
  2912. local_files_only = kwargs.get("local_files_only", False)
  2913. # Local directory scan
  2914. if os.path.isdir(pretrained_model_name_or_path):
  2915. dir_path = (
  2916. os.path.join(pretrained_model_name_or_path, subfolder) if subfolder else pretrained_model_name_or_path
  2917. )
  2918. if os.path.isdir(dir_path):
  2919. for filename in os.listdir(dir_path):
  2920. if filename.endswith(".model"):
  2921. return filename if not subfolder else os.path.join(subfolder, filename)
  2922. # Hub listing if allowed
  2923. if not local_files_only:
  2924. try:
  2925. from huggingface_hub import list_repo_tree
  2926. entries = list_repo_tree(
  2927. repo_id=pretrained_model_name_or_path,
  2928. revision=kwargs.get("revision"),
  2929. path_in_repo=subfolder if subfolder else None,
  2930. recursive=False,
  2931. token=kwargs.get("token"),
  2932. )
  2933. for entry in entries:
  2934. if entry.path.endswith(".model"):
  2935. return entry.path if not subfolder else entry.path.removeprefix(f"{subfolder}/")
  2936. except Exception as e:
  2937. # TODO: tighten exception class
  2938. logger.debug(f"Could not list Hub repository files: {e}")
  2939. return None
  2940. def load_vocab_and_merges(pretrained_model_name_or_path, **kwargs):
  2941. """
  2942. Resolve and load tokenizer vocabulary files from a repo/path.
  2943. Priority order:
  2944. 1. Load ``vocab.json`` (WordLevel/WordPiece/BPE fast tokenizers)
  2945. 2. Load ``vocab.txt`` when only a WordPiece vocab is available
  2946. 3. Optionally load ``merges.txt`` (BPE tokenizers)
  2947. Returns:
  2948. tuple (vocab: dict|None, merges: list[tuple[str,str]]|None, files_loaded: list[str])
  2949. """
  2950. files_loaded = []
  2951. vocab = None
  2952. merges = None
  2953. try:
  2954. resolved_vocab_file = cached_file(
  2955. pretrained_model_name_or_path,
  2956. "vocab.json",
  2957. cache_dir=kwargs.get("cache_dir"),
  2958. force_download=kwargs.get("force_download", False),
  2959. proxies=kwargs.get("proxies"),
  2960. token=kwargs.get("token"),
  2961. revision=kwargs.get("revision"),
  2962. local_files_only=kwargs.get("local_files_only", False),
  2963. subfolder=kwargs.get("subfolder", ""),
  2964. )
  2965. except Exception:
  2966. resolved_vocab_file = None
  2967. if resolved_vocab_file is not None:
  2968. try:
  2969. with open(resolved_vocab_file, "r", encoding="utf-8") as vf:
  2970. vocab = json.load(vf)
  2971. files_loaded.append("vocab.json")
  2972. except Exception:
  2973. vocab = None
  2974. # Fallback to vocab.txt (WordPiece-style vocabularies)
  2975. if vocab is None:
  2976. try:
  2977. resolved_vocab_txt = cached_file(
  2978. pretrained_model_name_or_path,
  2979. "vocab.txt",
  2980. cache_dir=kwargs.get("cache_dir"),
  2981. force_download=kwargs.get("force_download", False),
  2982. proxies=kwargs.get("proxies"),
  2983. token=kwargs.get("token"),
  2984. revision=kwargs.get("revision"),
  2985. local_files_only=kwargs.get("local_files_only", False),
  2986. subfolder=kwargs.get("subfolder", ""),
  2987. )
  2988. except Exception:
  2989. resolved_vocab_txt = None
  2990. if resolved_vocab_txt is not None:
  2991. try:
  2992. vocab = OrderedDict()
  2993. with open(resolved_vocab_txt, "r", encoding="utf-8") as vf:
  2994. for index, token in enumerate(vf):
  2995. token = token.rstrip("\n")
  2996. vocab[token] = index
  2997. files_loaded.append("vocab.txt")
  2998. except Exception:
  2999. vocab = None
  3000. try:
  3001. resolved_merges_file = cached_file(
  3002. pretrained_model_name_or_path,
  3003. "merges.txt",
  3004. cache_dir=kwargs.get("cache_dir"),
  3005. force_download=kwargs.get("force_download", False),
  3006. proxies=kwargs.get("proxies"),
  3007. token=kwargs.get("token"),
  3008. revision=kwargs.get("revision"),
  3009. local_files_only=kwargs.get("local_files_only", False),
  3010. subfolder=kwargs.get("subfolder", ""),
  3011. )
  3012. except Exception:
  3013. resolved_merges_file = None
  3014. if resolved_merges_file is not None:
  3015. try:
  3016. merges = []
  3017. with open(resolved_merges_file, "r", encoding="utf-8") as mf:
  3018. for line in mf:
  3019. line = line.strip()
  3020. if line and not line.startswith("#"):
  3021. parts = line.split()
  3022. if len(parts) == 2:
  3023. merges.append((parts[0], parts[1]))
  3024. files_loaded.append("merges.txt")
  3025. except Exception:
  3026. merges = None
  3027. return vocab, merges, files_loaded
# To update the docstring, we need to copy the method, otherwise we change the original docstring.
# The copy is installed on `PreTrainedTokenizerBase` first, so the `.format(...)` call below only rewrites the
# docstring of this class's own `push_to_hub`.
PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
# The docstring is used as a format template: its `{object}`, `{object_class}` and `{object_files}` fields are
# filled in with tokenizer-specific wording. The `None` check skips this when the method has no docstring
# (presumably e.g. when docstrings are stripped with `python -OO`).
if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None:
    PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
        object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
    )
  3034. def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
  3035. if add_prefix_space:
  3036. prepend_scheme = "always"
  3037. if not getattr(original_tokenizer, "legacy", True):
  3038. prepend_scheme = "first"
  3039. else:
  3040. prepend_scheme = "never"
  3041. return prepend_scheme
  3042. def generate_merges(vocab, vocab_scores: dict[str, float] | None = None, skip_tokens: Collection[str] | None = None):
  3043. skip_tokens = set(skip_tokens) if skip_tokens is not None else set()
  3044. reverse = vocab_scores is not None
  3045. vocab_scores = dict(vocab_scores) if reverse else vocab
  3046. merges = []
  3047. for merge, piece_score in vocab_scores.items():
  3048. if merge in skip_tokens:
  3049. continue
  3050. local = []
  3051. for index in range(1, len(merge)):
  3052. piece_l, piece_r = merge[:index], merge[index:]
  3053. if piece_l in skip_tokens or piece_r in skip_tokens:
  3054. continue
  3055. if piece_l in vocab and piece_r in vocab:
  3056. local.append((piece_l, piece_r, piece_score))
  3057. local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
  3058. merges.extend(local)
  3059. merges = sorted(merges, key=lambda val: (val[2], len(val[0]), len(val[1])), reverse=reverse)
  3060. merges = [(val[0], val[1]) for val in merges]
  3061. return merges