ggml.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814
  1. # Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991)
  2. # https://github.com/99991/pygguf
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """
  16. Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf
  17. with extra methods being exposed
  18. """
  19. from array import array
  20. import numpy as np
  21. from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
  22. from tokenizers.models import BPE, Unigram
  23. from .. import AddedToken
  24. from ..convert_slow_tokenizer import GemmaConverter, GPT2Converter, LlamaConverter, Qwen2Converter, T5Converter
  25. from ..utils import logging
  26. from ..utils.logging import tqdm
  27. logger = logging.get_logger(__name__)
# Per-section translation table from GGUF metadata key suffixes to config attribute names.
# The outer key is a GGUF section: "general", "tokenizer", or a model architecture name.
# Each inner dict maps a metadata key within that section to the attribute name used on the
# config side.
# NOTE(review): a value of `None` presumably marks a GGUF key that is recognised but has no
# config counterpart and is skipped by the loader — confirm against the code that consumes
# this table.
GGUF_CONFIG_MAPPING = {
    "general": {
        "architecture": "model_type",
        "name": "_model_name_or_path",
    },
    "llama": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
        "rope.dimension_count": "head_dim",
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "mistral": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
        "rope.dimension_count": "head_dim",
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "qwen2": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "qwen2_moe": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
        "expert_count": "num_experts",
        "expert_used_count": "num_experts_per_tok",
    },
    "lfm2": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
        "shortconv.l_cache": "conv_L_cache",
    },
    "qwen3": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "qwen3_moe": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.key_length": "head_dim",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
        "expert_count": "num_experts",
        "expert_used_count": "num_experts_per_tok",
    },
    "falcon": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "tokenizer": {
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
    },
    "phi3": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "bloom": {
        "block_count": "n_layer",
        "embedding_length": "hidden_size",
        "attention.head_count": "n_head",
        "vocab_size": "vocab_size",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
    },
    "t5": {
        "context_length": "n_positions",
        "block_count": "num_layers",
        "feed_forward_length": "d_ff",
        "embedding_length": "d_model",
        "attention.key_length": "d_kv",
        "attention.head_count": "num_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
        "attention.relative_buckets_count": "relative_attention_num_buckets",
        "decoder_start_token_id": "decoder_start_token_id",
        "vocab_size": "vocab_size",
    },
    "stablelm": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "layer_norm_eps",
        "vocab_size": "vocab_size",
    },
    "gpt2": {
        "block_count": "n_layer",
        "context_length": "n_ctx",
        "embedding_length": "n_embd",
        "feed_forward_length": "feed_forward_length",
        "attention.head_count": "n_head",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
    },
    "starcoder2": {
        "block_count": "num_hidden_layers",
        "context_length": "max_position_embeddings",
        "embedding_length": "hidden_size",
        "feed_forward_length": "intermediate_size",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "norm_epsilon",
    },
    "mamba": {
        "vocab_size": "vocab_size",
        "context_length": "max_position_embeddings",
        "embedding_length": "hidden_size",
        "attention.layer_norm_rms_epsilon": "layer_norm_epsilon",
        "block_count": "num_hidden_layers",
        "ssm.conv_kernel": "conv_kernel",
        "ssm.state_size": "state_size",
        "ssm.time_step_rank": "time_step_rank",
        "ssm.inner_size": "intermediate_size",
    },
    "nemotron": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "norm_eps",
        "vocab_size": "vocab_size",
    },
    "gemma2": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        # NOTE: Gemma2 has key_length==value_length==head_dim
        # See: https://github.com/ggerganov/llama.cpp/blob/2e2f8f093cd4fb6bbb87ba84f6b9684fa082f3fa/convert_hf_to_gguf.py#L3293-L3294
        "attention.key_length": "head_dim",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "attention.sliding_window": "sliding_window",
        "vocab_size": "vocab_size",
    },
    "gemma3": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        # NOTE: Gemma3 has key_length==value_length==head_dim
        # See: https://github.com/ggml-org/llama.cpp/blob/fe5b78c89670b2f37ecb216306bed3e677b49d9f/convert_hf_to_gguf.py#L3495-L3496
        "attention.key_length": "head_dim",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "attention.sliding_window": "sliding_window",
        "vocab_size": "vocab_size",
    },
    "umt5": {
        "context_length": "n_positions",
        "block_count": "num_layers",
        "feed_forward_length": "d_ff",
        "embedding_length": "d_model",
        "attention.key_length": "d_kv",
        "attention.head_count": "num_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_epsilon": "layer_norm_epsilon",
        "attention.relative_buckets_count": "relative_attention_num_buckets",
        "decoder_start_token_id": "decoder_start_token_id",
        "vocab_size": "vocab_size",
    },
    "deci": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": None,
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "vocab_size": "vocab_size",
    },
    "minimax_m2": {
        "context_length": "max_position_embeddings",
        "block_count": "num_hidden_layers",
        "feed_forward_length": "intermediate_size",
        "embedding_length": "hidden_size",
        "rope.dimension_count": "rotary_dim",
        "rope.freq_base": "rope_theta",
        "attention.head_count": "num_attention_heads",
        "attention.head_count_kv": "num_key_value_heads",
        "attention.key_length": "head_dim",
        "attention.value_length": None,
        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
        "expert_count": "num_local_experts",
        "expert_used_count": "num_experts_per_tok",
        "expert_feed_forward_length": None,
        "vocab_size": "vocab_size",
        "expert_gating_func": "scoring_func",
    },
}
# Translation table from GGUF tokenizer metadata keys to the attribute names used when
# building the tokenizer ("tokenizer") and its configuration ("tokenizer_config").
# The "tokenizer" targets (tokens, scores, merges, token_type, ...) are the attribute
# names read from the skeleton object by the converter classes in this module.
GGUF_TOKENIZER_MAPPING = {
    "tokenizer": {
        "ggml.model": "tokenizer_type",
        "ggml.tokens": "tokens",
        "ggml.scores": "scores",
        "ggml.token_type": "token_type",
        "ggml.merges": "merges",
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
        "ggml.add_space_prefix": "add_prefix_space",
    },
    "tokenizer_config": {
        "chat_template": "chat_template",
        "ggml.model": "model_type",
        "ggml.bos_token_id": "bos_token_id",
        "ggml.eos_token_id": "eos_token_id",
        "ggml.unknown_token_id": "unk_token_id",
        "ggml.padding_token_id": "pad_token_id",
    },
}
# Per-architecture config values to apply as defaults when loading from GGUF.
# We only need to set here the parameters that default to different values between transformers and llamacpp.
GGUF_CONFIG_DEFAULTS_MAPPING = {
    "qwen3_moe": {
        # NOTE: Qwen3MoeConfig defaults to false but llama.cpp needs this to be true.
        # See: https://github.com/ggml-org/llama.cpp/blob/17f7f4baad8b3a716ee139da7bb56ae984e8c0fa/src/models/qwen3moe.cpp#L85-L96
        # (the parameter right after LLM_FFN_SILU corresponds to norm_topk_prob)
        "norm_topk_prob": True,
    },
    "minimax_m2": {
        # MiniMax-M2 uses routing bias (e_score_correction_bias) for MoE expert selection,
        # but this is not stored in GGUF metadata. Set it as default so the model weights
        # (which include e_score_correction_bias tensors) are loaded correctly.
        "use_routing_bias": True,
    },
}
  339. def _gguf_parse_value(_value, data_type):
  340. if not isinstance(data_type, list):
  341. data_type = [data_type]
  342. if len(data_type) == 1:
  343. data_type = data_type[0]
  344. array_data_type = None
  345. else:
  346. if data_type[0] != 9:
  347. raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
  348. data_type, array_data_type = data_type
  349. if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
  350. _value = int(_value[0])
  351. elif data_type in [6, 12]:
  352. _value = float(_value[0])
  353. elif data_type == 7:
  354. _value = bool(_value[0])
  355. elif data_type == 8:
  356. _value = array("B", list(_value)).tobytes().decode()
  357. elif data_type == 9:
  358. _value = _gguf_parse_value(_value, array_data_type)
  359. return _value
  360. class GGUFTokenizerSkeleton:
  361. def __init__(self, dict_):
  362. for k, v in dict_.items():
  363. setattr(self, k, v)
  364. if not hasattr(self, "merges"):
  365. if not hasattr(self, "tokens") or not hasattr(self, "scores"):
  366. raise ValueError(
  367. "tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated."
  368. )
  369. tokens = self.tokens
  370. scores = self.scores
  371. vocab = {t: scores[i] for i, t in enumerate(tokens)}
  372. logger.warning("Merges were not in checkpoint, building merges on the fly.")
  373. merges = []
  374. for merge, piece_score in tqdm(vocab.items()):
  375. local = []
  376. for index in range(1, len(merge)):
  377. piece_l, piece_r = merge[:index], merge[index:]
  378. if piece_l in tokens and piece_r in tokens:
  379. local.append((piece_l, piece_r, piece_score))
  380. local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]), reverse=True)
  381. merges.extend(local)
  382. merges = sorted(merges, key=lambda val: val[2], reverse=True)
  383. merges = [(val[0], val[1]) for val in merges]
  384. self.merges = merges
  385. else:
  386. self.merges = [tuple(merge.split(" ")) for merge in self.merges]
  387. if not hasattr(self, "scores"):
  388. self.scores = [None for _ in range(len(self.tokens))]
  389. if not hasattr(self, "added_tokens"):
  390. self.added_tokens = []
  391. if not hasattr(self, "unk_token_id"):
  392. self.unk_token_id = None
  393. # Llama2 uses the field `unknown_token_id`
  394. if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
  395. self.unk_token_id = self.unknown_token_id
  396. class GGUFLlamaConverter(LlamaConverter):
  397. def __init__(self, tokenizer_dict):
  398. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  399. self.original_tokenizer = self.proto
  400. self.additional_kwargs = {}
  401. self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
  402. def vocab(self, proto):
  403. return list(zip(proto.tokens, proto.scores))
  404. def merges(self, proto):
  405. return proto.merges
  406. def tokenizer(self, proto):
  407. vocab_scores = self.vocab(self.proto)
  408. merges = self.merges(self.proto)
  409. bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
  410. unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
  411. bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
  412. eos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
  413. tokenizer = Tokenizer(
  414. BPE(
  415. bpe_vocab,
  416. merges,
  417. unk_token=unk_token,
  418. fuse_unk=True,
  419. byte_fallback=True,
  420. )
  421. )
  422. special_tokens = []
  423. if not hasattr(self.proto, "token_type"):
  424. if unk_token is not None:
  425. special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
  426. if bos_token is not None:
  427. special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
  428. if eos_token is not None:
  429. special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
  430. else:
  431. # 3 stands for special tokens
  432. special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
  433. for idx in special_tokens_idx:
  434. special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
  435. if len(special_tokens) != 0:
  436. tokenizer.add_special_tokens(special_tokens)
  437. if len(self.proto.added_tokens) != 0:
  438. tokenizer.add_tokens(
  439. [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
  440. )
  441. self.additional_kwargs["unk_token"] = unk_token
  442. self.additional_kwargs["eos_token"] = bos_token
  443. self.additional_kwargs["bos_token"] = eos_token
  444. if self.is_llama_3_tokenizer:
  445. self.additional_kwargs["add_prefix_space"] = None
  446. self.additional_kwargs["clean_up_tokenization_spaces"] = True
  447. self.additional_kwargs["legacy"] = False
  448. self.original_tokenizer.legacy = False
  449. return tokenizer
  450. def decoder(self, replacement, add_prefix_space):
  451. sequence = [
  452. decoders.ByteFallback(),
  453. decoders.Fuse(),
  454. decoders.Replace("▁", " "),
  455. ]
  456. if self.is_llama_3_tokenizer:
  457. sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
  458. if add_prefix_space:
  459. sequence += [decoders.Strip(content=" ", left=1)]
  460. return decoders.Sequence(sequence)
  461. def converted(self):
  462. # Copied partly from converted method in SpmConverter class
  463. tokenizer = self.tokenizer(self.proto)
  464. # Tokenizer assemble
  465. normalizer = self.normalizer(self.proto)
  466. if normalizer is not None:
  467. tokenizer.normalizer = normalizer
  468. replacement = "▁"
  469. add_prefix_space = True
  470. if hasattr(self.original_tokenizer, "add_prefix_space"):
  471. add_prefix_space = self.original_tokenizer.add_prefix_space
  472. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  473. if pre_tokenizer is not None:
  474. tokenizer.pre_tokenizer = pre_tokenizer
  475. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  476. post_processor = self.post_processor()
  477. if post_processor:
  478. tokenizer.post_processor = post_processor
  479. # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
  480. # and normalizer
  481. if self.is_llama_3_tokenizer:
  482. tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
  483. add_prefix_space=False, trim_offsets=False, use_regex=True
  484. )
  485. # This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
  486. # init.
  487. tokenizer.normalizer = normalizers.Sequence([])
  488. return tokenizer
  489. class GGUFQwen2Converter(Qwen2Converter):
  490. def __init__(self, tokenizer_dict):
  491. self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
  492. self.additional_kwargs = {}
  493. def converted(self) -> Tokenizer:
  494. vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
  495. merges = self.original_tokenizer.merges
  496. tokenizer = super().converted(vocab, merges)
  497. tokenizer.add_special_tokens(
  498. [
  499. AddedToken("<|endoftext|>", normalized=False, special=True),
  500. AddedToken("<|im_start|>", normalized=False, special=True),
  501. AddedToken("<|im_end|>", normalized=False, special=True),
  502. ]
  503. )
  504. return tokenizer
  505. class GGUFPhi3Converter(LlamaConverter):
  506. def __init__(self, tokenizer_dict):
  507. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  508. self.original_tokenizer = self.proto
  509. self.additional_kwargs = {}
  510. def vocab(self, proto):
  511. return list(zip(proto.tokens, proto.scores))
  512. def merges(self, proto):
  513. return proto.merges
  514. def tokenizer(self, proto):
  515. vocab_scores = self.vocab(self.proto)
  516. merges = self.merges(self.proto)
  517. bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
  518. tokenizer = Tokenizer(BPE(bpe_vocab, merges))
  519. # add the special tokens from phi3 tokenizer config
  520. tokenizer.add_special_tokens(
  521. [
  522. AddedToken("</s>", rstrip=True, lstrip=False, normalized=False, special=True),
  523. AddedToken("<|endoftext|>", normalized=False, special=True),
  524. AddedToken("<|assistant|>", rstrip=True, normalized=False, special=True),
  525. AddedToken("<|placeholder1|>", rstrip=True, normalized=False, special=True),
  526. AddedToken("<|placeholder2|>", rstrip=True, normalized=False, special=True),
  527. AddedToken("<|placeholder3|>", rstrip=True, normalized=False, special=True),
  528. AddedToken("<|placeholder4|>", rstrip=True, normalized=False, special=True),
  529. AddedToken("<|system|>", rstrip=True, normalized=False, special=True),
  530. AddedToken("<|end|>", rstrip=True, normalized=False, special=True),
  531. AddedToken("<|placeholder5|>", rstrip=True, normalized=False, special=True),
  532. AddedToken("<|placeholder6|>", rstrip=True, normalized=False, special=True),
  533. AddedToken("<|user|>", rstrip=True, normalized=False, special=True),
  534. ]
  535. )
  536. self.additional_kwargs["unk_token"] = (
  537. proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
  538. )
  539. self.additional_kwargs["eos_token"] = (
  540. proto.tokens[proto.eos_token_id] if proto.eos_token_id is not None else None
  541. )
  542. self.additional_kwargs["bos_token"] = (
  543. proto.tokens[proto.bos_token_id] if proto.bos_token_id is not None else None
  544. )
  545. self.additional_kwargs["pad_token"] = (
  546. proto.tokens[proto.pad_token_id] if proto.pad_token_id is not None else None
  547. )
  548. return tokenizer
  549. def decoder(self, replacement, add_prefix_space):
  550. sequence = [
  551. decoders.ByteFallback(),
  552. decoders.Fuse(),
  553. decoders.Replace(replacement, " "),
  554. ]
  555. if add_prefix_space:
  556. sequence += [decoders.Strip(content=" ", left=1)]
  557. return decoders.Sequence(sequence)
  558. def converted(self) -> Tokenizer:
  559. tokenizer = self.tokenizer(self.proto)
  560. replacement = "▁"
  561. add_prefix_space = True
  562. if hasattr(self.original_tokenizer, "add_prefix_space"):
  563. add_prefix_space = self.original_tokenizer.add_prefix_space
  564. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  565. return tokenizer
  566. class GGUFGPTConverter(GPT2Converter):
  567. def __init__(self, tokenizer_dict):
  568. self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
  569. self.additional_kwargs = {}
  570. def converted(self) -> Tokenizer:
  571. vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
  572. merges = self.original_tokenizer.merges
  573. tokenizer = super().converted(vocab, merges)
  574. return tokenizer
  575. class GGUFT5Converter(T5Converter):
  576. def __init__(self, tokenizer_dict):
  577. # set dummy data to avoid unnecessary merges calculation
  578. tokenizer_dict["merges"] = ["dummy text"]
  579. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  580. self.token2id = {k: v for v, k in enumerate(self.proto.tokens)}
  581. self.original_tokenizer = self.proto
  582. self.additional_kwargs = {}
  583. def vocab(self, proto):
  584. return list(zip(proto.tokens, proto.scores))
  585. def normalizer(self, proto):
  586. if getattr(self.original_tokenizer, "legacy", True):
  587. sequence = []
  588. if getattr(self.original_tokenizer, "add_prefix_space", True):
  589. sequence += [normalizers.Prepend(prepend="▁")]
  590. sequence += [normalizers.Replace(pattern=" ", content="▁")]
  591. return normalizers.Sequence(sequence)
  592. return None # non-legacy, no normalizer
  593. def post_processor(self):
  594. return processors.TemplateProcessing(
  595. single=["$A", "</s>"],
  596. pair=["$A", "</s>", "$B", "</s>"],
  597. special_tokens=[
  598. ("</s>", self.token2id["</s>"]),
  599. ],
  600. )
  601. def converted(self) -> Tokenizer:
  602. vocab_scores = self.vocab(self.proto)
  603. tokenizer = Tokenizer(
  604. Unigram(
  605. vocab_scores,
  606. unk_id=self.proto.unk_token_id,
  607. byte_fallback=False,
  608. )
  609. )
  610. # Tokenizer assemble
  611. normalizer = self.normalizer(self.proto)
  612. if normalizer is not None:
  613. tokenizer.normalizer = normalizer
  614. replacement = "▁"
  615. add_prefix_space = True
  616. if hasattr(self.original_tokenizer, "add_prefix_space"):
  617. add_prefix_space = self.original_tokenizer.add_prefix_space
  618. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  619. if pre_tokenizer is not None:
  620. tokenizer.pre_tokenizer = pre_tokenizer
  621. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  622. post_processor = self.post_processor()
  623. if post_processor:
  624. tokenizer.post_processor = post_processor
  625. return tokenizer
  626. class GGUFGemmaConverter(GemmaConverter):
  627. def __init__(self, tokenizer_dict):
  628. # set dummy data to avoid unnecessary merges calculation
  629. tokenizer_dict["merges"] = ["dummy text"]
  630. self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
  631. self.original_tokenizer = self.proto
  632. self.additional_kwargs = {}
  633. def vocab(self, proto):
  634. original_vocab = list(zip(proto.tokens, proto.scores))
  635. updated_vocab = []
  636. for token, score in original_vocab:
  637. if token == "<0x09>":
  638. updated_vocab.append(("\t", score))
  639. elif " " in token and len(token.strip()) == 0:
  640. underscores = "▁" * len(token)
  641. updated_vocab.append((underscores, score))
  642. else:
  643. updated_vocab.append((token, score))
  644. return updated_vocab
  645. def normalizer(self, proto):
  646. return normalizers.Replace(" ", "▁")
  647. def decoder(self, replacement, add_prefix_space):
  648. sequence = [
  649. decoders.Replace("▁", " "),
  650. decoders.ByteFallback(),
  651. decoders.Fuse(),
  652. ]
  653. if add_prefix_space:
  654. sequence += [decoders.Strip(content=" ", left=1)]
  655. return decoders.Sequence(sequence)
  656. def converted(self) -> Tokenizer:
  657. vocab_scores = self.vocab(self.proto)
  658. tokenizer = Tokenizer(
  659. Unigram(
  660. vocab_scores,
  661. unk_id=self.proto.unk_token_id,
  662. byte_fallback=self.handle_byte_fallback,
  663. )
  664. )
  665. normalizer = self.normalizer(self.proto)
  666. if normalizer is not None:
  667. tokenizer.normalizer = normalizer
  668. replacement = "▁"
  669. add_prefix_space = True
  670. if hasattr(self.original_tokenizer, "add_prefix_space"):
  671. add_prefix_space = self.original_tokenizer.add_prefix_space
  672. tokenizer.decoder = self.decoder(replacement, add_prefix_space)
  673. pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
  674. if pre_tokenizer is not None:
  675. tokenizer.pre_tokenizer = pre_tokenizer
  676. return tokenizer
# Architecture name -> converter class that builds the `tokenizers.Tokenizer` for it.
# `convert_gguf_tokenizer` looks the architecture up here and instantiates the class
# with the tokenizer fields dict.
GGUF_TO_FAST_CONVERTERS = {
    "llama": GGUFLlamaConverter,
    "qwen2": GGUFQwen2Converter,
    "qwen2_moe": GGUFQwen2Converter,
    "qwen3": GGUFQwen2Converter,
    "qwen3_moe": GGUFQwen2Converter,
    "phi3": GGUFPhi3Converter,
    "bloom": GGUFGPTConverter,
    "falcon": GGUFGPTConverter,
    "stablelm": GGUFGPTConverter,
    "gpt2": GGUFGPTConverter,
    "starcoder2": GGUFGPTConverter,
    "t5": GGUFT5Converter,
    "mamba": GGUFGPTConverter,
    "nemotron": GGUFGPTConverter,
    "gemma2": GGUFGemmaConverter,
    "gemma3_text": GGUFGemmaConverter,
    "umt5": GGUFT5Converter,
    "deci": GGUFLlamaConverter,
    "decilm": GGUFLlamaConverter,
    "minimax_m2": GGUFQwen2Converter,
}
  699. def convert_gguf_tokenizer(architecture: str, tokenizer_dict) -> tuple[Tokenizer, dict]:
  700. """
  701. Utilities to convert a slow tokenizer instance in a fast tokenizer instance.
  702. Args:
  703. architecture (`str`): The model architecture derived from gguf file.
  704. transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
  705. Instance of a slow tokenizer to convert in the backend tokenizer for
  706. [`~tokenization_utils_base.PreTrainedTokenizerFast`].
  707. Return:
  708. A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
  709. [`~tokenization_utils_base.PreTrainedTokenizerFast`]
  710. """
  711. tokenizer_class_name = architecture
  712. converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
  713. fast_tokenizer = converter.converted()
  714. return fast_tokenizer, converter.additional_kwargs