| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors
- from tokenizers.models import BPE
- from transformers.convert_slow_tokenizer import bytes_to_unicode
- from transformers.tokenization_utils_tokenizers import PreTrainedTokenizerFast
- class MistralConverter:
- """
- A general tiktoken converter.
- """
- def __init__(
- self,
- vocab=None,
- pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
- add_prefix_space=False,
- additional_special_tokens=None,
- **kwargs,
- ):
- self.vocab = vocab
- self.pattern = pattern
- self.add_prefix_space = add_prefix_space
- self.additional_special_tokens = additional_special_tokens
- def extract_vocab_merges_from_model(self, vocab: str):
- bpe_ranks = vocab
- byte_encoder = bytes_to_unicode()
- def token_bytes_to_string(b):
- return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
- merges = []
- vocab = {}
- for idx, (token, rank) in enumerate(bpe_ranks.items()):
- if token not in self.additional_special_tokens:
- vocab[token_bytes_to_string(token)] = idx
- if len(token) == 1:
- continue
- local = []
- for index in range(1, len(token)):
- piece_l, piece_r = token[:index], token[index:]
- if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
- local.append((piece_l, piece_r, rank))
- local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False)
- merges.extend(local)
- else:
- vocab[token] = idx
- merges = sorted(merges, key=lambda val: val[2], reverse=False)
- merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
- return vocab, merges
- def tokenizer(self):
- vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab)
- tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
- if hasattr(tokenizer.model, "ignore_merges"):
- tokenizer.model.ignore_merges = True
- return tokenizer
- def converted(self) -> Tokenizer:
- tokenizer = self.tokenizer()
- tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
- [
- pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
- pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
- ]
- )
- tokenizer.decoder = decoders.ByteLevel()
- tokenizer.add_special_tokens(self.additional_special_tokens)
- tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
- return tokenizer
def convert_tekken_tokenizer(tokenizer_file: str):
    """
    Convert a "tekken" tokenizer file into a `PreTrainedTokenizerFast`.

    The file is loaded with `mistral_common`; its special tokens (ordered by
    their "rank" field) are numbered from 0 and merged with the regular
    vocabulary, and the result is run through `MistralConverter`. Afterwards
    the bos/eos/pad/unk roles are assigned for those `SpecialTokens` values
    that are present among the file's special tokens.

    Args:
        tokenizer_file: Path passed to `MistralTokenizer.from_file`.

    Returns:
        The converted `PreTrainedTokenizerFast`.
    """
    # Imported lazily: only the Tekken format needs mistral-common.
    from mistral_common.tokens.tokenizers.base import SpecialTokens
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

    # Let mistral-common parse the file, then reach for the inner tokenizer.
    tekken = MistralTokenizer.from_file(tokenizer_file).instruct_tokenizer.tokenizer

    # Regular (non-special) vocabulary and the special tokens ordered by rank.
    regular_vocab = tekken._tekken_token2id_nospecial
    ranked_specials = sorted(tekken._all_special_tokens, key=lambda entry: entry["rank"])
    special_names = [entry["token_str"] for entry in ranked_specials]

    # Special tokens are numbered first; regular entries are layered on top.
    full_vocab = {name: position for position, name in enumerate(special_names)}
    full_vocab.update(regular_vocab)

    # TODO(juliendenize): expose this in mistral-common to avoid accessing private attributes
    # and improve maintainability
    split_pattern = tekken._model._pat_str

    # Build the backend tokenizer and wrap it.
    converter = MistralConverter(vocab=full_vocab, additional_special_tokens=special_names, pattern=split_pattern)
    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=converter.converted())

    # Register every special token, then assign the well-known roles.
    fast_tokenizer.add_special_tokens({"additional_special_tokens": special_names})
    role_to_token = {
        "bos_token": SpecialTokens.bos.value,
        "eos_token": SpecialTokens.eos.value,
        "pad_token": SpecialTokens.pad.value,
        "unk_token": SpecialTokens.unk.value,
    }
    for role, token_str in role_to_token.items():
        if token_str not in special_names:
            continue
        fast_tokenizer.add_special_tokens({role: token_str})

    return fast_tokenizer
|