| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
- import json
- import os
- from typing import Iterator, List, Optional, Union, Tuple
- from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
- from tokenizers.models import Unigram
- from .base_tokenizer import BaseTokenizer
- class SentencePieceUnigramTokenizer(BaseTokenizer):
- """SentencePiece Unigram Tokenizer
- Represents the Unigram algorithm, with the pretokenization used by SentencePiece
- """
- def __init__(
- self,
- vocab: Optional[List[Tuple[str, float]]] = None,
- replacement: str = "▁",
- add_prefix_space: bool = True,
- ):
- if vocab is not None:
- # Let Unigram(..) fail if only one of them is None
- tokenizer = Tokenizer(Unigram(vocab))
- else:
- tokenizer = Tokenizer(Unigram())
- tokenizer.normalizer = normalizers.Sequence(
- [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
- )
- prepend_scheme = "always" if add_prefix_space else "never"
- tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
- tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
- parameters = {
- "model": "SentencePieceUnigram",
- "replacement": replacement,
- "add_prefix_space": add_prefix_space,
- }
- super().__init__(tokenizer, parameters)
- def train(
- self,
- files: Union[str, List[str]],
- vocab_size: int = 8000,
- show_progress: bool = True,
- special_tokens: Optional[List[Union[str, AddedToken]]] = None,
- initial_alphabet: Optional[List[str]] = None,
- unk_token: Optional[str] = None,
- ):
- """
- Train the model using the given files
- Args:
- files (:obj:`List[str]`):
- A list of path to the files that we should use for training
- vocab_size (:obj:`int`):
- The size of the final vocabulary, including all tokens and alphabet.
- show_progress (:obj:`bool`):
- Whether to show progress bars while training.
- special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
- A list of special tokens the model should know of.
- initial_alphabet (:obj:`List[str]`, `optional`):
- A list of characters to include in the initial alphabet, even
- if not seen in the training dataset.
- If the strings contain more than one character, only the first one
- is kept.
- unk_token (:obj:`str`, `optional`):
- The unknown token to be used by the model.
- """
- if special_tokens is None:
- special_tokens = []
- if initial_alphabet is None:
- initial_alphabet = []
- trainer = trainers.UnigramTrainer(
- vocab_size=vocab_size,
- special_tokens=special_tokens,
- show_progress=show_progress,
- initial_alphabet=initial_alphabet,
- unk_token=unk_token,
- )
- if isinstance(files, str):
- files = [files]
- self._tokenizer.train(files, trainer=trainer)
- def train_from_iterator(
- self,
- iterator: Union[Iterator[str], Iterator[Iterator[str]]],
- vocab_size: int = 8000,
- show_progress: bool = True,
- special_tokens: Optional[List[Union[str, AddedToken]]] = None,
- initial_alphabet: Optional[List[str]] = None,
- unk_token: Optional[str] = None,
- length: Optional[int] = None,
- ):
- """
- Train the model using the given iterator
- Args:
- iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
- Any iterator over strings or list of strings
- vocab_size (:obj:`int`):
- The size of the final vocabulary, including all tokens and alphabet.
- show_progress (:obj:`bool`):
- Whether to show progress bars while training.
- special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
- A list of special tokens the model should know of.
- initial_alphabet (:obj:`List[str]`, `optional`):
- A list of characters to include in the initial alphabet, even
- if not seen in the training dataset.
- If the strings contain more than one character, only the first one
- is kept.
- unk_token (:obj:`str`, `optional`):
- The unknown token to be used by the model.
- length (:obj:`int`, `optional`):
- The total number of sequences in the iterator. This is used to
- provide meaningful progress tracking
- """
- if special_tokens is None:
- special_tokens = []
- if initial_alphabet is None:
- initial_alphabet = []
- trainer = trainers.UnigramTrainer(
- vocab_size=vocab_size,
- special_tokens=special_tokens,
- show_progress=show_progress,
- initial_alphabet=initial_alphabet,
- unk_token=unk_token,
- )
- self._tokenizer.train_from_iterator(
- iterator,
- trainer=trainer,
- length=length,
- )
- @staticmethod
- def from_spm(filename: str):
- try:
- import sys
- sys.path.append(".")
- import sentencepiece_model_pb2 as model # type: ignore[import]
- except Exception:
- raise Exception(
- "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
- )
- m = model.ModelProto()
- m.ParseFromString(open(filename, "rb").read())
- precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
- vocab = [(piece.piece, piece.score) for piece in m.pieces]
- unk_id = m.trainer_spec.unk_id
- model_type = m.trainer_spec.model_type
- byte_fallback = m.trainer_spec.byte_fallback
- if model_type != 1:
- raise Exception(
- "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
- )
- replacement = "▁"
- add_prefix_space = True
- tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
- if precompiled_charsmap:
- tokenizer.normalizer = normalizers.Sequence(
- [
- normalizers.Precompiled(precompiled_charsmap),
- normalizers.Replace(Regex(" {2,}"), " "),
- ]
- )
- else:
- tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
- prepend_scheme = "always" if add_prefix_space else "never"
- tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
- tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
- parameters = {
- "model": "SentencePieceUnigram",
- }
- obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters) # type: ignore[arg-type]
- BaseTokenizer.__init__(obj, tokenizer, parameters)
- return obj
|