| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462 |
- # Generated content DO NOT EDIT
class Trainer:
    """
    Base class for all trainers

    This class is not supposed to be instantiated directly. Instead, any implementation of a
    Trainer will return an instance of this class when instantiated.
    """

    def __getstate__(self):
        """Pickle hook that yields the state to serialize (placeholder body, returns ``None``)."""
        pass

    def __setstate__(self, state):
        """Pickle hook that restores from ``state`` (placeholder body, ``state`` is unused)."""
        pass
class BpeTrainer(Trainer):
    """
    Trainer capable of training a BPE model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.

        limit_alphabet (:obj:`int`, `optional`):
            The maximum number of different characters to keep in the alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            A prefix to be used for every subword that is not a beginning-of-word.

        end_of_word_suffix (:obj:`str`, `optional`):
            A suffix to be used for every subword that is an end-of-word.

        max_token_length (:obj:`int`, `optional`):
            Prevents creating tokens longer than the specified size.
            This can help reduce pollution of your vocabulary with
            highly repetitive tokens like `======` for wikipedia

        words (:obj:`dict`, `optional`):
            Accepted by the constructor (default ``{}``) but not described by the
            original documentation.
            NOTE(review): presumably a mapping of already-known words to their
            counts -- confirm against the implementation.
    """

    # The defaults below (including the ``[]`` / ``{}`` literals) are part of the
    # declared signature; this placeholder body never mutates them.
    def __init__(
        self,
        vocab_size=30000,
        min_frequency=0,
        show_progress=True,
        special_tokens=[],
        limit_alphabet=None,
        initial_alphabet=[],
        continuing_subword_prefix=None,
        end_of_word_suffix=None,
        max_token_length=None,
        words={},
    ):
        pass

    def __getstate__(self):
        """Pickle hook that yields the state to serialize (placeholder body, returns ``None``)."""
        pass

    def __setstate__(self, state):
        """Pickle hook that restores from ``state`` (placeholder body, ``state`` is unused)."""
        pass

    @property
    def continuing_subword_prefix(self):
        """Accessor for the ``continuing_subword_prefix`` option described in ``Args``."""
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """Assign ``value`` to the ``continuing_subword_prefix`` option."""
        pass

    @property
    def end_of_word_suffix(self):
        """Accessor for the ``end_of_word_suffix`` option described in ``Args``."""
        pass

    @end_of_word_suffix.setter
    def end_of_word_suffix(self, value):
        """Assign ``value`` to the ``end_of_word_suffix`` option."""
        pass

    @property
    def initial_alphabet(self):
        """Accessor for the ``initial_alphabet`` option described in ``Args``."""
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """Assign ``value`` to the ``initial_alphabet`` option."""
        pass

    @property
    def limit_alphabet(self):
        """Accessor for the ``limit_alphabet`` option described in ``Args``."""
        pass

    @limit_alphabet.setter
    def limit_alphabet(self, value):
        """Assign ``value`` to the ``limit_alphabet`` option."""
        pass

    @property
    def max_token_length(self):
        """Accessor for the ``max_token_length`` option described in ``Args``."""
        pass

    @max_token_length.setter
    def max_token_length(self, value):
        """Assign ``value`` to the ``max_token_length`` option."""
        pass

    @property
    def min_frequency(self):
        """Accessor for the ``min_frequency`` option described in ``Args``."""
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """Assign ``value`` to the ``min_frequency`` option."""
        pass

    @property
    def show_progress(self):
        """Accessor for the ``show_progress`` option described in ``Args``."""
        pass

    @show_progress.setter
    def show_progress(self, value):
        """Assign ``value`` to the ``show_progress`` option."""
        pass

    @property
    def special_tokens(self):
        """Accessor for the ``special_tokens`` option described in ``Args``."""
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """Assign ``value`` to the ``special_tokens`` option."""
        pass

    @property
    def vocab_size(self):
        """Accessor for the ``vocab_size`` option described in ``Args``."""
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """Assign ``value`` to the ``vocab_size`` option."""
        pass
class UnigramTrainer(Trainer):
    """
    Trainer capable of training a Unigram model

    Args:
        vocab_size (:obj:`int`):
            The size of the final vocabulary, including all tokens and alphabet.

        show_progress (:obj:`bool`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.

        initial_alphabet (:obj:`List[str]`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        shrinking_factor (:obj:`float`):
            The shrinking factor used at each step of the training to prune the
            vocabulary.

        unk_token (:obj:`str`):
            The token used for out-of-vocabulary tokens.

        max_piece_length (:obj:`int`):
            The maximum length of a given token.

        n_sub_iterations (:obj:`int`):
            The number of iterations of the EM algorithm to perform before
            pruning the vocabulary.
    """

    # The defaults below (including the ``[]`` literals) are part of the declared
    # signature; this placeholder body never mutates them.
    def __init__(
        self,
        vocab_size=8000,
        show_progress=True,
        special_tokens=[],
        initial_alphabet=[],
        shrinking_factor=0.75,
        unk_token=None,
        max_piece_length=16,
        n_sub_iterations=2,
    ):
        pass

    def __getstate__(self):
        """Pickle hook that yields the state to serialize (placeholder body, returns ``None``)."""
        pass

    def __setstate__(self, state):
        """Pickle hook that restores from ``state`` (placeholder body, ``state`` is unused)."""
        pass

    @property
    def initial_alphabet(self):
        """Accessor for the ``initial_alphabet`` option described in ``Args``."""
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """Assign ``value`` to the ``initial_alphabet`` option."""
        pass

    @property
    def show_progress(self):
        """Accessor for the ``show_progress`` option described in ``Args``."""
        pass

    @show_progress.setter
    def show_progress(self, value):
        """Assign ``value`` to the ``show_progress`` option."""
        pass

    @property
    def special_tokens(self):
        """Accessor for the ``special_tokens`` option described in ``Args``."""
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """Assign ``value`` to the ``special_tokens`` option."""
        pass

    @property
    def vocab_size(self):
        """Accessor for the ``vocab_size`` option described in ``Args``."""
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """Assign ``value`` to the ``vocab_size`` option."""
        pass
class WordLevelTrainer(Trainer):
    """
    Trainer capable of training a WordLevel model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.
            NOTE(review): this wording matches the BPE trainer's description; for a
            word-level model it presumably means the minimum frequency a word needs
            in order to be kept -- confirm against the implementation.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.
    """

    # The ``[]`` default is part of the declared signature; this placeholder body
    # never mutates it.
    def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
        pass

    def __getstate__(self):
        """Pickle hook that yields the state to serialize (placeholder body, returns ``None``)."""
        pass

    def __setstate__(self, state):
        """Pickle hook that restores from ``state`` (placeholder body, ``state`` is unused)."""
        pass

    @property
    def min_frequency(self):
        """Accessor for the ``min_frequency`` option described in ``Args``."""
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """Assign ``value`` to the ``min_frequency`` option."""
        pass

    @property
    def show_progress(self):
        """Accessor for the ``show_progress`` option described in ``Args``."""
        pass

    @show_progress.setter
    def show_progress(self, value):
        """Assign ``value`` to the ``show_progress`` option."""
        pass

    @property
    def special_tokens(self):
        """Accessor for the ``special_tokens`` option described in ``Args``."""
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """Assign ``value`` to the ``special_tokens`` option."""
        pass

    @property
    def vocab_size(self):
        """Accessor for the ``vocab_size`` option described in ``Args``."""
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """Assign ``value`` to the ``vocab_size`` option."""
        pass
class WordPieceTrainer(Trainer):
    """
    Trainer capable of training a WordPiece model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.

        limit_alphabet (:obj:`int`, `optional`):
            The maximum number of different characters to keep in the alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            A prefix to be used for every subword that is not a beginning-of-word.

        end_of_word_suffix (:obj:`str`, `optional`):
            A suffix to be used for every subword that is an end-of-word.
    """

    # The defaults below (including the ``[]`` literals) are part of the declared
    # signature; this placeholder body never mutates them.
    def __init__(
        self,
        vocab_size=30000,
        min_frequency=0,
        show_progress=True,
        special_tokens=[],
        limit_alphabet=None,
        initial_alphabet=[],
        continuing_subword_prefix="##",
        end_of_word_suffix=None,
    ):
        pass

    def __getstate__(self):
        """Pickle hook that yields the state to serialize (placeholder body, returns ``None``)."""
        pass

    def __setstate__(self, state):
        """Pickle hook that restores from ``state`` (placeholder body, ``state`` is unused)."""
        pass

    @property
    def continuing_subword_prefix(self):
        """Accessor for the ``continuing_subword_prefix`` option described in ``Args``."""
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """Assign ``value`` to the ``continuing_subword_prefix`` option."""
        pass

    @property
    def end_of_word_suffix(self):
        """Accessor for the ``end_of_word_suffix`` option described in ``Args``."""
        pass

    @end_of_word_suffix.setter
    def end_of_word_suffix(self, value):
        """Assign ``value`` to the ``end_of_word_suffix`` option."""
        pass

    @property
    def initial_alphabet(self):
        """Accessor for the ``initial_alphabet`` option described in ``Args``."""
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """Assign ``value`` to the ``initial_alphabet`` option."""
        pass

    @property
    def limit_alphabet(self):
        """Accessor for the ``limit_alphabet`` option described in ``Args``."""
        pass

    @limit_alphabet.setter
    def limit_alphabet(self, value):
        """Assign ``value`` to the ``limit_alphabet`` option."""
        pass

    @property
    def min_frequency(self):
        """Accessor for the ``min_frequency`` option described in ``Args``."""
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """Assign ``value`` to the ``min_frequency`` option."""
        pass

    @property
    def show_progress(self):
        """Accessor for the ``show_progress`` option described in ``Args``."""
        pass

    @show_progress.setter
    def show_progress(self, value):
        """Assign ``value`` to the ``show_progress`` option."""
        pass

    @property
    def special_tokens(self):
        """Accessor for the ``special_tokens`` option described in ``Args``."""
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """Assign ``value`` to the ``special_tokens`` option."""
        pass

    @property
    def vocab_size(self):
        """Accessor for the ``vocab_size`` option described in ``Args``."""
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """Assign ``value`` to the ``vocab_size`` option."""
        pass
|