# tokenization_qwen2.py (3.2 KB)
  1. # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tokenization classes for Qwen2."""
  15. from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
  16. from tokenizers.models import BPE
  17. from ...tokenization_utils_tokenizers import TokenizersBackend
  18. from ...utils import logging
# Module-level logger obtained from the package's logging helper, keyed by this module's name.
logger = logging.get_logger(__name__)
  20. VOCAB_FILES_NAMES = {
  21. "vocab_file": "vocab.json",
  22. "merges_file": "merges.txt",
  23. "tokenizer_file": "tokenizer.json",
  24. }
  25. MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
  26. PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
  27. class Qwen2Tokenizer(TokenizersBackend):
  28. vocab_files_names = VOCAB_FILES_NAMES
  29. model_input_names = ["input_ids", "attention_mask"]
  30. model = BPE
  31. def __init__(
  32. self,
  33. vocab: str | dict[str, int] | None = None,
  34. merges: str | list[str] | None = None,
  35. unk_token: str = "<|endoftext|>",
  36. bos_token=None,
  37. eos_token: str = "<|endoftext|>",
  38. pad_token: str = "<|endoftext|>",
  39. add_prefix_space=None,
  40. **kwargs,
  41. ):
  42. self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
  43. self._vocab = (
  44. vocab
  45. if vocab is not None
  46. else {
  47. "<|endoftext|>": 0,
  48. }
  49. )
  50. self._merges = merges or []
  51. self._tokenizer = Tokenizer(
  52. BPE(
  53. vocab=self._vocab,
  54. merges=self._merges,
  55. dropout=None,
  56. unk_token=None,
  57. continuing_subword_prefix="",
  58. end_of_word_suffix="",
  59. fuse_unk=False,
  60. byte_fallback=False,
  61. )
  62. )
  63. self._tokenizer.decoder = decoders.ByteLevel()
  64. self._tokenizer.normalizer = normalizers.NFC()
  65. self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
  66. [
  67. pre_tokenizers.Split(
  68. Regex(PRETOKENIZE_REGEX),
  69. behavior="isolated",
  70. invert=False,
  71. ),
  72. pre_tokenizers.ByteLevel(
  73. add_prefix_space=self.add_prefix_space,
  74. use_regex=False,
  75. ),
  76. ]
  77. )
  78. super().__init__(
  79. unk_token=unk_token,
  80. bos_token=bos_token,
  81. eos_token=eos_token,
  82. pad_token=pad_token,
  83. add_prefix_space=add_prefix_space,
  84. **kwargs,
  85. )
  86. self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
# Public names of this module: only the tokenizer class is exported.
__all__ = ["Qwen2Tokenizer"]