tokenization_cpmant.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. # Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Tokenization classes for CPMAnt."""
  15. import collections
  16. import os
  17. from transformers.utils import is_rjieba_available, requires_backends
  18. if is_rjieba_available():
  19. import rjieba
  20. from ...tokenization_python import PreTrainedTokenizer
  21. from ...utils import logging
  22. logger = logging.get_logger(__name__)
  23. VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
  24. def load_vocab(vocab_file):
  25. """Loads a vocabulary file into a dictionary."""
  26. vocab = collections.OrderedDict()
  27. with open(vocab_file, "r", encoding="utf-8") as reader:
  28. tokens = reader.readlines()
  29. for index, token in enumerate(tokens):
  30. token = token.rstrip("\n")
  31. vocab[token] = index
  32. return vocab
  33. class WordpieceTokenizer:
  34. def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
  35. self.vocab = vocab
  36. self.unk_token = unk_token
  37. self.max_input_chars_per_word = max_input_chars_per_word
  38. def tokenize(self, token):
  39. chars = list(token)
  40. if len(chars) > self.max_input_chars_per_word:
  41. return [self.unk_token]
  42. start = 0
  43. sub_tokens = []
  44. while start < len(chars):
  45. end = len(chars)
  46. cur_substr = None
  47. while start < end:
  48. substr = "".join(chars[start:end])
  49. if substr in self.vocab:
  50. cur_substr = substr
  51. break
  52. end -= 1
  53. if cur_substr is None:
  54. sub_tokens.append(self.unk_token)
  55. start += 1
  56. else:
  57. sub_tokens.append(cur_substr)
  58. start = end
  59. return sub_tokens
  60. class CpmAntTokenizer(PreTrainedTokenizer):
  61. """
  62. Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.
  63. Args:
  64. vocab_file (`str`):
  65. Path to the vocabulary file.
  66. bod_token (`str`, *optional*, defaults to `"<d>"`):
  67. The beginning of document token.
  68. eod_token (`str`, *optional*, defaults to `"</d>"`):
  69. The end of document token.
  70. bos_token (`str`, *optional*, defaults to `"<s>"`):
  71. The beginning of sequence token.
  72. eos_token (`str`, *optional*, defaults to `"</s>"`):
  73. The end of sequence token.
  74. pad_token (`str`, *optional*, defaults to `"<pad>"`):
  75. The token used for padding.
  76. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  77. The unknown token.
  78. line_token (`str`, *optional*, defaults to `"</n>"`):
  79. The line token.
  80. space_token (`str`, *optional*, defaults to `"</_>"`):
  81. The space token.
  82. """
  83. vocab_files_names = VOCAB_FILES_NAMES
  84. model_input_names = ["input_ids", "attention_mask"]
  85. add_prefix_space = False
  86. def __init__(
  87. self,
  88. vocab_file,
  89. bod_token="<d>",
  90. eod_token="</d>",
  91. bos_token="<s>",
  92. eos_token="</s>",
  93. pad_token="<pad>",
  94. unk_token="<unk>",
  95. line_token="</n>",
  96. space_token="</_>",
  97. padding_side="left",
  98. **kwargs,
  99. ):
  100. requires_backends(self, ["rjieba"])
  101. self.bod_token = bod_token
  102. self.eod_token = eod_token
  103. self.encoder = load_vocab(vocab_file)
  104. self.encoder[" "] = self.encoder[space_token]
  105. self.encoder["\n"] = self.encoder[line_token]
  106. del self.encoder[space_token]
  107. del self.encoder[line_token]
  108. self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
  109. self.decoder = {v: k for k, v in self.encoder.items()}
  110. self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)
  111. super().__init__(
  112. bod_token=bod_token,
  113. eod_token=eod_token,
  114. bos_token=bos_token,
  115. eos_token=eos_token,
  116. pad_token=pad_token,
  117. unk_token=unk_token,
  118. line_token=line_token,
  119. space_token=space_token,
  120. padding_side=padding_side,
  121. token_type_ids_pattern="all_zeros",
  122. token_type_ids_include_special_tokens=True,
  123. special_tokens_pattern="bos",
  124. **kwargs,
  125. )
  126. for special_token in [space_token, line_token]:
  127. token_id = self.added_tokens_encoder.pop(special_token, None)
  128. if token_id is not None:
  129. self._added_tokens_decoder.pop(token_id, None)
  130. self._update_total_vocab_size()
  131. @property
  132. def bod_token_id(self):
  133. return self.encoder[self.bod_token]
  134. @property
  135. def eod_token_id(self):
  136. return self.encoder[self.eod_token]
  137. @property
  138. def newline_id(self):
  139. return self.encoder["\n"]
  140. @property
  141. def vocab_size(self) -> int:
  142. return len(self.encoder)
  143. def get_vocab(self):
  144. return dict(self.encoder, **self.added_tokens_encoder)
  145. def _tokenize(self, text):
  146. """Tokenize a string."""
  147. output_tokens = []
  148. for x in rjieba.cut(text, False):
  149. output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
  150. return output_tokens
  151. def _decode(self, token_ids, **kwargs):
  152. """Decode ids into a string."""
  153. token_ids = [i for i in token_ids if i >= 0]
  154. token_ids = [
  155. x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
  156. ]
  157. return super()._decode(token_ids, **kwargs)
  158. def check(self, token):
  159. return token in self.encoder
  160. def convert_tokens_to_string(self, tokens: list[str]) -> str:
  161. return "".join(tokens)
  162. def _convert_token_to_id(self, token):
  163. """Converts a token (str) in an id using the vocab."""
  164. return self.encoder.get(token, self.encoder.get(self.unk_token))
  165. def _convert_id_to_token(self, index):
  166. """Converts an index (integer) in a token (str) using the vocab."""
  167. return self.decoder.get(index, self.unk_token)
  168. def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
  169. if os.path.isdir(save_directory):
  170. vocab_file = os.path.join(
  171. save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
  172. )
  173. else:
  174. vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
  175. index = 0
  176. if " " in self.encoder:
  177. self.encoder["</_>"] = self.encoder[" "]
  178. del self.encoder[" "]
  179. if "\n" in self.encoder:
  180. self.encoder["</n>"] = self.encoder["\n"]
  181. del self.encoder["\n"]
  182. self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
  183. with open(vocab_file, "w", encoding="utf-8") as writer:
  184. for token, token_index in self.encoder.items():
  185. if index != token_index:
  186. logger.warning(
  187. f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
  188. " Please check that the vocabulary is not corrupted!"
  189. )
  190. index = token_index
  191. writer.write(token + "\n")
  192. index += 1
  193. return (vocab_file,)
  194. __all__ = ["CpmAntTokenizer"]