tiktoken.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. from pathlib import Path
  2. from typing import Any
  3. from transformers.convert_slow_tokenizer import TikTokenConverter
  4. from transformers.tokenization_utils_tokenizers import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE
  5. def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
  6. """
  7. Converts given `tiktoken` encoding to `PretrainedTokenizerFast` and saves the configuration of converted tokenizer
  8. on disk.
  9. Args:
  10. encoding (`str` or `tiktoken.Encoding`):
  11. Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
  12. `tiktoken.get_encoding(encoding)`.
  13. output_dir (`str`):
  14. Save path for converted tokenizer configuration file.
  15. """
  16. output_dir = Path(output_dir)
  17. output_dir.mkdir(exist_ok=True)
  18. save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
  19. tokenizer_file = output_dir / TOKENIZER_FILE
  20. # Create parent directory for save_file
  21. save_file.parent.mkdir(parents=True, exist_ok=True)
  22. save_file_absolute = str(save_file.absolute())
  23. output_file_absolute = str(tokenizer_file.absolute())
  24. try:
  25. from tiktoken import get_encoding
  26. from tiktoken.load import dump_tiktoken_bpe
  27. if isinstance(encoding, str):
  28. encoding = get_encoding(encoding)
  29. dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
  30. except ImportError as e:
  31. error_msg = str(e)
  32. if "blobfile" in error_msg.lower():
  33. raise ValueError(
  34. "`blobfile` is required to save a `tiktoken` file. Install it with `pip install blobfile`."
  35. ) from e
  36. raise ValueError(
  37. "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`."
  38. ) from e
  39. tokenizer = TikTokenConverter(
  40. vocab_file=save_file_absolute, pattern=encoding._pat_str, extra_special_tokens=encoding._special_tokens
  41. ).converted()
  42. tokenizer.save(output_file_absolute)