# tokenization_rag.py
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RAG."""
  15. import os
  16. from ...utils import logging
  17. from .configuration_rag import RagConfig
  18. logger = logging.get_logger(__name__)
  19. class RagTokenizer:
  20. def __init__(self, question_encoder, generator):
  21. self.question_encoder = question_encoder
  22. self.generator = generator
  23. self.current_tokenizer = self.question_encoder
  24. def save_pretrained(self, save_directory):
  25. if os.path.isfile(save_directory):
  26. raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
  27. os.makedirs(save_directory, exist_ok=True)
  28. question_encoder_path = os.path.join(save_directory, "question_encoder_tokenizer")
  29. generator_path = os.path.join(save_directory, "generator_tokenizer")
  30. self.question_encoder.save_pretrained(question_encoder_path)
  31. self.generator.save_pretrained(generator_path)
  32. @classmethod
  33. def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
  34. # dynamically import AutoTokenizer
  35. from ..auto.tokenization_auto import AutoTokenizer
  36. config = kwargs.pop("config", None)
  37. if config is None:
  38. config = RagConfig.from_pretrained(pretrained_model_name_or_path)
  39. question_encoder = AutoTokenizer.from_pretrained(
  40. pretrained_model_name_or_path, config=config.question_encoder, subfolder="question_encoder_tokenizer"
  41. )
  42. generator = AutoTokenizer.from_pretrained(
  43. pretrained_model_name_or_path, config=config.generator, subfolder="generator_tokenizer"
  44. )
  45. return cls(question_encoder=question_encoder, generator=generator)
  46. def __call__(self, *args, **kwargs):
  47. return self.current_tokenizer(*args, **kwargs)
  48. def batch_decode(self, *args, **kwargs):
  49. return self.generator.batch_decode(*args, **kwargs)
  50. def decode(self, *args, **kwargs):
  51. return self.generator.decode(*args, **kwargs)
  52. def _switch_to_input_mode(self):
  53. self.current_tokenizer = self.question_encoder
  54. def _switch_to_target_mode(self):
  55. self.current_tokenizer = self.generator
  56. __all__ = ["RagTokenizer"]