quantizer_fbgemm_fp8.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import TYPE_CHECKING
  15. from .base import HfQuantizer
  16. if TYPE_CHECKING:
  17. from ..modeling_utils import PreTrainedModel
  18. from ..utils.quantization_config import FbgemmFp8Config
  19. from ..utils import (
  20. is_accelerate_available,
  21. is_fbgemm_gpu_available,
  22. is_kernels_available,
  23. is_torch_available,
  24. is_torch_cuda_available,
  25. is_torch_xpu_available,
  26. logging,
  27. )
  28. from .quantizers_utils import get_module_from_name
  29. if is_torch_available():
  30. import torch
  31. logger = logging.get_logger(__name__)
  class FbgemmFp8HfQuantizer(HfQuantizer):
      """
      FP8 quantization using fbgemm kernels.

      The quantizer checks the runtime environment, calls `replace_with_fbgemm_fp8_linear` on the model
      before the weights are loaded, and rewrites the `input_scale_ub` buffers of the FP8 modules once
      loading is finished.
      """

      requires_calibration = False

      quantization_config: "FbgemmFp8Config"

      def __init__(self, quantization_config, **kwargs):
          """Forward `quantization_config` and any extra keyword arguments to `HfQuantizer.__init__`."""
          super().__init__(quantization_config, **kwargs)

      def validate_environment(self, *args, **kwargs):
          """
          Check that the hardware and the installed packages can run fbgemm FP8 models.

          Args:
              *args: Unused.
              **kwargs: Only the `device_map` entry is read.

          Raises:
              ImportError: If neither CUDA nor XPU is available, if XPU is available without `kernels`, if CUDA
                  is available without `fbgemm-gpu`, or if `accelerate` is not installed.
              ValueError: If the CUDA device reports a compute capability major version below 9, or if the
                  model is quantized on the fly (`self.pre_quantized` is falsy) while `device_map` is a dict
                  containing `"cpu"` or `"disk"`.
          """
          has_cuda = is_torch_cuda_available()
          has_xpu = is_torch_xpu_available()

          if not has_cuda and not has_xpu:
              raise ImportError("Using fbgemm fp8 quantization requires a GPU or XPU")
          if has_xpu and not is_kernels_available():
              raise ImportError("Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)")
          if has_cuda and not is_fbgemm_gpu_available():
              # Adjacent string literals are joined verbatim, so the separator has to be spelled out.
              raise ImportError(
                  "Loading an FP8 fbgemm quantized model on CUDA requires fbgemm-gpu library. "
                  "Please install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
              )
          if not is_accelerate_available():
              raise ImportError(
                  "Loading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)"
              )
          if has_cuda:
              major, _ = torch.cuda.get_device_capability()
              if major < 9:
                  raise ValueError(
                      "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)"
                  )

          device_map = kwargs.get("device_map")
          if device_map is None:
              logger.warning_once(
                  "You have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set "
                  "your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu' or 'auto'. "
              )
          elif isinstance(device_map, dict):
              if not self.pre_quantized and ("cpu" in device_map.values() or "disk" in device_map.values()):
                  raise ValueError(
                      "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                      "This is not supported when the model is quantized on the fly. "
                      "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                  )

      def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
          """
          Force the dtype to `torch.bfloat16`.

          Args:
              dtype: The requested dtype.

          Returns:
              `torch.bfloat16`; a warning is logged once when `dtype` was anything else.
          """
          if dtype != torch.bfloat16:
              logger.warning_once(
                  f"Setting dtype to {dtype}, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16."
              )
              dtype = torch.bfloat16
          return dtype

      def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
          """
          Tell whether the parameter `param_name` has to be quantized while loading.

          Args:
              model: The model that owns the parameter.
              param_name: Name of the parameter, resolved with `get_module_from_name`.
              **kwargs: Unused.

          Returns:
              `True` only when the owning module is an `FbgemmFp8Linear` or `FbgemmFp8Llama4TextExperts`, the
              checkpoint is not pre-quantized, and the tensor is not a `bias`.
          """
          from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

          module, tensor_name = get_module_from_name(model, param_name)
          if not isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
              return False
          # Both FP8 module types follow the same rule: skip pre-quantized checkpoints and biases.
          return not (self.pre_quantized or tensor_name == "bias")

      def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
          """Return the element size (in bytes) for `param_name`."""
          if self.param_needs_quantization(model, param_name):
              # 8 bit: this is needed because, when `pre_quantized` is False, we don't set the dtype of the
              # FP8Linear in order to correctly load the weights.
              return 1
          return super().param_element_size(model, param_name, param)

      def _process_model_before_weight_loading(
          self,
          model: "PreTrainedModel",
          **kwargs,
      ):
          """
          Resolve the modules to keep unconverted and call `replace_with_fbgemm_fp8_linear` on `model`.

          Args:
              model: The model about to receive its weights. Its `_keep_in_fp32_modules` and `_tp_plan`
                  attributes are read.
              **kwargs: Unused.
          """
          from ..integrations import replace_with_fbgemm_fp8_linear

          self.modules_to_not_convert = self.get_modules_to_not_convert(
              model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
          )

          # The returned value was only ever rebound to the local name and never used afterwards, so it is
          # not kept. NOTE(review): this presumes the helper updates `model` in place - confirm.
          replace_with_fbgemm_fp8_linear(
              model,
              modules_to_not_convert=self.modules_to_not_convert,
              quantization_config=self.quantization_config,
              pre_quantized=self.pre_quantized,
              tp_plan=model._tp_plan,
          )

      def _process_model_after_weight_loading(self, model, **kwargs):
          """
          Force update the input scale upper bound after weight loading and device dispatch are complete.
          This resolves issues where persistent buffers are zeroed out or overwritten during the loading process.

          Args:
              model: The loaded model.
              **kwargs: Unused.

          Returns:
              The same `model` object.
          """
          from ..integrations.fbgemm_fp8 import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

          fp8_module_types = (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)
          upper_bound = self.quantization_config.activation_scale_ub
          for module in model.modules():
              if isinstance(module, fp8_module_types) and hasattr(module, "input_scale_ub"):
                  # The model is now on the target device, so we can use fill_ directly.
                  module.input_scale_ub.fill_(upper_bound)
          return model

      def update_tp_plan(self, config):
          """
          Install an fbgemm-specific tensor-parallel plan on Llama4 configs.

          Args:
              config: The model config. It is only modified when its class name contains `"Llama4"`.

          Returns:
              The same `config` object. For Llama4 configs, `base_model_tp_plan` is set on the text config
              returned by `config.get_text_config()`, or on `config` itself when that returns `None`.
          """
          if "Llama4" not in config.__class__.__name__:
              return config

          # NOTE(review): the comments below were written in terms of `local_colwise` / `local_rowwise` /
          # `local_packed_rowwise`, but the values use the plain `colwise` / `rowwise` / `packed_rowwise`
          # names - confirm which naming is intended.
          text_plan = {
              # We are using a different tp plan with local_colwise and local_rowwise for the attention because fbgemm operations cannot be parallelized
              # With local_colwise and local_rowwise, all the operations are done locally, and we add a gather operation to gather the results instead of
              # using dtensors
              "layers.*.self_attn.q_proj.weight": "colwise",
              "layers.*.self_attn.q_proj.weight_scale": "colwise",
              "layers.*.self_attn.k_proj.weight": "colwise",
              "layers.*.self_attn.k_proj.weight_scale": "colwise",
              "layers.*.self_attn.v_proj.weight": "colwise",
              "layers.*.self_attn.v_proj.weight_scale": "colwise",
              "layers.*.self_attn.o_proj.weight": "rowwise",
              # We keep the same sequence_parallel plan for layernorms
              "layers.*.input_layernorm.weight": "sequence_parallel",
              "layers.*.post_attention_layernorm.weight": "sequence_parallel",
              "norm.weight": "sequence_parallel",
              # We keep the same local_colwise and local_rowwise plan for the feed forward shared expert
              # We also add scales for the shared expert, for local_colwise the scale is also local_colwise
              # For local_rowwise the scale is replicated, so we don't need to add it
              "layers.*.feed_forward.shared_expert.gate_proj.weight": "colwise",
              "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "colwise",
              "layers.*.feed_forward.shared_expert.up_proj.weight": "colwise",
              "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "colwise",
              "layers.*.feed_forward.shared_expert.down_proj.weight": "rowwise",
              "layers.*.feed_forward.experts.*.gate_proj.weight": "colwise",
              "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "colwise",
              "layers.*.feed_forward.experts.*.up_proj.weight": "colwise",
              "layers.*.feed_forward.experts.*.up_proj.weight_scale": "colwise",
              "layers.*.feed_forward.experts.*.down_proj.weight": "rowwise",
              # For Fused implementation we use local_packed_rowwise for the gate_up_proj, and the same for the packed scales
              # We use local_colwise for the down_proj, and the scales are replicated so we don't add them
              "layers.*.feed_forward.experts.gate_up_proj": "packed_rowwise",
              "layers.*.feed_forward.experts.gate_up_proj_scale": "packed_rowwise",
              "layers.*.feed_forward.experts.down_proj": "colwise",
          }

          # Look the text config up once; fall back to the top-level config when there is none.
          text_config = config.get_text_config()
          target = config if text_config is None else text_config
          target.base_model_tp_plan = text_plan
          return config

      def is_serializable(self):
          """Return `True`: this quantizer always reports itself as serializable."""
          return True

      @property
      def is_trainable(self) -> bool:
          """Return `False`: this quantizer always reports itself as not trainable."""
          return False

      def get_quantize_ops(self):
          """Return an `FbgemmFp8Quantize` built around this quantizer."""
          from ..integrations.fbgemm_fp8 import FbgemmFp8Quantize

          return FbgemmFp8Quantize(self)