yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
							# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ..utils.logging import tqdm
from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel
    from ..utils.quantization_config import HiggsConfig

from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging
from ..utils.quantization_config import QuantizationConfigMixin


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class HiggsHfQuantizer(HfQuantizer):
    """
    Quantizer of the HIGGS method. Enables the loading of prequantized models and in-flight quantization of full-precision models.
    """

    requires_calibration = False
    quantization_config: "HiggsConfig"

    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, device_map, **kwargs):
        if not torch.cuda.is_available():
            raise NotImplementedError("HIGGS quantization is only supported on GPU. Please use a different quantizer.")

        if not is_accelerate_available():
            raise ImportError("Using `higgs` quantization requires Accelerate: `pip install accelerate`")

        if not is_flute_available():
            raise ImportError("Using `higgs` quantization requires FLUTE: `pip install flute-kernel>=0.3.0`")

        if not is_hadamard_available():
            raise ImportError(
                "Using `higgs` quantization requires fast_hadamard_transform: `pip install fast_hadamard_transform`"
            )

        if device_map is None:
            raise ValueError(
                "You are attempting to load a HIGGS model without setting device_map."
                " Please set device_map comprised of 'cuda' devices."
            )
        elif isinstance(device_map, dict):
            if "cpu" in device_map.values() or "disk" in device_map.values():
                raise ValueError(
                    "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device."
                    " This is not supported. Please remove the CPU or disk device from the device_map."
                )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype != torch.float16 and dtype != torch.bfloat16:
            raise ValueError(
                f"Invalid `dtype` {dtype}. HIGGS quantization only supports `dtype=torch.float16` or `dtype=torch.bfloat16`."
            )

        return dtype

    # TODO: to remove
    # Kept here in case we see some interest in adding support for it
    # def create_quantized_param(
    #     self,
    #     model: "PreTrainedModel",
    #     param_value: "torch.Tensor",
    #     param_name: str,
    #     target_device: "torch.device",
    #     **kwargs,
    # ):
    #     from ..integrations import quantize_with_higgs

    #     flute_dict = quantize_with_higgs(
    #         param_value.to(target_device),
    #         self.quantization_config.bits,
    #         self.quantization_config.p,
    #         self.quantization_config.group_size,
    #         self.quantization_config.hadamard_size,
    #     )
    #     del param_value

    #     module, _ = get_module_from_name(model, param_name)
    #     module_name = ".".join(param_name.split(".")[:-1])
    #     for key, value in flute_dict.items():
    #         if key in module._parameters:
    #             module._parameters[key] = torch.nn.Parameter(value, requires_grad=False)
    #         elif key in module._buffers:
    #             module._buffers[key] = torch.nn.Buffer(value)
    #         elif key == "tune_metadata":
    #             module.tune_metadata = value
    #             self.quantization_config.tune_metadata[module_name] = value.to_dict()
    #         else:
    #             raise ValueError(f"Unexpected key {key} in module {module}")

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        **kwargs,
    ):
        from ..integrations import replace_with_higgs_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
        )

        replace_with_higgs_linear(
            model,
            quantization_config=self.quantization_config,
            modules_to_not_convert=self.modules_to_not_convert,
        )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        from flute.tune import TuneMetaData, maybe_tune_and_repack
        from flute.utils import make_workspace_streamk

        from ..integrations import HiggsLinear

        flute_workspaces = {}
        flute_modules = {name: module for name, module in model.named_modules() if isinstance(module, HiggsLinear)}
        for name, module in tqdm(flute_modules.items(), desc="Repacking HIGGS modules", leave=False):
            # Every HiggsLinear needs a "workspace": a buffer for the unpacking operation.
            # This buffer needs to be on the same device as the weights, but can be reused across modules otherwise.
            if module.weight.device not in flute_workspaces:
                flute_workspaces[module.weight.device] = make_workspace_streamk(device=module.weight.device)
            module.workspace = flute_workspaces[module.weight.device]

            # FLUTE weights are packed in a way that is optimized for a specific number of SMs (GPU streaming multiprocessors).
            # If the model is loaded on a different device than the one it was saved on, we need to repack the weights.
            module.tune_metadata = TuneMetaData.from_dict(self.quantization_config.tune_metadata[name])
            module.weight.data, module.tune_metadata = maybe_tune_and_repack(
                weight=module.weight.data,
                scales=module.scales.data,
                metadata=module.tune_metadata,
            )
            self.quantization_config.tune_metadata[name] = module.tune_metadata.to_dict()

    @property
    def is_trainable(self) -> bool:
        return False

    def is_serializable(self):
        return True

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import HiggsLinear

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, HiggsLinear) and tensor_name == "weight":
            # Only quantize weights of HiggsLinear modules that are not already quantized
            return True
        else:
            return False

    def _dequantize(self, model):
        from ..integrations import dequantize_higgs

        model = dequantize_higgs(model)
        return model