# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Xcodec model configuration"""

import copy
import math

import numpy as np
from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


@auto_docstring(checkpoint="Manel/X-Codec")
@strict
class XcodecConfig(PreTrainedConfig):
    r"""
    target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
        The range of different bandwidths (in kbps) the model can encode audio with.
    channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
        Expansion factors for the number of output channels in each semantic block.
    strides (`List[int]`, *optional*, defaults to `[1, 1]`):
        Strides for each semantic encoder block.
    block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
        Dilation factors for the residual units in semantic blocks.
    unit_kernel_size (`int`, *optional*, defaults to 3):
        Kernel size inside each ResidualUnit in semantic blocks.
    acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
        An instance of the configuration for the acoustic (DAC) model. A `dict` is merged over the
        Xcodec-specific DAC defaults and its `model_type` defaults to `"dac"`; when omitted, a DAC
        configuration built from those defaults is used.
    semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
        An instance of the configuration object for the semantic model. A `dict` without a `model_type`
        is treated as `"hubert"`; when omitted, a default HuBERT configuration is used.

    Example:

    ```python
    >>> from transformers import XcodecModel, XcodecConfig

    >>> # Initializing configuration
    >>> configuration = XcodecConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XcodecModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "xcodec"

    # Both sub-configurations are resolved through the auto-config machinery.
    sub_configs = {
        "acoustic_model_config": AutoConfig,
        "semantic_model_config": AutoConfig,
    }

    # Defaults applied to the acoustic sub-config; keys supplied by the user take precedence.
    _default_acoustic_model_config_kwargs = {
        "encoder_hidden_size": 64,
        # NOTE: original DAC uses [2, 4, 8, 8] `downsampling ratios`, namely reverse of `upsampling_ratios`
        # (not sure if intentional by Xcodec but we keep it)
        "downsampling_ratios": [8, 5, 4, 2],
        "decoder_hidden_size": 1024,
        "upsampling_ratios": [8, 5, 4, 2],
        "hidden_size": 256,
    }

    # The semantic sub-config has no Xcodec-specific overrides.
    _default_semantic_model_config_kwargs = {}

    target_bandwidths: list[int | float] | tuple[int | float, ...] = (0.5, 1, 1.5, 2, 4)
    sample_rate: int = 16000
    kernel_size: int = 3
    channel_ratios: list[int] | tuple[int, ...] = (1, 1)
    strides: list[int] | tuple[int, ...] = (1, 1)
    block_dilations: list[int] | tuple[int, ...] = (1, 1)
    unit_kernel_size: int = 3
    codebook_size: int = 1024
    codebook_dim: int | None = None
    initializer_range: float = 0.02
    acoustic_model_config: dict | PreTrainedConfig | None = None
    semantic_model_config: dict | PreTrainedConfig | None = None

    @staticmethod
    def _build_sub_config(config, default_model_type: str, default_kwargs: dict):
        """
        Resolve a sub-configuration given as `None`, a `dict`, or an already-built config object.

        Args:
            config: The user-supplied value of the sub-config field.
            default_model_type: Key into `CONFIG_MAPPING` used when `config` is `None` or is a `dict`
                that has no `"model_type"` entry.
            default_kwargs: Default keyword arguments; entries of a `dict` `config` override them.

        Returns:
            A config object built through `CONFIG_MAPPING`, or `config` itself when it is neither
            `None` nor a `dict`.
        """
        if config is None:
            # Deep-copy so the new config never shares the class-level default lists.
            return CONFIG_MAPPING[default_model_type](**copy.deepcopy(default_kwargs))
        if isinstance(config, dict):
            # Merge into a fresh dict: the caller's dict is left unmodified.
            merged = {**copy.deepcopy(default_kwargs), **config}
            merged.setdefault("model_type", default_model_type)
            return CONFIG_MAPPING[merged["model_type"]](**merged)
        return config

    def __post_init__(self, **kwargs):
        """Materialize both sub-configs and derive `codebook_dim` when it was not provided."""
        self.acoustic_model_config = self._build_sub_config(
            self.acoustic_model_config, "dac", self._default_acoustic_model_config_kwargs
        )
        self.semantic_model_config = self._build_sub_config(
            self.semantic_model_config, "hubert", self._default_semantic_model_config_kwargs
        )

        if self.codebook_dim is None:
            # Defaults to the combined acoustic + semantic hidden size.
            self.codebook_dim = self.hidden_size

        super().__post_init__(**kwargs)

    @property
    def frame_rate(self) -> int:
        """`sample_rate` divided by `hop_length`, rounded up."""
        return math.ceil(self.sample_rate / self.hop_length)

    @property
    def semantic_hidden_size(self) -> int:
        """Hidden size of the semantic sub-config."""
        return self.semantic_model_config.hidden_size

    @property
    def hop_length(self) -> int:
        """Product of the acoustic sub-config's `downsampling_ratios`."""
        return int(np.prod(self.acoustic_model_config.downsampling_ratios))

    @property
    def codebook_nbits(self) -> int:
        """Number of bits needed to index `codebook_size` entries (`ceil(log2(codebook_size))`)."""
        return math.ceil(math.log2(self.codebook_size))

    @property
    def hidden_size(self) -> int:
        """Sum of the acoustic and semantic sub-configs' hidden sizes."""
        return self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size

    @property
    def num_quantizers(self) -> int:
        """
        Quantizer count derived from the last entry of `target_bandwidths`: that bandwidth times 1000,
        floor-divided by `frame_rate * codebook_nbits`.
        """
        return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * self.codebook_nbits))

# Names exported by `from <module> import *`.
__all__ = ["XcodecConfig"]