| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173 |
- # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """PyTorch Musicgen model."""
- import copy
- import inspect
- import math
- import random
- from collections.abc import Callable
- from dataclasses import dataclass
- from typing import TYPE_CHECKING, Any, Optional
- import torch
- import torch.nn as nn
- from torch.nn import CrossEntropyLoss
- from ... import initialization as init
- from ...activations import ACT2FN
- from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
- from ...generation import (
- ClassifierFreeGuidanceLogitsProcessor,
- GenerationConfig,
- GenerationMixin,
- GenerationMode,
- LogitsProcessorList,
- StoppingCriteriaList,
- )
- from ...masking_utils import create_bidirectional_mask, create_causal_mask
- from ...modeling_flash_attention_utils import (
- FlashAttentionKwargs,
- )
- from ...modeling_layers import GradientCheckpointingLayer
- from ...modeling_outputs import (
- BaseModelOutput,
- BaseModelOutputWithPastAndCrossAttentions,
- CausalLMOutputWithCrossAttentions,
- ModelOutput,
- Seq2SeqLMOutput,
- )
- from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
- from ...processing_utils import Unpack
- from ...utils import TransformersKwargs, auto_docstring, logging
- from ...utils.generic import can_return_tuple, merge_with_config_defaults
- from ...utils.output_capturing import OutputRecorder, capture_outputs
- from ..auto.configuration_auto import AutoConfig
- from ..auto.modeling_auto import AutoModel
- from .configuration_musicgen import MusicgenConfig, MusicgenDecoderConfig
- if TYPE_CHECKING:
- from ...generation.streamers import BaseStreamer
- logger = logging.get_logger(__name__)
@dataclass
@auto_docstring
class MusicgenUnconditionalInput(ModelOutput):
    r"""
    encoder_outputs (`tuple[torch.FloatTensor]` of length 1, with tensor shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the text encoder model.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Encoder attention mask to avoid performing attention on padding token indices. Mask values selected in `[0,
        1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**.
    guidance_scale (`float`, *optional*):
        Guidance scale for classifier free guidance, setting the balance between the conditional logits (predicted
        from the prompts) and the unconditional logits (predicted without prompts).
    """

    # Every field defaults to `None`, so the container can be built empty and filled in by the caller.
    encoder_outputs: tuple[torch.FloatTensor] | None = None
    attention_mask: torch.LongTensor | None = None
    guidance_scale: float | None = None
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift the audio codes one step to the right along the sequence axis.

    Args:
        input_ids: tensor whose dims 1 and 2 are swapped before shifting, so the result is laid out as
            `(bsz, num_codebooks, seq_len)`.
        pad_token_id: value substituted for every `-100` entry left in the shifted tensor.
        decoder_start_token_id: value written at sequence position 0 of every codebook.

    Returns:
        A new tensor in `(bsz, num_codebooks, seq_len)` layout holding the shifted codes.

    Raises:
        ValueError: if `decoder_start_token_id` or `pad_token_id` is `None`.
    """
    # (bsz, seq_len, num_codebooks) -> (bsz, num_codebooks, seq_len)
    codebook_major = input_ids.transpose(1, 2)

    shifted = codebook_major.new_zeros(codebook_major.shape)
    # positions 1..T-1 receive the tokens that sat at positions 0..T-2
    shifted[..., 1:].copy_(codebook_major[..., :-1])

    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    shifted[..., 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")

    # labels may carry the -100 "ignore" marker; it is not a valid token id, so swap it for padding
    ignore_positions = shifted == -100
    shifted.masked_fill_(ignore_positions, pad_token_id)
    return shifted
class MusicgenSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_positions = num_positions
        self.make_weights(num_positions, embedding_dim)

    def make_weights(self, num_embeddings: int, embedding_dim: int):
        """(Re)build the embedding table and register it as the non-persistent buffer `weights`."""
        emb_weights = self.get_embedding(num_embeddings, embedding_dim)
        if hasattr(self, "weights"):
            # when rebuilding, keep the dtype and device of the table that is being replaced
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int):
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".

        Returns a `(num_embeddings, embedding_dim)` tensor in the default dtype: the first half of each row holds
        cosines, the second half sines, plus one zero column when `embedding_dim` is odd.
        """
        half_dim = embedding_dim // 2
        # guard the denominator: `half_dim - 1` is 0 for embedding_dim in (2, 3), which used to raise
        # ZeroDivisionError. For half_dim >= 2 the value is unchanged.
        log_step = math.log(10000) / max(half_dim - 1, 1)
        inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -log_step)
        angles = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * inv_freq.unsqueeze(0)
        emb = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        """
        Return the positional embeddings for the positions covered by `input_ids`.

        Args:
            input_ids: 3-D tensor; only its last dimension (the sequence length) and its device are used.
            past_key_values_length: offset added to every position (number of already-processed steps).

        Returns:
            Tensor of shape `(seq_len, embedding_dim)` taken from the `weights` buffer.
        """
        _, _, seq_len = input_ids.size()
        # Create the position ids from the input token ids.
        position_ids = (torch.arange(seq_len) + past_key_values_length).to(input_ids.device)
        # expand embeddings if needed: the largest index requested is `past + seq_len - 1`, so the table must
        # cover `past + seq_len` rows. Comparing only `seq_len` let cached decoding index past the end.
        required_positions = seq_len + past_key_values_length
        if required_positions > self.weights.size(0):
            self.make_weights(int(required_positions), self.embedding_dim)
        return self.weights.index_select(0, position_ids.view(-1)).detach()
- # Copied from transformers.models.bert.modeling_bert.eager_attention_forward
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """
    Plain (non-fused) scaled dot-product attention.

    Args:
        module: only `module.training` is read, to decide whether dropout is active.
        query, key, value: tensors whose dims 2 and 3 of `key` are swapped for the score product.
        attention_mask: additive mask added to the raw scores, or `None` for no masking.
        scaling: score multiplier; defaults to `query.size(-1) ** -0.5`.
        dropout: dropout probability applied to the attention probabilities.

    Returns:
        `(attn_output, attn_weights)` where `attn_output` has dims 1 and 2 swapped and is contiguous.
    """
    scale = query.size(-1) ** -0.5 if scaling is None else scaling

    # raw attention scores: dot product between every query and every key
    scores = torch.matmul(query, key.transpose(2, 3)) * scale
    if attention_mask is not None:
        scores = scores + attention_mask

    probs = nn.functional.softmax(scores, dim=-1)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, value).transpose(1, 2).contiguous()
    return context, probs
class MusicgenAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    The same module serves as self-attention (when `key_value_states` is not passed to `forward`) and as
    cross-attention (when it is). Key/value projections can be stored in, and re-read from, a `Cache`.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float | None = 0.0,
        is_decoder: bool | None = False,
        bias: bool | None = True,
        is_causal: bool | None = False,
        config: MusicgenConfig | None = None,
        layer_idx: int | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        # the integer division above must be exact, otherwise heads would not tile the embedding
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        # index of this layer's slot inside the cache objects used in `forward`
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool | None = False,
        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """
        Input shape: Batch x Time x Channel

        Args:
            hidden_states: query source; also the key/value source when `key_value_states` is `None`.
            key_value_states: when given, keys and values are projected from it (cross-attention).
            past_key_values: optional cache that is read from and/or updated for `self.layer_idx`.
            attention_mask: forwarded unchanged to the selected attention implementation.
            output_attentions: forwarded unchanged to the selected attention implementation.

        Returns:
            `(attn_output, attn_weights)`: the output after `out_proj`, and whatever the attention
            implementation returned as weights (presumably `None` for some backends; confirm against the
            registered implementations).
        """

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        # determine input shapes
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # get query proj
        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        # pick the cache this call works on; `is_updated` is only ever read from an EncoderDecoderCache
        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
                    curr_past_key_values = past_key_values.cross_attention_cache
                else:
                    curr_past_key_values = past_key_values.self_attention_cache
            else:
                curr_past_key_values = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            # the key/value source may have a different sequence length than the queries
            kv_shape = (*current_states.shape[:-1], -1, self.head_dim)
            key_states = self.k_proj(current_states).view(kv_shape).transpose(1, 2)
            value_states = self.v_proj(current_states).view(kv_shape).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                key_states, value_states = curr_past_key_values.update(key_states, value_states, self.layer_idx)
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # look up the configured attention backend, falling back to the eager implementation in this file
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            output_attentions=output_attentions,
            **kwargs,
        )

        # merge the head dimension back into the channel dimension before the output projection
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
class MusicgenDecoderLayer(GradientCheckpointingLayer):
    """One decoder layer: self-attention, optional cross-attention and a feed-forward block, each applied
    to a layer-normalised input and added back to its residual."""

    def __init__(self, config: MusicgenDecoderConfig, layer_idx=None):
        super().__init__()
        self.embed_dim = config.hidden_size

        # settings shared by both attention modules of this layer
        attention_kwargs = {
            "embed_dim": self.embed_dim,
            "num_heads": config.num_attention_heads,
            "dropout": config.attention_dropout,
            "is_decoder": True,
            "bias": False,
            "config": config,
            "layer_idx": layer_idx,
        }

        self.self_attn = MusicgenAttention(is_causal=True, **attention_kwargs)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = MusicgenAttention(**attention_kwargs)
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=False)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def _dropout_add(self, branch_output: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        """Apply the layer's dropout to a sub-block output and add the residual."""
        branch_output = nn.functional.dropout(branch_output, p=self.dropout, training=self.training)
        return residual + branch_output

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`; the cross-attention
                block is skipped entirely when this is `None`.
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            use_cache (`bool`): accepted for interface compatibility; not read inside this layer.

        Returns:
            The layer output, same shape as `hidden_states`.
        """
        # --- self-attention ---
        skip = hidden_states
        normed = self.self_attn_layer_norm(hidden_states)
        attended, _ = self.self_attn(
            normed,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = self._dropout_add(attended, skip)

        # --- cross-attention (only when encoder states are supplied) ---
        if encoder_hidden_states is not None:
            skip = hidden_states
            normed = self.encoder_attn_layer_norm(hidden_states)
            attended, _ = self.encoder_attn(
                normed,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                **kwargs,
            )
            hidden_states = self._dropout_add(attended, skip)

        # --- feed-forward ---
        skip = hidden_states
        projected = self.activation_fn(self.fc1(self.final_layer_norm(hidden_states)))
        projected = nn.functional.dropout(projected, p=self.activation_dropout, training=self.training)
        projected = self.fc2(projected)
        return self._dropout_add(projected, skip)
@auto_docstring
class MusicgenPreTrainedModel(PreTrainedModel):
    config: MusicgenDecoderConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialise one submodule; linear and embedding weights use `config.initializer_factor` as std."""
        std = self.config.initializer_factor

        if isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                init.zeros_(module.bias)
            return

        if isinstance(module, nn.LayerNorm):
            init.ones_(module.weight)
            init.zeros_(module.bias)
            return

        if isinstance(module, nn.Embedding):
            init.normal_(module.weight, mean=0.0, std=std)
            # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
            already_initialized = getattr(module.weight, "_is_hf_initialized", False)
            if module.padding_idx is not None and not already_initialized:
                init.zeros_(module.weight[module.padding_idx])
            return

        if isinstance(module, MusicgenSinusoidalPositionalEmbedding):
            # the sinusoidal table is deterministic: recompute it and copy it into the buffer
            table = module.get_embedding(module.num_positions, module.embedding_dim)
            init.copy_(module.weights, table)
class MusicgenDecoder(MusicgenPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MusicgenDecoderLayer`]
    """

    _can_record_outputs = {
        "hidden_states": MusicgenDecoderLayer,
        "attentions": OutputRecorder(MusicgenAttention, index=1, layer_name="self_attn"),
        "cross_attentions": OutputRecorder(MusicgenAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: MusicgenDecoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.max_target_positions = config.max_position_embeddings
        self.d_model = config.hidden_size
        self.num_codebooks = config.num_codebooks
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        # one extra row on top of `vocab_size` in every codebook embedding table
        embed_dim = config.vocab_size + 1
        self.embed_tokens = nn.ModuleList(
            [nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]
        )

        self.embed_positions = MusicgenSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.hidden_size,
        )

        self.layers = nn.ModuleList(
            [MusicgenDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.attn_implementation = config._attn_implementation

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
            Indices of input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.

            Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
            such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

            <Tip warning={true}>

            The `input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
            target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
            you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
            frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
            target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
            `input_ids`.

            </Tip>
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            # (bsz * codebooks, seq_len) -> (bsz, codebooks, seq_len)
            codes = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1])
            # the positional embedding reads the sequence length from the last dim of a 3-D tensor
            position_reference = codes
        elif inputs_embeds is not None:
            # `inputs_embeds` is (bsz, seq_len, hidden). Build a (bsz, 1, seq_len) view so the positional
            # embedding sees the real sequence length. The previous `inputs_embeds[:, :, -1:]` had shape
            # (bsz, seq_len, 1), which was read as seq_len == 1 and gave every step the same position.
            codes = None
            position_reference = inputs_embeds[:, :, 0].unsqueeze(1)
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if inputs_embeds is None:
            # the embeddings of all codebooks are summed into a single sequence of hidden states
            inputs_embeds = sum(
                self.embed_tokens[codebook](codes[:, codebook]) for codebook in range(codes.shape[1])
            )

        attention_mask = create_causal_mask(
            config=self.config,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
        )
        encoder_attention_mask = create_bidirectional_mask(
            config=self.config,
            inputs_embeds=inputs_embeds,
            attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
        )

        # embed positions
        positions = self.embed_positions(position_reference, past_key_values_length)
        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for decoder_layer in self.layers:
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            hidden_states = decoder_layer(
                hidden_states,
                attention_mask,
                encoder_hidden_states,  # as a positional argument for gradient checkpointing
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                **kwargs,
            )

        hidden_states = self.layer_norm(hidden_states)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
@auto_docstring
class MusicgenModel(MusicgenPreTrainedModel):
    def __init__(self, config: MusicgenDecoderConfig):
        super().__init__(config)
        self.decoder = MusicgenDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's per-codebook token embedding modules."""
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's per-codebook token embedding modules with `value`."""
        self.decoder.embed_tokens = value

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
            Indices of input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.

            Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
            such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

            <Tip warning={true}>

            The `input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
            target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
            you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
            frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
            target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
            `input_ids`.

            </Tip>
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        """
        # the model is a thin shell around the decoder: hand every argument through unchanged
        return self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
- @auto_docstring(
- custom_intro="""
- The MusicGen decoder model with a language modelling head on top.
- """
- )
- class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin):
- output_modalities = ("audio",)
def __init__(self, config: MusicgenDecoderConfig):
    """Build the decoder backbone plus one language-modelling head per codebook.

    Args:
        config: Decoder configuration. `num_codebooks`, `hidden_size` and `vocab_size` are read here.
    """
    super().__init__(config)

    self.model = MusicgenModel(config)

    codebook_count = config.num_codebooks
    self.num_codebooks = codebook_count

    # one bias-free projection from hidden states to vocabulary logits for every codebook
    heads = []
    for _ in range(codebook_count):
        heads.append(nn.Linear(config.hidden_size, config.vocab_size, bias=False))
    self.lm_heads = nn.ModuleList(heads)

    # Initialize weights and apply final processing
    self.post_init()
def get_input_embeddings(self):
    """Return the `embed_tokens` object held by the wrapped decoder (`self.model.decoder`)."""
    decoder = self.model.decoder
    return decoder.embed_tokens
def set_input_embeddings(self, value):
    """Replace the `embed_tokens` object held by the wrapped decoder (`self.model.decoder`) with `value`."""
    decoder = self.model.decoder
    decoder.embed_tokens = value
def get_output_embeddings(self):
    """Return the per-codebook language-modelling heads stored in `self.lm_heads`."""
    heads = self.lm_heads
    return heads
def set_output_embeddings(self, new_embeddings):
    """Replace the language-modelling heads stored in `self.lm_heads` with `new_embeddings`."""
    setattr(self, "lm_heads", new_embeddings)
@merge_with_config_defaults
@capture_outputs
@auto_docstring
def forward(
    self,
    input_ids: torch.LongTensor | None = None,
    attention_mask: torch.Tensor | None = None,
    encoder_hidden_states: torch.FloatTensor | None = None,
    encoder_attention_mask: torch.LongTensor | None = None,
    past_key_values: Cache | None = None,
    inputs_embeds: torch.FloatTensor | None = None,
    labels: torch.LongTensor | None = None,
    use_cache: bool | None = None,
    **kwargs: Unpack[TransformersKwargs],
) -> tuple | CausalLMOutputWithCrossAttentions:
    r"""
    input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
        Indices of input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.

        Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
        such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.

        [What are input IDs?](../glossary#input-ids)

        <Tip warning={true}>

        The `input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
        target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
        you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
        frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
        target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
        `input_ids`.

        </Tip>
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
        the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`, *optional*):
        Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
        `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
        are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`.
    """
    # when only labels are supplied, derive the decoder inputs by shifting the labels to the right
    no_decoder_inputs = input_ids is None and inputs_embeds is None
    if labels is not None and no_decoder_inputs:
        input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.bos_token_id)

    decoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.model(
        input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        **kwargs,
    )
    sequence_output = decoder_outputs.last_hidden_state

    # every codebook has its own head; stack their logits along a new dim 1
    per_codebook_logits = [lm_head(sequence_output) for lm_head in self.lm_heads]
    lm_logits = torch.stack(per_codebook_logits, dim=1)

    loss = None
    if labels is not None:
        # since encoder hidden states have been concatenated to the decoder hidden states,
        # we take the last timestamps corresponding to labels
        label_length = labels.shape[1]
        trimmed_logits = lm_logits[:, :, -label_length:]
        vocab_dim = trimmed_logits.shape[-1]

        criterion = CrossEntropyLoss()
        loss = torch.zeros([], device=self.device)

        # pad positions are replaced by -100 so that they are ignored by the criterion
        targets = labels.masked_fill(labels == self.config.pad_token_id, -100)

        # per codebook cross-entropy, averaged over the codebooks
        # ref: https://github.com/facebookresearch/audiocraft/blob/69fea8b290ad1b4b40d28f92d1dfc0ab01dbab85/audiocraft/solvers/musicgen.py#L242-L243
        num_codebooks = self.config.num_codebooks
        for codebook_idx in range(num_codebooks):
            flat_logits = trimmed_logits[:, codebook_idx].contiguous().view(-1, vocab_dim)
            flat_targets = targets[..., codebook_idx].contiguous().view(-1)
            loss += criterion(flat_logits, flat_targets)
        loss = loss / num_codebooks

    # (bsz, num_codebooks, seq_len, vocab_size) -> (bsz * num_codebooks, seq_len, vocab_size)
    trailing_dims = lm_logits.shape[2:]
    lm_logits = lm_logits.reshape(-1, *trailing_dims)

    return CausalLMOutputWithCrossAttentions(
        loss=loss,
        logits=lm_logits,
        past_key_values=decoder_outputs.past_key_values,
        hidden_states=decoder_outputs.hidden_states,
        attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
    )
def prepare_inputs_for_generation(
    self,
    input_ids,
    attention_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=True,
    delay_pattern_mask=None,
    guidance_scale=None,
    **kwargs,
):
    """Assemble the keyword arguments for one decoding step.

    The delay pattern mask is built when it is not supplied and then applied to `input_ids`. When
    `guidance_scale` is greater than 1, `input_ids` (and `attention_mask`, if given) are repeated twice
    along dim 0. When `past_key_values` is given, only the last position of `input_ids` is kept.

    Returns:
        A dict with the keys `input_ids`, `attention_mask`, `encoder_hidden_states`,
        `encoder_attention_mask`, `past_key_values` and `use_cache`.
    """
    # Overwritten -- MusicGen has custom processing
    if delay_pattern_mask is None:
        default_generation_config = self.generation_config
        input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
            input_ids,
            pad_token_id=default_generation_config.pad_token_id,
            max_length=default_generation_config.max_length,
        )

    # apply the delay pattern mask
    input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)

    # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these
    # before sampling)
    uses_guidance = guidance_scale is not None and guidance_scale > 1
    if uses_guidance:
        input_ids = input_ids.repeat((2, 1))
        if attention_mask is not None:
            attention_mask = attention_mask.repeat((2, 1))

    # with a cache present only the newest position is passed on
    if past_key_values is not None:
        input_ids = input_ids[:, -1:]

    return dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
    )
- def build_delay_pattern_mask(self, input_ids: torch.LongTensor, pad_token_id: int, max_length: int | None = None):
- """Build a delayed pattern mask to the input_ids. Each codebook is offset by the previous codebook by
- one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there
- are 4 codebooks and a max sequence length of 8, we have the delayed pattern mask of shape `(codebooks,
- seq_len)`:
- - [P, -1, -1, -1, -1, P, P, P]
- - [P, P, -1, -1, -1, -1, P, P]
- - [P, P, P, -1, -1, -1, -1, P]
- - [P, P, P, P, -1, -1, -1, -1]
- where P is the special padding token id and -1 indicates that the token is valid for prediction. If we include
- a prompt (decoder input ids), the -1 positions indicate where new tokens should be predicted. Otherwise, the
- mask is set to the value in the prompt:
- - [P, a, b, -1, -1, P, P, P]
- - [P, P, c, d, -1, -1, P, P]
- - [P, P, P, e, f, -1, -1, P]
- - [P, P, P, P, g, h, -1, -1]
- where a-h indicate the input prompt (decoder input ids) that are offset by 1. Now, we only override the -1
- tokens in our prediction.
- """
- # (bsz * num_codebooks, seq_len) -> (bsz, num_codebooks, seq_len)
- input_ids = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1])
- bsz, num_codebooks, seq_len = input_ids.shape
- max_length = max_length if max_length is not None else self.generation_config.max_length
- input_ids_shifted = (
- torch.ones((bsz, num_codebooks, max_length), dtype=torch.long, device=input_ids.device) * -1
- )
- channel_codebooks = num_codebooks // 2 if self.config.audio_channels == 2 else num_codebooks
- # we only apply the mask if we have a large enough seq len - otherwise we return as is
- if max_length < 2 * channel_codebooks - 1:
- return input_ids.reshape(bsz * num_codebooks, -1), input_ids_shifted.reshape(bsz * num_codebooks, -1)
- # fill the shifted ids with the prompt entries, offset by the codebook idx
- for codebook in range(channel_codebooks):
- if self.config.audio_channels == 1:
- # mono channel - loop over the codebooks one-by-one
- input_ids_shifted[:, codebook, codebook : seq_len + codebook] = input_ids[:, codebook]
- else:
- # left/right channels are interleaved in the generated codebooks, so handle one then the other
- input_ids_shifted[:, 2 * codebook, codebook : seq_len + codebook] = input_ids[:, 2 * codebook]
- input_ids_shifted[:, 2 * codebook + 1, codebook : seq_len + codebook] = input_ids[:, 2 * codebook + 1]
- # construct a pattern mask that indicates the positions of padding tokens for each codebook
- # first fill the upper triangular part (the EOS padding)
- delay_pattern = torch.triu(
- torch.ones((channel_codebooks, max_length), dtype=torch.bool), diagonal=max_length - channel_codebooks + 1
- )
- # then fill the lower triangular part (the BOS padding)
- delay_pattern = delay_pattern + torch.tril(torch.ones((channel_codebooks, max_length), dtype=torch.bool))
- if self.config.audio_channels == 2:
- # for left/right channel we need to duplicate every row of the pattern mask in an interleaved fashion
- delay_pattern = delay_pattern.repeat_interleave(2, dim=0)
- mask = ~delay_pattern.to(input_ids.device)
- input_ids = mask * input_ids_shifted + ~mask * pad_token_id
- # find the first position to start generating - this is the first place we have the -1 token
- # and will always be in the first codebook (since it has no codebook offset)
- first_codebook_ids = input_ids[:, 0, :]
- start_ids = (first_codebook_ids == -1).nonzero()[:, 1]
- if len(start_ids) > 0:
- first_start_id = min(start_ids)
- else:
- # we have no tokens that need to be filled - return entire matrix of input ids
- first_start_id = seq_len
- # (bsz * num_codebooks, seq_len) -> (bsz, num_codebooks, seq_len)
- pattern_mask = input_ids.reshape(bsz * num_codebooks, -1)
- input_ids = input_ids[..., :first_start_id].reshape(bsz * num_codebooks, -1)
- return input_ids, pattern_mask
- @staticmethod
- def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
- """Apply a delay pattern mask to the decoder input ids, only preserving predictions where
- the mask is set to -1, and otherwise setting to the value detailed in the mask."""
- seq_len = input_ids.shape[-1]
- decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
- input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
- return input_ids
@torch.no_grad()
def generate(
    self,
    inputs: torch.Tensor | None = None,
    generation_config: GenerationConfig | None = None,
    logits_processor: LogitsProcessorList | None = None,
    stopping_criteria: StoppingCriteriaList | None = None,
    synced_gpus: bool | None = None,
    streamer: Optional["BaseStreamer"] = None,
    **kwargs,
):
    """

    Generates sequences of token ids for models with a language modeling head.

    <Tip warning={true}>

    Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
    model's default generation configuration. You can override any `generation_config` by passing the corresponding
    parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

    For an overview of generation strategies and code examples, check out the [following
    guide](./generation_strategies).

    </Tip>

    Parameters:
        inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
            The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
            method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
            should be in the format `input_ids`. For encoder-decoder models *inputs* can represent any of
            `input_ids`, `input_values`, `input_features`, or `pixel_values`.
        generation_config (`~generation.GenerationConfig`, *optional*):
            The generation configuration to be used as base parametrization for the generation call. `**kwargs`
            passed to generate matching the attributes of `generation_config` will override them. If
            `generation_config` is not provided, the default will be used, which had the following loading
            priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
            configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
            default values, whose documentation should be checked to parameterize generation.
        logits_processor (`LogitsProcessorList`, *optional*):
            Custom logits processors that complement the default logits processors built from arguments and
            generation config. If a logit processor is passed that is already created with the arguments or a
            generation config an error is thrown. This feature is intended for advanced users.
        stopping_criteria (`StoppingCriteriaList`, *optional*):
            Custom stopping criteria that complement the default stopping criteria built from arguments and a
            generation config. If a stopping criteria is passed that is already created with the arguments or a
            generation config an error is thrown. This feature is intended for advanced users.
        synced_gpus (`bool`, *optional*, defaults to `False`):
            Whether to continue running the while loop until max_length (needed to avoid deadlocking with
            `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
        streamer (`BaseStreamer`, *optional*):
            Streamer object that will be used to stream the generated sequences. Generated tokens are passed
            through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
        kwargs (`dict[str, Any]`, *optional*):
            Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
            forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
            specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

    Return:
        [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
        or when `config.return_dict_in_generate=True`) or a `torch.LongTensor` of token ids reshaped to
        `(batch_size, num_codebooks, sequence_length)`.

            If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
            [`~utils.ModelOutput`] types are:

                - [`~generation.GenerateDecoderOnlyOutput`],
                - [`~generation.GenerateBeamDecoderOnlyOutput`]

            If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
            [`~utils.ModelOutput`] types are:

                - [`~generation.GenerateEncoderDecoderOutput`],
                - [`~generation.GenerateBeamEncoderDecoderOutput`]

    Raises:
        ValueError: If the resolved generation mode is neither greedy search nor sampling.
    """
    # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
    if generation_config is None:
        generation_config = self.generation_config

    # work on a copy so that neither the caller's nor the model's config object is mutated
    generation_config = copy.deepcopy(generation_config)
    model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
    generation_config.validate()
    self._validate_model_kwargs(model_kwargs.copy())

    # 2. Set generation parameters if not already defined
    logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

    requires_attention_mask = "encoder_outputs" not in model_kwargs
    kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None

    # 3. Define model inputs
    input_ids, model_input_name, model_kwargs = self._prepare_model_inputs(
        inputs, generation_config.bos_token_id, model_kwargs
    )
    # the leading dim of `input_ids` is treated as batch_size * num_codebooks
    batch_size = input_ids.shape[0] // self.num_codebooks
    self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=input_ids.device)

    # 4. Define other model kwargs
    model_kwargs["use_cache"] = generation_config.use_cache
    model_kwargs["guidance_scale"] = generation_config.guidance_scale

    if model_kwargs.get("attention_mask", None) is None and requires_attention_mask:
        model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
            input_ids, generation_config, model_kwargs
        )

    # 5. Prepare `max_length` depending on other stopping criteria.
    input_ids_length = input_ids.shape[-1]
    has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
    has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
    generation_config = self._prepare_generated_length(
        generation_config=generation_config,
        has_default_max_length=has_default_max_length,
        has_default_min_length=has_default_min_length,
        model_input_name=model_input_name,
        inputs_tensor=input_ids,
        input_ids_length=input_ids_length,
    )

    self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)

    # 6. Prepare the cache.
    # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
    # - different models have a different cache name expected by the model (default = "past_key_values")
    # - `max_length`, prepared above, is used to determine the maximum cache length
    max_cache_length = generation_config.max_length - 1
    # `input_ids_length` is a plain int, so the tensor itself has to be queried for its dim-1 size
    if (
        input_ids.shape[1] != input_ids_length
        and model_input_name == "inputs_embeds"
        and not self.config.is_encoder_decoder
    ):
        max_cache_length += input_ids.shape[1]
    self._prepare_cache_for_generation(
        generation_config,
        model_kwargs,
        generation_mode=None,
        batch_size=batch_size,
        max_cache_length=max_cache_length,
    )

    # 7. Prepare `input_ids` which will be used for auto-regressive generation
    # Build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
    input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
        input_ids,
        pad_token_id=generation_config._decoder_start_token_tensor,
        max_length=generation_config.max_length,
    )

    if streamer is not None:
        streamer.put(input_ids.cpu())

    # stash the delay mask so that we don't have to recompute it in each forward pass
    model_kwargs["delay_pattern_mask"] = delay_pattern_mask

    # 8. determine generation mode
    generation_mode = generation_config.get_generation_mode()

    # 9. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
    if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
        logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
        # guidance is now carried by the processor above; `model_kwargs["guidance_scale"]` keeps the value
        generation_config.guidance_scale = None

    # 10. prepare distribution pre_processing samplers
    logits_processor = self._get_logits_processor(
        generation_config=generation_config,
        input_ids_seq_length=input_ids_length,
        encoder_input_ids=input_ids,
        prefix_allowed_tokens_fn=None,
        logits_processor=logits_processor,
        device=input_ids.device,
    )

    # 11. prepare stopping criteria
    stopping_criteria = self._get_stopping_criteria(
        generation_config=generation_config, stopping_criteria=stopping_criteria
    )

    if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
        # expand input_ids with `num_return_sequences` additional sequences per batch
        input_ids, model_kwargs = self._expand_inputs_for_generation(
            input_ids=input_ids,
            expand_size=generation_config.num_return_sequences,
            **model_kwargs,
        )

        # 12. run sample
        outputs = self._sample(
            input_ids,
            logits_processor=logits_processor,
            stopping_criteria=stopping_criteria,
            generation_config=generation_config,
            synced_gpus=synced_gpus,
            streamer=streamer,
            **model_kwargs,
        )

    else:
        raise ValueError(
            "Got incompatible mode for generation, should be one of greedy or sampling. "
            "Ensure that beam search is de-activated by setting `num_beams=1`."
        )

    if generation_config.return_dict_in_generate:
        output_ids = outputs.sequences
    else:
        output_ids = outputs

    # apply the pattern mask to the final ids
    output_ids = self.apply_delay_pattern_mask(output_ids, model_kwargs["delay_pattern_mask"])

    # revert the pattern delay mask by filtering the pad token id
    output_ids = output_ids[output_ids != generation_config._pad_token_tensor].reshape(
        batch_size, self.num_codebooks, -1
    )

    if generation_config.return_dict_in_generate:
        outputs.sequences = output_ids
        return outputs
    else:
        return output_ids
- @auto_docstring(
- custom_intro="""
- The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder,
- """
- )
- class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin):
- config: MusicgenConfig
- output_modalities = ("audio",)
- base_model_prefix = "encoder_decoder"
- main_input_name = "input_ids"
- supports_gradient_checkpointing = True
def __init__(
    self,
    config: MusicgenConfig | None = None,
    text_encoder: PreTrainedModel | None = None,
    audio_encoder: PreTrainedModel | None = None,
    decoder: MusicgenForCausalLM | None = None,
):
    r"""
    text_encoder (`PreTrainedModel`, *optional*):
        The text encoder model that encodes text into hidden states for conditioning.
    audio_encoder (`PreTrainedModel`, *optional*):
        The audio encoder model that encodes audio into hidden states for conditioning.
    decoder (`MusicgenForCausalLM`, *optional*):
        The decoder model that generates audio tokens based on conditioning signals.
    """
    if config is None:
        # without a config, all three sub-models are needed to derive one
        if text_encoder is None or audio_encoder is None or decoder is None:
            raise ValueError(
                "Either a configuration has to be provided, or all three of text encoder, audio encoder and MusicGen decoder."
            )
        config = MusicgenConfig(
            text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config
        )
    elif not isinstance(config, self.config_class):
        raise ValueError(f"Config: {config} has to be of type {self.config_class}")

    # an explicit cross-attention width must agree with the text encoder width
    explicit_cross_size = config.decoder.cross_attention_hidden_size
    if explicit_cross_size is not None and explicit_cross_size != config.text_encoder.hidden_size:
        raise ValueError(
            "If `cross_attention_hidden_size` is specified in the MusicGen decoder's configuration, it has to be equal"
            f" to the text encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
            f" `config.decoder.cross_attention_hidden_size` and {config.text_encoder.hidden_size} for"
            " `config.text_encoder.hidden_size`."
        )

    # initialize with config
    super().__init__(config)

    # any sub-model that was not passed in is instantiated from its sub-config
    if text_encoder is None:
        from ..auto.modeling_auto import AutoModelForTextEncoding

        text_encoder = AutoModelForTextEncoding.from_config(config.text_encoder)

    if audio_encoder is None:
        from ..auto.modeling_auto import AutoModel

        audio_encoder = AutoModel.from_config(config.audio_encoder)

    if decoder is None:
        decoder = MusicgenForCausalLM._from_config(config.decoder)

    self.text_encoder = text_encoder
    self.audio_encoder = audio_encoder
    self.decoder = decoder

    # warn whenever a sub-model's own config differs from the matching part of the shared config
    shared_config = self.config
    if self.text_encoder.config.to_dict() != shared_config.text_encoder.to_dict():
        logger.warning(
            f"Config of the text_encoder: {self.text_encoder.__class__} is overwritten by shared text_encoder config:"
            f" {self.config.text_encoder}"
        )
    if self.audio_encoder.config.to_dict() != shared_config.audio_encoder.to_dict():
        logger.warning(
            f"Config of the audio_encoder: {self.audio_encoder.__class__} is overwritten by shared audio_encoder config:"
            f" {self.config.audio_encoder}"
        )
    if self.decoder.config.to_dict() != shared_config.decoder.to_dict():
        logger.warning(
            f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
            f" {self.config.decoder}"
        )

    # make sure that the individual model's config refers to the shared config
    # so that the updates to the config will be synced
    shared_config.text_encoder._attn_implementation = self.text_encoder.config._attn_implementation
    shared_config.audio_encoder._attn_implementation = self.audio_encoder.config._attn_implementation
    shared_config.decoder._attn_implementation = self.decoder.config._attn_implementation
    self.text_encoder.config = shared_config.text_encoder
    self.audio_encoder.config = shared_config.audio_encoder
    self.decoder.config = shared_config.decoder

    # text encoder outputs might need to be projected to different dimension for decoder
    encoder_width = self.text_encoder.config.hidden_size
    decoder_width = self.decoder.config.hidden_size
    widths_differ = encoder_width != decoder_width
    if widths_differ and self.decoder.config.cross_attention_hidden_size is None:
        self.enc_to_dec_proj = nn.Linear(self.text_encoder.config.hidden_size, self.decoder.config.hidden_size)

    if self.text_encoder.get_output_embeddings() is not None:
        raise ValueError(
            f"The encoder {self.text_encoder} should not have a LM Head. Please use a model without and LM Head"
        )

    # the decoder's forward must accept `encoder_hidden_states`
    decoder_forward_params = inspect.signature(self.decoder.forward).parameters
    if "encoder_hidden_states" not in decoder_forward_params:
        raise ValueError(
            "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
            "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
        )

    # tie text encoder, decoder weights if config set accordingly
    self.post_init()
def get_input_embeddings(self):
    """Return whatever the text encoder's own `get_input_embeddings()` returns."""
    text_encoder = self.text_encoder
    return text_encoder.get_input_embeddings()
def get_output_embeddings(self):
    """Return whatever the decoder's own `get_output_embeddings()` returns."""
    decoder = self.decoder
    return decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
    """Forward `new_embeddings` to the decoder's `set_output_embeddings` and return its result."""
    decoder = self.decoder
    result = decoder.set_output_embeddings(new_embeddings)
    return result
@classmethod
def from_sub_models_pretrained(
    cls,
    text_encoder_pretrained_model_name_or_path: str | None = None,
    audio_encoder_pretrained_model_name_or_path: str | None = None,
    decoder_pretrained_model_name_or_path: str | None = None,
    *model_args,
    **kwargs,
) -> PreTrainedModel:
    r"""
    Instantiate a text encoder, an audio encoder, and a MusicGen decoder from one, two or three base classes of the
    library from pretrained model checkpoints.


    The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
    the model, you need to first set it back in training mode with `model.train()`.

    Params:
        text_encoder_pretrained_model_name_or_path (`str`, *optional*):
            Information necessary to initiate the text encoder. Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

        audio_encoder_pretrained_model_name_or_path (`str`, *optional*):
            Information necessary to initiate the audio encoder. Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

        decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
            Information necessary to initiate the decoder. Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

        model_args (remaining positional arguments, *optional*):
            All remaining positional arguments will be passed to the underlying model's `__init__` method.

        kwargs (remaining dictionary of keyword arguments, *optional*):
            Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
            `output_attentions=True`).

            - To update the text encoder configuration, use the prefix *text_encoder_* for each configuration
              parameter.
            - To update the audio encoder configuration, use the prefix *audio_encoder_* for each configuration
              parameter.
            - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
            - To update the parent model configuration, do not use a prefix for each configuration parameter.

            Behaves differently depending on whether a `config` is provided or automatically loaded.

    Example:

    ```python
    >>> from transformers import MusicgenForConditionalGeneration

    >>> # initialize a musicgen model from a t5 text encoder, encodec audio encoder, and musicgen decoder
    >>> model = MusicgenForConditionalGeneration.from_sub_models_pretrained(
    ...     text_encoder_pretrained_model_name_or_path="google-t5/t5-base",
    ...     audio_encoder_pretrained_model_name_or_path="facebook/encodec_24khz",
    ...     decoder_pretrained_model_name_or_path="facebook/musicgen-small",
    ... )
    >>> # saving model after fine-tuning
    >>> model.save_pretrained("./musicgen-ft")
    >>> # load fine-tuned model
    >>> model = MusicgenForConditionalGeneration.from_pretrained("./musicgen-ft")
    ```"""
    text_prefix = "text_encoder_"
    audio_prefix = "audio_encoder_"
    decoder_prefix = "decoder_"

    # move every prefixed kwarg out of `kwargs` into the dict of the sub-model it addresses,
    # with the prefix stripped; whatever stays in `kwargs` goes to the composite config
    kwargs_text_encoder = {}
    kwargs_audio_encoder = {}
    kwargs_decoder = {}
    for name in list(kwargs):
        if name.startswith(text_prefix):
            kwargs_text_encoder[name[len(text_prefix) :]] = kwargs.pop(name)
        elif name.startswith(audio_prefix):
            kwargs_audio_encoder[name[len(audio_prefix) :]] = kwargs.pop(name)
        elif name.startswith(decoder_prefix):
            kwargs_decoder[name[len(decoder_prefix) :]] = kwargs.pop(name)

    # Load and initialize the encoder and decoder
    # The distinction between encoder and decoder at the model level is made
    # by the value of the flag `is_decoder` that we need to set correctly.

    # --- text encoder: an instance passed as `text_encoder_model` wins over loading from a checkpoint
    text_encoder = kwargs_text_encoder.pop("model", None)
    if text_encoder is None:
        if text_encoder_pretrained_model_name_or_path is None:
            raise ValueError(
                "If `text_encoder_model` is not defined as an argument, a `text_encoder_pretrained_model_name_or_path` has "
                "to be defined."
            )

        if "config" not in kwargs_text_encoder:
            text_config, kwargs_text_encoder = AutoConfig.from_pretrained(
                text_encoder_pretrained_model_name_or_path, **kwargs_text_encoder, return_unused_kwargs=True
            )

            configured_as_decoder = text_config.is_decoder is True or text_config.add_cross_attention is True
            if configured_as_decoder:
                logger.info(
                    f"Initializing {text_encoder_pretrained_model_name_or_path} as a text_encoder model "
                    "from a decoder model. Cross-attention and causal mask are disabled."
                )
                text_config.is_decoder = False
                text_config.add_cross_attention = False

            kwargs_text_encoder["config"] = text_config

        text_encoder = AutoModel.from_pretrained(
            text_encoder_pretrained_model_name_or_path, *model_args, **kwargs_text_encoder
        )

    # --- audio encoder: same procedure as for the text encoder
    audio_encoder = kwargs_audio_encoder.pop("model", None)
    if audio_encoder is None:
        if audio_encoder_pretrained_model_name_or_path is None:
            raise ValueError(
                "If `audio_encoder_model` is not defined as an argument, an `audio_encoder_pretrained_model_name_or_path` has "
                "to be defined."
            )

        if "config" not in kwargs_audio_encoder:
            audio_config, kwargs_audio_encoder = AutoConfig.from_pretrained(
                audio_encoder_pretrained_model_name_or_path, **kwargs_audio_encoder, return_unused_kwargs=True
            )

            configured_as_decoder = audio_config.is_decoder is True or audio_config.add_cross_attention is True
            if configured_as_decoder:
                logger.info(
                    f"Initializing {audio_encoder_pretrained_model_name_or_path} as an audio_encoder model "
                    "from a decoder model. Cross-attention and causal mask are disabled."
                )
                audio_config.is_decoder = False
                audio_config.add_cross_attention = False

            kwargs_audio_encoder["config"] = audio_config

        audio_encoder = AutoModel.from_pretrained(
            audio_encoder_pretrained_model_name_or_path, *model_args, **kwargs_audio_encoder
        )

    # --- decoder
    decoder = kwargs_decoder.pop("model", None)
    if decoder is None:
        if decoder_pretrained_model_name_or_path is None:
            raise ValueError(
                "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
                "to be defined."
            )

        if "config" not in kwargs_decoder:
            loaded_config, kwargs_decoder = AutoConfig.from_pretrained(
                decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
            )

            # a composite MusicGen config was loaded: only its decoder part is used here
            if isinstance(loaded_config, MusicgenConfig):
                loaded_config = loaded_config.decoder

            lacks_decoder_flags = loaded_config.is_decoder is False or loaded_config.add_cross_attention is False
            if lacks_decoder_flags:
                logger.info(
                    f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
                    f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
                    f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
                )
                loaded_config.is_decoder = True
                loaded_config.add_cross_attention = True

            kwargs_decoder["config"] = loaded_config

        # a config supplied by the caller is used as is; only warn when it does not describe a decoder
        final_decoder_config = kwargs_decoder["config"]
        if final_decoder_config.is_decoder is False or final_decoder_config.add_cross_attention is False:
            logger.warning(
                f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
                f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
                "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
                "passed to `.from_sub_models_pretrained(...)` are set to `True` or do not pass a "
                "`decoder_config` to `.from_sub_models_pretrained(...)`"
            )

        decoder = MusicgenForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)

    # instantiate config with corresponding kwargs
    composite_config = MusicgenConfig(
        text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config, **kwargs
    )
    return cls(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder, config=composite_config)
@can_return_tuple
@auto_docstring
def forward(
    self,
    input_ids: torch.LongTensor | None = None,
    attention_mask: torch.BoolTensor | None = None,
    input_values: torch.FloatTensor | None = None,
    padding_mask: torch.BoolTensor | None = None,
    decoder_input_ids: torch.LongTensor | None = None,
    decoder_attention_mask: torch.BoolTensor | None = None,
    encoder_outputs: tuple[torch.FloatTensor] | None = None,
    past_key_values: Cache | None = None,
    inputs_embeds: torch.FloatTensor | None = None,
    decoder_inputs_embeds: torch.FloatTensor | None = None,
    labels: torch.LongTensor | None = None,
    use_cache: bool | None = None,
    **kwargs: Unpack[TransformersKwargs],
) -> tuple | Seq2SeqLMOutput:
    r"""
    padding_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)`, *optional*):
        Indices of decoder input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.

        Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
        such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.

        [What are decoder input IDs?](../glossary#decoder-input-ids)

        <Tip warning={true}>

        The `decoder_input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
        target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
        you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
        frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
        target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
        `decoder_input_ids`.

        </Tip>
    decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
        be used by default.
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`, *optional*):
        Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
        `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
        are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

    Examples:
    ```python
    >>> from transformers import AutoProcessor, MusicgenForConditionalGeneration
    >>> import torch

    >>> processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    >>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

    >>> inputs = processor(
    ...     text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
    ...     padding=True,
    ...     return_tensors="pt",
    ... )

    >>> pad_token_id = model.generation_config.pad_token_id
    >>> decoder_input_ids = (
    ...     torch.ones((inputs.input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=torch.long)
    ...     * pad_token_id
    ... )

    >>> logits = model(**inputs, decoder_input_ids=decoder_input_ids).logits
    >>> logits.shape  # (bsz * num_codebooks, tgt_len, vocab_size)
    torch.Size([8, 1, 2048])
    ```"""
    # Route extra keyword arguments by prefix. The prefix is stripped before forwarding, and
    # un-prefixed kwargs are collected in `common_kwargs`, which is passed to both the text
    # encoder and the decoder below (but not to the audio encoder).
    kwargs_text_encoder = {}
    kwargs_audio_encoder = {}
    kwargs_decoder = {}
    common_kwargs = {}
    for key, value in kwargs.items():
        if key.startswith("text_encoder_"):
            kwargs_text_encoder[key[len("text_encoder_") :]] = value
        elif key.startswith("audio_encoder_"):
            kwargs_audio_encoder[key[len("audio_encoder_") :]] = value
        elif key.startswith("decoder_"):
            kwargs_decoder[key[len("decoder_") :]] = value
        else:
            common_kwargs[key] = value

    # Run the text encoder only when the caller did not supply pre-computed encoder outputs;
    # a plain tuple is wrapped so that attribute access (`.last_hidden_state`, ...) works below.
    if encoder_outputs is None:
        encoder_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs_text_encoder,
            **common_kwargs,
        )
    elif isinstance(encoder_outputs, tuple):
        encoder_outputs = BaseModelOutput(*encoder_outputs)

    encoder_hidden_states = encoder_outputs[0]

    # optionally project encoder_hidden_states
    if (
        self.text_encoder.config.hidden_size != self.decoder.config.hidden_size
        and self.decoder.config.cross_attention_hidden_size is None
    ):
        encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)

    # Multiply by the mask so that hidden states at positions where the mask is 0 become zero.
    if attention_mask is not None:
        encoder_hidden_states = encoder_hidden_states * attention_mask[..., None]

    # Decoder inputs are derived only when neither `decoder_input_ids` nor `decoder_inputs_embeds`
    # was given: from `labels` when present, otherwise from the audio encoder's codes.
    if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
        decoder_input_ids = shift_tokens_right(
            labels, self.config.decoder.pad_token_id, self.config.decoder.decoder_start_token_id
        )

    elif decoder_input_ids is None and decoder_inputs_embeds is None:
        audio_encoder_outputs = self.audio_encoder(
            input_values=input_values,
            padding_mask=padding_mask,
            **kwargs_audio_encoder,
        )
        audio_codes = audio_encoder_outputs.audio_codes
        frames, bsz, codebooks, seq_len = audio_codes.shape
        # Only the first frame (`audio_codes[0]`) is used below, so more than one frame is rejected.
        if frames != 1:
            raise ValueError(
                f"Expected 1 frame in the audio code outputs, got {frames} frames. Ensure chunking is "
                "disabled by setting `chunk_length=None` in the audio encoder."
            )

        if self.config.decoder.audio_channels == 2 and audio_codes.shape[2] == self.decoder.num_codebooks // 2:
            # mono input through encodec that we convert to stereo
            audio_codes = audio_codes.repeat_interleave(2, dim=2)

        # Flatten (bsz, num_codebooks, seq_len) into (bsz * num_codebooks, seq_len).
        decoder_input_ids = audio_codes[0, ...].reshape(bsz * self.decoder.num_codebooks, seq_len)

    # Decode
    decoder_outputs: CausalLMOutputWithCrossAttentions = self.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=attention_mask,
        inputs_embeds=decoder_inputs_embeds,
        use_cache=use_cache,
        past_key_values=past_key_values,
        labels=labels,
        **kwargs_decoder,
        **common_kwargs,
    )

    # The loss and logits come straight from the decoder; the encoder fields are taken from
    # `encoder_outputs` (i.e. before the optional projection and masking applied above).
    return Seq2SeqLMOutput(
        loss=decoder_outputs.loss,
        logits=decoder_outputs.logits,
        past_key_values=decoder_outputs.past_key_values,
        decoder_hidden_states=decoder_outputs.hidden_states,
        decoder_attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
        encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        encoder_hidden_states=encoder_outputs.hidden_states,
        encoder_attentions=encoder_outputs.attentions,
    )
- def prepare_inputs_for_generation(
- self,
- decoder_input_ids,
- next_sequence_length: int | None = None,
- past_key_values=None,
- attention_mask=None,
- decoder_attention_mask=None,
- use_cache=None,
- encoder_outputs=None,
- decoder_delay_pattern_mask=None,
- guidance_scale=None,
- **kwargs,
- ):
- # Overwritten -- MusicGen has custom processing
- if decoder_delay_pattern_mask is None:
- decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
- decoder_input_ids,
- self.generation_config.pad_token_id,
- max_length=self.generation_config.max_length,
- )
- # apply the delay pattern mask
- decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
- if guidance_scale is not None and guidance_scale > 1:
- # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these
- # before sampling)
- decoder_input_ids = decoder_input_ids.repeat((2, 1))
- if decoder_attention_mask is not None:
- decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
- if past_key_values is not None:
- decoder_input_ids = (
- decoder_input_ids[:, -next_sequence_length:] if next_sequence_length is not None else decoder_input_ids
- )
- return {
- "input_ids": None, # encoder_outputs is defined. input_ids not needed
- "encoder_outputs": encoder_outputs,
- "past_key_values": past_key_values,
- "decoder_input_ids": decoder_input_ids,
- "attention_mask": attention_mask,
- "decoder_attention_mask": decoder_attention_mask,
- "use_cache": use_cache,
- }
- def _prepare_decoder_input_ids_for_generation(
- self,
- batch_size: int,
- model_input_name: str,
- model_kwargs: dict[str, torch.Tensor],
- decoder_start_token_id: int | None = None,
- bos_token_id: int | None = None,
- device: torch.device | None = None,
- ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
- """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
- # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
- # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
- if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
- decoder_input_ids = model_kwargs.pop("decoder_input_ids")
- elif "input_ids" in model_kwargs and model_input_name != "input_ids":
- decoder_input_ids = model_kwargs.pop("input_ids")
- else:
- decoder_input_ids = None
- # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
- decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
- if device is None:
- device = self.device
- decoder_input_ids_start = (
- torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
- * decoder_start_token_id
- )
- # no user input -> use decoder_start_token_id as decoder_input_ids
- if decoder_input_ids is None:
- decoder_input_ids = decoder_input_ids_start
- # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
- # decoder_attention_mask if provided)
- elif (decoder_input_ids[..., 0] != decoder_start_token_id).all().item():
- decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1)
- if "decoder_attention_mask" in model_kwargs:
- decoder_attention_mask = model_kwargs["decoder_attention_mask"]
- decoder_attention_mask = torch.cat(
- (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
- dim=-1,
- )
- model_kwargs["decoder_attention_mask"] = decoder_attention_mask
- return decoder_input_ids, model_kwargs
def _prepare_text_encoder_kwargs_for_generation(
    self,
    inputs_tensor: torch.Tensor,
    model_kwargs,
    model_input_name: str | None,
    generation_config: GenerationConfig,
) -> dict[str, Any]:
    """
    Run the text encoder once and store its last hidden state in `model_kwargs["encoder_outputs"]`.

    When `generation_config.guidance_scale > 1`, an all-zero copy of the hidden states is appended along the
    batch dimension (the 'null' input for classifier free guidance) and, if present, `attention_mask` is extended
    with zeros in the same way.

    Returns:
        The same `model_kwargs` dict, updated in place.
    """
    # 1. get text encoder
    text_encoder = self.get_encoder()
    # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device
    # as the inputs.
    if hasattr(text_encoder, "_hf_hook"):
        text_encoder._hf_hook.io_same_device = True

    # 2. Keep only the kwargs meant for the encoder: drop decoder/cross-attention/cache arguments, then drop
    # anything the encoder's `forward` does not accept (unless it takes a wildcard).
    skipped_prefixes = ("decoder_", "cross_attn", "use_cache")
    encoder_kwargs = {
        name: value for name, value in model_kwargs.items() if not name.startswith(skipped_prefixes)
    }
    accepted_names = set(inspect.signature(text_encoder.forward).parameters)
    if "kwargs" not in accepted_names and "model_kwargs" not in accepted_names:
        encoder_kwargs = {name: value for name, value in encoder_kwargs.items() if name in accepted_names}
    encoder_kwargs["output_attentions"] = generation_config.output_attentions
    encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states
    guidance_scale = generation_config.guidance_scale

    # 3. make sure that encoder returns `ModelOutput`
    if model_input_name is None:
        model_input_name = self.text_encoder.main_input_name
    encoder_kwargs["return_dict"] = True
    encoder_kwargs[model_input_name] = inputs_tensor
    last_hidden_state = text_encoder(**encoder_kwargs).last_hidden_state

    # for classifier free guidance we need to add a 'null' input to our encoder hidden states
    if guidance_scale is not None and guidance_scale > 1:
        null_hidden_state = torch.zeros_like(last_hidden_state)
        last_hidden_state = torch.concatenate([last_hidden_state, null_hidden_state], dim=0)
        if "attention_mask" in model_kwargs:
            text_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.concatenate([text_mask, torch.zeros_like(text_mask)], dim=0)

    model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=last_hidden_state)
    return model_kwargs
- def _prepare_audio_encoder_kwargs_for_generation(
- self, input_values, model_kwargs, model_input_name: str | None = None
- ):
- # 1. get audio encoder
- encoder = self.get_encoder(modality="audio")
- # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device
- # as the inputs.
- if hasattr(encoder, "_hf_hook"):
- encoder._hf_hook.io_same_device = True
- # 2. Prepare encoder args and encoder kwargs from model kwargs.
- irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
- encoder_kwargs = {
- argument: value
- for argument, value in model_kwargs.items()
- if not any(argument.startswith(p) for p in irrelevant_prefix)
- }
- encoder_signature = set(inspect.signature(encoder.forward).parameters)
- encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
- if not encoder_accepts_wildcard:
- encoder_kwargs = {
- argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
- }
- # 3. make sure that encoder returns `ModelOutput`
- model_input_name = model_input_name if model_input_name is not None else self.audio_encoder.main_input_name
- encoder_kwargs["return_dict"] = True
- if self.decoder.config.audio_channels == 1:
- encoder_kwargs[model_input_name] = input_values
- audio_encoder_outputs = encoder.encode(**encoder_kwargs)
- audio_codes = audio_encoder_outputs.audio_codes
- audio_scales = audio_encoder_outputs.audio_scales
- frames, bsz, codebooks, seq_len = audio_codes.shape
- else:
- if input_values.shape[1] != 2:
- raise ValueError(
- f"Expected stereo audio (2-channels) but example has {input_values.shape[1]} channel."
- )
- encoder_kwargs[model_input_name] = input_values[:, :1, :]
- audio_encoder_outputs_left = encoder.encode(**encoder_kwargs)
- audio_codes_left = audio_encoder_outputs_left.audio_codes
- audio_scales_left = audio_encoder_outputs_left.audio_scales
- encoder_kwargs[model_input_name] = input_values[:, 1:, :]
- audio_encoder_outputs_right = encoder.encode(**encoder_kwargs)
- audio_codes_right = audio_encoder_outputs_right.audio_codes
- audio_scales_right = audio_encoder_outputs_right.audio_scales
- frames, bsz, codebooks, seq_len = audio_codes_left.shape
- # copy alternating left/right channel codes into stereo codebook
- audio_codes = audio_codes_left.new_ones((frames, bsz, 2 * codebooks, seq_len))
- audio_codes[:, :, ::2, :] = audio_codes_left
- audio_codes[:, :, 1::2, :] = audio_codes_right
- if audio_scales_left != [None] or audio_scales_right != [None]:
- audio_scales = torch.stack([audio_scales_left, audio_scales_right], dim=1)
- else:
- audio_scales = [None] * bsz
- if frames != 1:
- raise ValueError(
- f"Expected 1 frame in the audio code outputs, got {frames} frames. Ensure chunking is "
- "disabled by setting `chunk_length=None` in the audio encoder."
- )
- decoder_input_ids = audio_codes[0, ...].reshape(bsz * self.decoder.num_codebooks, seq_len)
- model_kwargs["decoder_input_ids"] = decoder_input_ids
- model_kwargs["audio_scales"] = audio_scales
- return model_kwargs
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
    """Derive decoder input ids from `labels` by shifting them one position to the right.

    The decoder config's `pad_token_id` and `bos_token_id` are passed to `shift_tokens_right` as the padding
    and start token respectively.
    """
    # NOTE(review): `forward` passes `decoder_start_token_id` here instead of `bos_token_id` -- confirm that
    # both are expected to hold the same value.
    decoder_config = self.config.decoder
    return shift_tokens_right(labels, decoder_config.pad_token_id, decoder_config.bos_token_id)
def resize_token_embeddings(self, *args, **kwargs):
    """Always raises: embeddings must be resized on the wrapped sub-models, not on the composite model.

    Raises:
        NotImplementedError: Unconditionally. The message points to the sub-models (`text_encoder`,
            `decoder`) whose own `resize_token_embeddings` should be used instead.
    """
    # The previous message was written for a generic encoder-decoder wrapper and referred to `model.encoder`;
    # the sub-models of this class are accessed as `text_encoder` / `decoder`.
    raise NotImplementedError(
        "Resizing the embedding layers via the MusicgenForConditionalGeneration directly is not supported."
        " Please use the respective methods of the wrapped objects"
        " (model.text_encoder.resize_token_embeddings(...) or model.decoder.resize_token_embeddings(...))"
    )
def freeze_audio_encoder(self):
    """
    Disable gradients for every parameter of the audio encoder and flag the module as frozen by setting its
    `_requires_grad` attribute to `False`.
    """
    audio_encoder = self.audio_encoder
    for weight in audio_encoder.parameters():
        weight.requires_grad = False
    audio_encoder._requires_grad = False
def freeze_text_encoder(self):
    """
    Disable gradients for every parameter of the text encoder and flag the module as frozen by setting its
    `_requires_grad` attribute to `False`.
    """
    text_encoder = self.text_encoder
    for weight in text_encoder.parameters():
        weight.requires_grad = False
    text_encoder._requires_grad = False
- def _maybe_initialize_input_ids_for_generation(
- self,
- inputs: torch.Tensor | None,
- bos_token_id: int | None,
- model_kwargs: dict[str, torch.Tensor],
- ) -> torch.LongTensor:
- """Initializes input ids for generation, if necessary."""
- if inputs is not None:
- return inputs
- encoder_outputs = model_kwargs.get("encoder_outputs")
- if encoder_outputs is not None:
- # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding
- shape = encoder_outputs[0].size()[:-1]
- return torch.ones(shape, dtype=torch.long, device=self.device) * -100
- if bos_token_id is None:
- raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
- # If there is some tensor in `model_kwargs`, we can infer the batch size from it. This is helpful with
- # soft-prompting or in multimodal implementations built on top of decoder-only language models.
- batch_size = 1
- for value in model_kwargs.values():
- if isinstance(value, torch.Tensor):
- batch_size = value.shape[0]
- break
- return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
- def _get_decoder_start_token_id(
- self, decoder_start_token_id: int | list[int] | None = None, bos_token_id: int | None = None
- ) -> int:
- decoder_start_token_id = (
- decoder_start_token_id
- if decoder_start_token_id is not None
- else self.generation_config.decoder_start_token_id
- )
- bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id
- if decoder_start_token_id is not None:
- return decoder_start_token_id
- elif bos_token_id is not None:
- return bos_token_id
- raise ValueError(
- "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
- )
@torch.no_grad()
def generate(
    self,
    inputs: torch.Tensor | None = None,
    generation_config: GenerationConfig | None = None,
    logits_processor: LogitsProcessorList | None = None,
    stopping_criteria: StoppingCriteriaList | None = None,
    synced_gpus: bool | None = None,
    streamer: Optional["BaseStreamer"] = None,
    **kwargs,
):
    """
    Generates sequences of token ids for models with a language modeling head, then decodes them to audio
    values with the audio encoder.

    <Tip warning={true}>

    Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
    model's default generation configuration. You can override any `generation_config` by passing the corresponding
    parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

    For an overview of generation strategies and code examples, check out the [following
    guide](./generation_strategies).

    </Tip>

    Parameters:
        inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
            The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
            method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
            should be in the format `input_ids`. For encoder-decoder models *inputs* can represent any of
            `input_ids`, `input_values`, `input_features`, or `pixel_values`.
        generation_config (`~generation.GenerationConfig`, *optional*):
            The generation configuration to be used as base parametrization for the generation call. `**kwargs`
            passed to generate matching the attributes of `generation_config` will override them. If
            `generation_config` is not provided, the default will be used, which had the following loading
            priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
            configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
            default values, whose documentation should be checked to parameterize generation.
        logits_processor (`LogitsProcessorList`, *optional*):
            Custom logits processors that complement the default logits processors built from arguments and
            generation config. If a logit processor is passed that is already created with the arguments or a
            generation config an error is thrown. This feature is intended for advanced users.
        stopping_criteria (`StoppingCriteriaList`, *optional*):
            Custom stopping criteria that complement the default stopping criteria built from arguments and a
            generation config. If a stopping criteria is passed that is already created with the arguments or a
            generation config an error is thrown. This feature is intended for advanced users.
        synced_gpus (`bool`, *optional*, defaults to `False`):
            Whether to continue running the while loop until max_length (needed to avoid deadlocking with
            `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
        streamer (`BaseStreamer`, *optional*):
            Streamer object that will be used to stream the generated sequences. Generated tokens are passed
            through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
        kwargs (`dict[str, Any]`, *optional*):
            Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
            forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
            specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

    Return:
        [`~utils.ModelOutput`] or `torch.FloatTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
        or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. In both cases the generated
        token ids are replaced by the `audio_values` decoded by the audio encoder: either the tensor itself is
        returned, or it is stored in the `sequences` field of the returned output.

            If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
            [`~utils.ModelOutput`] types are:

                - [`~generation.GenerateDecoderOnlyOutput`],
                - [`~generation.GenerateBeamDecoderOnlyOutput`]

            If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
            [`~utils.ModelOutput`] types are:

                - [`~generation.GenerateEncoderDecoderOutput`],
                - [`~generation.GenerateBeamEncoderDecoderOutput`]

    Raises:
        ValueError: If the resolved generation mode is neither greedy search nor sampling.
    """
    # NOTE(review): `synced_gpus` is accepted for API compatibility but is not read anywhere in this body.

    # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
    generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
    generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)

    # only greedy search and sampling are accepted; every other mode is rejected up front
    generation_mode = generation_config.get_generation_mode()
    if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
        raise ValueError(
            "Got incompatible mode for generation, should be one of greedy or sampling. "
            "Ensure that beam search is de-activated by setting `num_beams=1`."
        )

    # validate a copy so that the validation cannot consume entries of the real `model_kwargs`
    self._validate_model_kwargs(model_kwargs.copy())
    self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)

    if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) is tuple:
        # wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate
        model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=model_kwargs["encoder_outputs"][0])

    # 2. Set generation parameters if not already defined
    logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

    # an attention mask is only created below when the text encoder still has to be run
    requires_attention_mask = "encoder_outputs" not in model_kwargs
    kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None

    # 3. Define model inputs
    inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
        inputs, generation_config.bos_token_id, model_kwargs
    )
    batch_size = inputs_tensor.shape[0]
    self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=inputs_tensor.device)

    # 4. Define other model kwargs
    model_kwargs["use_cache"] = generation_config.use_cache
    model_kwargs["guidance_scale"] = generation_config.guidance_scale

    if model_kwargs.get("attention_mask", None) is None and requires_attention_mask:
        model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
            inputs_tensor, generation_config, model_kwargs
        )

    if "encoder_outputs" not in model_kwargs:
        # encoder_outputs are created and added to `model_kwargs`
        model_kwargs = self._prepare_text_encoder_kwargs_for_generation(
            inputs_tensor, model_kwargs, model_input_name, generation_config
        )

    # an audio prompt (`input_values`) is turned into `decoder_input_ids` unless those were passed explicitly
    if "decoder_input_ids" not in model_kwargs and "input_values" in model_kwargs:
        model_kwargs = self._prepare_audio_encoder_kwargs_for_generation(
            model_kwargs["input_values"],
            model_kwargs,
        )

    # 5. Prepare `input_ids` which will be used for auto-regressive generation
    input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
        batch_size=batch_size,
        model_input_name=model_input_name,
        model_kwargs=model_kwargs,
        decoder_start_token_id=generation_config._decoder_start_token_tensor,
        bos_token_id=generation_config._bos_token_tensor,
        device=inputs_tensor.device,
    )

    # 6. Prepare `max_length` depending on other stopping criteria.
    input_ids_length = input_ids.shape[-1]
    has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
    has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
    generation_config = self._prepare_generated_length(
        generation_config=generation_config,
        has_default_max_length=has_default_max_length,
        has_default_min_length=has_default_min_length,
        model_input_name=model_input_name,
        inputs_tensor=inputs_tensor,
        input_ids_length=input_ids_length,
    )

    # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
    input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
        input_ids,
        pad_token_id=generation_config._decoder_start_token_tensor,
        max_length=generation_config.max_length,
    )
    # stash the delay mask so that we don't have to recompute in each forward pass
    model_kwargs["decoder_delay_pattern_mask"] = decoder_delay_pattern_mask

    # input_ids are ready to be placed on the streamer (if used)
    if streamer is not None:
        streamer.put(input_ids.cpu())

    # 7. determine generation mode
    # NOTE(review): this value is not read again below; `_sample` is always called.
    generation_mode = generation_config.get_generation_mode()

    # 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
    if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
        logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
        # cleared after the processor has been added here, before `_get_logits_processor` reads the config
        generation_config.guidance_scale = None

    # 9. prepare distribution pre_processing samplers
    logits_processor = self._get_logits_processor(
        generation_config=generation_config,
        input_ids_seq_length=input_ids_length,
        encoder_input_ids=inputs_tensor,
        prefix_allowed_tokens_fn=None,
        logits_processor=logits_processor,
        device=input_ids.device,
    )

    # 10. prepare stopping criteria
    stopping_criteria = self._get_stopping_criteria(
        generation_config=generation_config, stopping_criteria=stopping_criteria
    )

    # expand input_ids with `num_return_sequences` additional sequences per batch
    input_ids, model_kwargs = self._expand_inputs_for_generation(
        input_ids=input_ids,
        expand_size=generation_config.num_return_sequences,
        is_encoder_decoder=self.config.is_encoder_decoder,
        **model_kwargs,
    )

    # 10b. prepare prefill outputs
    generation_mode_kwargs["prefill_outputs"] = self._prefill(input_ids, generation_config, model_kwargs)

    # 11. run sample
    outputs = self._sample(
        input_ids,
        logits_processor=logits_processor,
        stopping_criteria=stopping_criteria,
        generation_config=generation_config,
        **generation_mode_kwargs,
        **model_kwargs,
    )

    if generation_config.return_dict_in_generate:
        output_ids = outputs.sequences
    else:
        output_ids = outputs

    # apply the pattern mask to the final ids
    output_ids = self.decoder.apply_delay_pattern_mask(output_ids, model_kwargs["decoder_delay_pattern_mask"])

    # revert the pattern delay mask by filtering the pad token id
    output_ids = output_ids[output_ids != generation_config._pad_token_tensor].reshape(
        batch_size, self.decoder.num_codebooks, -1
    )

    # append the frame dimension back to the audio codes
    output_ids = output_ids[None, ...]

    # without an audio prompt no scales were stored, so fall back to one `None` per sample
    audio_scales = model_kwargs.get("audio_scales")
    if audio_scales is None:
        audio_scales = [None] * batch_size

    if self.decoder.config.audio_channels == 1:
        output_values = self.audio_encoder.decode(
            output_ids,
            audio_scales=audio_scales,
        ).audio_values
    else:
        # even codebook indices are decoded as one channel, odd indices as the other; the two decoded
        # channels are then concatenated along dim 1
        codec_outputs_left = self.audio_encoder.decode(output_ids[:, :, ::2, :], audio_scales=audio_scales)
        output_values_left = codec_outputs_left.audio_values

        codec_outputs_right = self.audio_encoder.decode(output_ids[:, :, 1::2, :], audio_scales=audio_scales)
        output_values_right = codec_outputs_right.audio_values

        output_values = torch.cat([output_values_left, output_values_right], dim=1)

    # the decoded audio replaces the token ids in whatever form the caller asked for
    if generation_config.return_dict_in_generate:
        outputs.sequences = output_values
        return outputs
    else:
        return output_values
def get_unconditional_inputs(self, num_samples=1):
    """
    Helper function to get null inputs for unconditional generation, enabling the model to be used without the
    feature extractor or tokenizer.

    Args:
        num_samples (int, *optional*, defaults to 1):
            Number of audio samples to unconditionally generate.

    Returns:
        `MusicgenUnconditionalInput` holding an all-zero encoder hidden state of shape
        `(num_samples, 1, text_encoder_hidden_size)`, an all-zero attention mask of shape `(num_samples, 1)`
        and `guidance_scale=1.0`.

    Example:
    ```python
    >>> from transformers import MusicgenForConditionalGeneration

    >>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

    >>> # get the unconditional (or 'null') inputs for the model
    >>> unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
    >>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
    ```"""
    hidden_size = self.config.text_encoder.hidden_size

    # a single all-zero "token" per sample stands in for the text conditioning
    null_hidden_state = torch.zeros((num_samples, 1, hidden_size), device=self.device, dtype=self.dtype)
    null_attention_mask = torch.zeros((num_samples, 1), device=self.device, dtype=torch.long)

    return MusicgenUnconditionalInput(
        encoder_outputs=(null_hidden_state,),
        attention_mask=null_attention_mask,
        guidance_scale=1.0,
    )
# Names exported by a star-import of this module.
__all__ = [
    "MusicgenForConditionalGeneration",
    "MusicgenForCausalLM",
    "MusicgenModel",
    "MusicgenPreTrainedModel",
]
|