modeling_pop2piano.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083
  1. # Copyright 2023 The Pop2Piano Authors and The HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """PyTorch Pop2Piano model."""
  15. import copy
  16. import math
  17. import torch
  18. from torch import nn
  19. from torch.nn import CrossEntropyLoss
  20. from transformers.generation import GenerationConfig
  21. from ... import initialization as init
  22. from ...activations import ACT2FN
  23. from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
  24. from ...generation import GenerationMixin
  25. from ...masking_utils import create_causal_mask
  26. from ...modeling_layers import GradientCheckpointingLayer
  27. from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput
  28. from ...modeling_utils import PreTrainedModel
  29. from ...utils import auto_docstring, is_torchdynamo_compiling, logging
  30. from .configuration_pop2piano import Pop2PianoConfig
logger = logging.get_logger(__name__)

# Stays True unless apex's FusedRMSNorm is imported successfully in the `try` block below.
_load_pop2piano_layer_norm = True

try:
    from apex.normalization import FusedRMSNorm

    # Import succeeded: remember that the apex implementation should be used.
    _load_pop2piano_layer_norm = False

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNorm")
except ImportError:
    # using the normal Pop2PianoLayerNorm
    pass
except Exception:
    # apex could be found but raised something other than ImportError while loading
    logger.warning("Discovered apex but it failed to load, falling back to Pop2PianoLayerNorm")
  42. # Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->Pop2Piano
  43. class Pop2PianoLayerNorm(nn.Module):
  44. def __init__(self, hidden_size, eps=1e-6):
  45. """
  46. Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
  47. """
  48. super().__init__()
  49. self.weight = nn.Parameter(torch.ones(hidden_size))
  50. self.variance_epsilon = eps
  51. def forward(self, hidden_states):
  52. # Pop2Piano uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
  53. # Square Layer Normalization https://huggingface.co/papers/1910.07467 thus variance is calculated
  54. # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
  55. # half-precision inputs is done in fp32
  56. variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
  57. hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
  58. # convert into half-precision if necessary
  59. if self.weight.dtype in [torch.float16, torch.bfloat16]:
  60. hidden_states = hidden_states.to(self.weight.dtype)
  61. return self.weight * hidden_states
# When the optional apex import succeeded (flag set to False), rebind the name so every later
# `Pop2PianoLayerNorm(...)` call in this module constructs apex's FusedRMSNorm instead.
if not _load_pop2piano_layer_norm:
    Pop2PianoLayerNorm = FusedRMSNorm
  64. # Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Pop2Piano,t5->pop2piano
  65. class Pop2PianoDenseActDense(nn.Module):
  66. def __init__(self, config: Pop2PianoConfig):
  67. super().__init__()
  68. self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
  69. self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
  70. self.dropout = nn.Dropout(config.dropout_rate)
  71. self.act = ACT2FN[config.dense_act_fn]
  72. def forward(self, hidden_states):
  73. hidden_states = self.wi(hidden_states)
  74. hidden_states = self.act(hidden_states)
  75. hidden_states = self.dropout(hidden_states)
  76. if (
  77. isinstance(self.wo.weight, torch.Tensor)
  78. and hidden_states.dtype != self.wo.weight.dtype
  79. and self.wo.weight.dtype != torch.int8
  80. ):
  81. hidden_states = hidden_states.to(self.wo.weight.dtype)
  82. hidden_states = self.wo(hidden_states)
  83. return hidden_states
  84. # Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->Pop2Piano
  85. class Pop2PianoDenseGatedActDense(nn.Module):
  86. def __init__(self, config: Pop2PianoConfig):
  87. super().__init__()
  88. self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
  89. self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
  90. self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
  91. self.dropout = nn.Dropout(config.dropout_rate)
  92. self.act = ACT2FN[config.dense_act_fn]
  93. def forward(self, hidden_states):
  94. hidden_gelu = self.act(self.wi_0(hidden_states))
  95. hidden_linear = self.wi_1(hidden_states)
  96. hidden_states = hidden_gelu * hidden_linear
  97. hidden_states = self.dropout(hidden_states)
  98. # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
  99. # See https://github.com/huggingface/transformers/issues/20287
  100. # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None``
  101. if (
  102. isinstance(self.wo.weight, torch.Tensor)
  103. and hidden_states.dtype != self.wo.weight.dtype
  104. and self.wo.weight.dtype != torch.int8
  105. ):
  106. hidden_states = hidden_states.to(self.wo.weight.dtype)
  107. hidden_states = self.wo(hidden_states)
  108. return hidden_states
  109. # Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->Pop2Piano
  110. class Pop2PianoLayerFF(nn.Module):
  111. def __init__(self, config: Pop2PianoConfig):
  112. super().__init__()
  113. if config.is_gated_act:
  114. self.DenseReluDense = Pop2PianoDenseGatedActDense(config)
  115. else:
  116. self.DenseReluDense = Pop2PianoDenseActDense(config)
  117. self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
  118. self.dropout = nn.Dropout(config.dropout_rate)
  119. def forward(self, hidden_states):
  120. forwarded_states = self.layer_norm(hidden_states)
  121. forwarded_states = self.DenseReluDense(forwarded_states)
  122. hidden_states = hidden_states + self.dropout(forwarded_states)
  123. return hidden_states
  124. # Copied from transformers.models.t5.modeling_t5.T5Attention with T5->Pop2Piano,t5->pop2piano
  125. class Pop2PianoAttention(nn.Module):
  126. def __init__(
  127. self,
  128. config: Pop2PianoConfig,
  129. has_relative_attention_bias=False,
  130. layer_idx: int | None = None,
  131. ):
  132. super().__init__()
  133. self.is_decoder = config.is_decoder
  134. self.has_relative_attention_bias = has_relative_attention_bias
  135. self.relative_attention_num_buckets = config.relative_attention_num_buckets
  136. self.relative_attention_max_distance = config.relative_attention_max_distance
  137. self.d_model = config.d_model
  138. self.key_value_proj_dim = config.d_kv
  139. self.n_heads = config.num_heads
  140. self.dropout = config.dropout_rate
  141. self.inner_dim = self.n_heads * self.key_value_proj_dim
  142. self.layer_idx = layer_idx
  143. if layer_idx is None and self.is_decoder:
  144. logger.warning_once(
  145. f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
  146. "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
  147. "when creating this class."
  148. )
  149. self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
  150. self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
  151. self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
  152. self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
  153. if self.has_relative_attention_bias:
  154. self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
  155. self.gradient_checkpointing = False
  156. @staticmethod
  157. def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
  158. """
  159. Adapted from Mesh Tensorflow:
  160. https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
  161. Translate relative position to a bucket number for relative attention. The relative position is defined as
  162. memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
  163. position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
  164. small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
  165. positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
  166. This should allow for more graceful generalization to longer sequences than the model has been trained on
  167. Args:
  168. relative_position: an int32 Tensor
  169. bidirectional: a boolean - whether the attention is bidirectional
  170. num_buckets: an integer
  171. max_distance: an integer
  172. Returns:
  173. a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
  174. """
  175. relative_buckets = 0
  176. if bidirectional:
  177. num_buckets //= 2
  178. relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
  179. relative_position = torch.abs(relative_position)
  180. else:
  181. relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
  182. # now relative_position is in the range [0, inf)
  183. # half of the buckets are for exact increments in positions
  184. max_exact = num_buckets // 2
  185. is_small = relative_position < max_exact
  186. # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
  187. relative_position_if_large = max_exact + (
  188. torch.log(relative_position.float() / max_exact)
  189. / math.log(max_distance / max_exact)
  190. * (num_buckets - max_exact)
  191. ).to(torch.long)
  192. relative_position_if_large = torch.min(
  193. relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
  194. )
  195. relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
  196. return relative_buckets
  197. def compute_bias(self, query_length, key_length, device=None, past_seen_tokens=0):
  198. """Compute binned relative position bias"""
  199. if device is None:
  200. device = self.relative_attention_bias.weight.device
  201. context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + past_seen_tokens
  202. memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
  203. relative_position = memory_position - context_position # shape (query_length, key_length)
  204. relative_position_bucket = self._relative_position_bucket(
  205. relative_position, # shape (query_length, key_length)
  206. bidirectional=(not self.is_decoder),
  207. num_buckets=self.relative_attention_num_buckets,
  208. max_distance=self.relative_attention_max_distance,
  209. )
  210. values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
  211. values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
  212. return values
  213. def forward(
  214. self,
  215. hidden_states,
  216. mask=None,
  217. key_value_states=None,
  218. position_bias=None,
  219. past_key_values=None,
  220. output_attentions=False,
  221. **kwargs,
  222. ):
  223. """
  224. Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
  225. """
  226. # Input is (batch_size, seq_length, dim)
  227. # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
  228. input_shape = hidden_states.shape[:-1]
  229. hidden_shape = (*input_shape, -1, self.key_value_proj_dim)
  230. past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0
  231. # We clone here for StaticCache, as we get the value before updating it, but use it after and it's the same ref
  232. past_seen_tokens = past_seen_tokens.clone() if isinstance(past_seen_tokens, torch.Tensor) else past_seen_tokens
  233. # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
  234. is_cross_attention = key_value_states is not None
  235. query_states = self.q(hidden_states).view(hidden_shape).transpose(1, 2)
  236. # Check is encoder-decoder model is being used. Otherwise we'll get `DynamicCache`
  237. is_updated = False
  238. if isinstance(past_key_values, EncoderDecoderCache):
  239. is_updated = past_key_values.is_updated.get(self.layer_idx)
  240. if is_cross_attention:
  241. # after the first generated id, we can subsequently re-use all key/value_states from cache
  242. curr_past_key_values = past_key_values.cross_attention_cache
  243. else:
  244. curr_past_key_values = past_key_values.self_attention_cache
  245. else:
  246. curr_past_key_values = past_key_values
  247. current_states = key_value_states if is_cross_attention else hidden_states
  248. if is_cross_attention and past_key_values is not None and is_updated:
  249. # reuse k,v, cross_attentions
  250. key_states = curr_past_key_values.layers[self.layer_idx].keys
  251. value_states = curr_past_key_values.layers[self.layer_idx].values
  252. else:
  253. kv_shape = (*current_states.shape[:-1], -1, self.key_value_proj_dim)
  254. key_states = self.k(current_states).view(kv_shape).transpose(1, 2)
  255. value_states = self.v(current_states).view(kv_shape).transpose(1, 2)
  256. if past_key_values is not None:
  257. key_states, value_states = curr_past_key_values.update(key_states, value_states, self.layer_idx)
  258. # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
  259. if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
  260. past_key_values.is_updated[self.layer_idx] = True
  261. # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
  262. scores = torch.matmul(query_states, key_states.transpose(3, 2))
  263. if position_bias is None:
  264. key_length = key_states.shape[-2]
  265. if not self.has_relative_attention_bias:
  266. position_bias = torch.zeros(
  267. (1, query_states.shape[1], input_shape[1], key_length), device=scores.device, dtype=scores.dtype
  268. )
  269. if self.gradient_checkpointing and self.training:
  270. position_bias.requires_grad = True
  271. else:
  272. position_bias = self.compute_bias(
  273. input_shape[1], key_length, device=scores.device, past_seen_tokens=past_seen_tokens
  274. )
  275. if mask is not None:
  276. causal_mask = mask[:, :, :, : key_states.shape[-2]]
  277. position_bias = position_bias + causal_mask
  278. position_bias_masked = position_bias
  279. scores += position_bias_masked
  280. # (batch_size, n_heads, seq_length, key_length)
  281. attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
  282. attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
  283. attn_output = torch.matmul(attn_weights, value_states)
  284. attn_output = attn_output.transpose(1, 2).contiguous()
  285. attn_output = attn_output.reshape(*input_shape, -1)
  286. attn_output = self.o(attn_output)
  287. outputs = (attn_output, position_bias)
  288. if output_attentions:
  289. outputs = outputs + (attn_weights,)
  290. return outputs
  291. # Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->Pop2Piano,t5->pop2piano
  292. class Pop2PianoLayerSelfAttention(nn.Module):
  293. def __init__(self, config, has_relative_attention_bias=False, layer_idx: int | None = None):
  294. super().__init__()
  295. self.SelfAttention = Pop2PianoAttention(
  296. config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
  297. )
  298. self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
  299. self.dropout = nn.Dropout(config.dropout_rate)
  300. def forward(
  301. self,
  302. hidden_states,
  303. attention_mask=None,
  304. position_bias=None,
  305. past_key_values=None,
  306. use_cache=False,
  307. output_attentions=False,
  308. **kwargs,
  309. ):
  310. normed_hidden_states = self.layer_norm(hidden_states)
  311. attention_output = self.SelfAttention(
  312. normed_hidden_states,
  313. mask=attention_mask,
  314. position_bias=position_bias,
  315. past_key_values=past_key_values,
  316. use_cache=use_cache,
  317. output_attentions=output_attentions,
  318. )
  319. hidden_states = hidden_states + self.dropout(attention_output[0])
  320. outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
  321. return outputs
  322. # Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->Pop2Piano,t5->pop2piano
  323. class Pop2PianoLayerCrossAttention(nn.Module):
  324. def __init__(self, config, layer_idx: int | None = None):
  325. super().__init__()
  326. self.EncDecAttention = Pop2PianoAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
  327. self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
  328. self.dropout = nn.Dropout(config.dropout_rate)
  329. def forward(
  330. self,
  331. hidden_states,
  332. key_value_states,
  333. attention_mask=None,
  334. position_bias=None,
  335. past_key_values=None,
  336. output_attentions=False,
  337. **kwargs,
  338. ):
  339. normed_hidden_states = self.layer_norm(hidden_states)
  340. attention_output = self.EncDecAttention(
  341. normed_hidden_states,
  342. mask=attention_mask,
  343. key_value_states=key_value_states,
  344. position_bias=position_bias,
  345. past_key_values=past_key_values,
  346. output_attentions=output_attentions,
  347. )
  348. layer_output = hidden_states + self.dropout(attention_output[0])
  349. outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
  350. return outputs
  351. # Copied from transformers.models.t5.modeling_t5.T5Block with T5->Pop2Piano,t5->pop2piano
  352. class Pop2PianoBlock(GradientCheckpointingLayer):
  353. def __init__(self, config, has_relative_attention_bias=False, layer_idx: int | None = None):
  354. super().__init__()
  355. self.is_decoder = config.is_decoder
  356. self.layer = nn.ModuleList()
  357. self.layer.append(
  358. Pop2PianoLayerSelfAttention(
  359. config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
  360. )
  361. )
  362. if self.is_decoder:
  363. self.layer.append(Pop2PianoLayerCrossAttention(config, layer_idx=layer_idx))
  364. self.layer.append(Pop2PianoLayerFF(config))
  365. def forward(
  366. self,
  367. hidden_states,
  368. attention_mask=None,
  369. position_bias=None,
  370. encoder_hidden_states=None,
  371. encoder_attention_mask=None,
  372. encoder_decoder_position_bias=None,
  373. past_key_values=None,
  374. use_cache=False,
  375. output_attentions=False,
  376. return_dict=True,
  377. **kwargs,
  378. ):
  379. self_attention_outputs = self.layer[0](
  380. hidden_states,
  381. attention_mask=attention_mask,
  382. position_bias=position_bias,
  383. past_key_values=past_key_values,
  384. use_cache=use_cache,
  385. output_attentions=output_attentions,
  386. )
  387. hidden_states = self_attention_outputs[0]
  388. attention_outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights
  389. # clamp inf values to enable fp16 training
  390. if hidden_states.dtype == torch.float16:
  391. clamp_value = torch.where(
  392. torch.isinf(hidden_states).any(),
  393. torch.finfo(hidden_states.dtype).max - 1000,
  394. torch.finfo(hidden_states.dtype).max,
  395. )
  396. hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
  397. do_cross_attention = self.is_decoder and encoder_hidden_states is not None
  398. if do_cross_attention:
  399. cross_attention_outputs = self.layer[1](
  400. hidden_states,
  401. key_value_states=encoder_hidden_states,
  402. attention_mask=encoder_attention_mask,
  403. position_bias=encoder_decoder_position_bias,
  404. past_key_values=past_key_values,
  405. output_attentions=output_attentions,
  406. )
  407. hidden_states = cross_attention_outputs[0]
  408. # clamp inf values to enable fp16 training
  409. if hidden_states.dtype == torch.float16:
  410. clamp_value = torch.where(
  411. torch.isinf(hidden_states).any(),
  412. torch.finfo(hidden_states.dtype).max - 1000,
  413. torch.finfo(hidden_states.dtype).max,
  414. )
  415. hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
  416. # Keep cross-attention outputs and relative position weights
  417. attention_outputs = attention_outputs + cross_attention_outputs[1:]
  418. # Apply Feed Forward layer
  419. hidden_states = self.layer[-1](hidden_states)
  420. # clamp inf values to enable fp16 training
  421. if hidden_states.dtype == torch.float16:
  422. clamp_value = torch.where(
  423. torch.isinf(hidden_states).any(),
  424. torch.finfo(hidden_states.dtype).max - 1000,
  425. torch.finfo(hidden_states.dtype).max,
  426. )
  427. hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
  428. outputs = (hidden_states,)
  429. return (
  430. outputs + attention_outputs
  431. ) # hidden-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
  432. @auto_docstring
  433. class Pop2PianoPreTrainedModel(PreTrainedModel):
  434. config: Pop2PianoConfig
  435. base_model_prefix = "transformer"
  436. output_modalities = ("audio",)
  437. supports_gradient_checkpointing = True
  438. _can_compile_fullgraph = False
  439. _no_split_modules = ["Pop2PianoBlock"]
  440. _keep_in_fp32_modules = ["wo"]
  441. @torch.no_grad()
  442. def _init_weights(self, module):
  443. """Initialize the weights"""
  444. factor = self.config.initializer_factor # Used for testing weights initialization
  445. if isinstance(module, Pop2PianoLayerNorm):
  446. init.constant_(module.weight, factor * 1.0)
  447. elif isinstance(module, Pop2PianoConcatEmbeddingToMel):
  448. init.normal_(module.embedding.weight, mean=0.0, std=factor * 1.0)
  449. elif isinstance(module, Pop2PianoForConditionalGeneration):
  450. init.normal_(module.shared.weight, mean=0.0, std=factor * 1.0)
  451. if hasattr(module, "lm_head"):
  452. init.normal_(module.lm_head.weight, mean=0.0, std=factor * 1.0)
  453. elif isinstance(module, Pop2PianoDenseActDense):
  454. init.normal_(module.wi.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
  455. if hasattr(module.wi, "bias") and module.wi.bias is not None:
  456. init.zeros_(module.wi.bias)
  457. init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
  458. if hasattr(module.wo, "bias") and module.wo.bias is not None:
  459. init.zeros_(module.wo.bias)
  460. elif isinstance(module, Pop2PianoDenseGatedActDense):
  461. init.normal_(module.wi_0.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
  462. if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
  463. init.zeros_(module.wi_0.bias)
  464. init.normal_(module.wi_1.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
  465. if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
  466. init.zeros_(module.wi_1.bias)
  467. init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
  468. if hasattr(module.wo, "bias") and module.wo.bias is not None:
  469. init.zeros_(module.wo.bias)
  470. elif isinstance(module, Pop2PianoAttention):
  471. d_model = self.config.d_model
  472. key_value_proj_dim = self.config.d_kv
  473. n_heads = self.config.num_heads
  474. init.normal_(module.q.weight, mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
  475. init.normal_(module.k.weight, mean=0.0, std=factor * (d_model**-0.5))
  476. init.normal_(module.v.weight, mean=0.0, std=factor * (d_model**-0.5))
  477. init.normal_(module.o.weight, mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
  478. if module.has_relative_attention_bias:
  479. init.normal_(module.relative_attention_bias.weight, mean=0.0, std=factor * ((d_model) ** -0.5))
  480. def _shift_right(self, input_ids):
  481. decoder_start_token_id = self.config.decoder_start_token_id
  482. pad_token_id = self.config.pad_token_id
  483. if decoder_start_token_id is None:
  484. raise ValueError(
  485. "self.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id."
  486. )
  487. shifted_input_ids = input_ids.new_zeros(input_ids.shape)
  488. shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
  489. shifted_input_ids[..., 0] = decoder_start_token_id
  490. if pad_token_id is None:
  491. raise ValueError("self.model.config.pad_token_id has to be defined.")
  492. # replace possible -100 values in labels by `pad_token_id`
  493. shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
  494. return shifted_input_ids
  495. class Pop2PianoStack(Pop2PianoPreTrainedModel):
  496. # Copied from transformers.models.t5.modeling_t5.T5Stack.__init__ with T5->Pop2Piano,t5->pop2piano
  497. def __init__(self, config):
  498. super().__init__(config)
  499. self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
  500. self.is_decoder = config.is_decoder
  501. self.block = nn.ModuleList(
  502. [
  503. Pop2PianoBlock(config, has_relative_attention_bias=bool(i == 0), layer_idx=i)
  504. for i in range(config.num_layers)
  505. ]
  506. )
  507. self.final_layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
  508. self.dropout = nn.Dropout(config.dropout_rate)
  509. # Initialize weights and apply final processing
  510. self.post_init()
  511. self.gradient_checkpointing = False
  512. # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        """Replace the module stored in `self.embed_tokens` with `new_embeddings`."""
        self.embed_tokens = new_embeddings
  515. def forward(
  516. self,
  517. input_ids=None,
  518. attention_mask=None,
  519. encoder_hidden_states=None,
  520. encoder_attention_mask=None,
  521. inputs_embeds=None,
  522. past_key_values=None,
  523. use_cache=None,
  524. output_attentions=None,
  525. output_hidden_states=None,
  526. return_dict=None,
  527. **kwargs,
  528. ):
  529. use_cache = use_cache if use_cache is not None else self.config.use_cache
  530. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  531. output_hidden_states = (
  532. output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  533. )
  534. return_dict = return_dict if return_dict is not None else self.config.return_dict
  535. if input_ids is not None and inputs_embeds is not None:
  536. err_msg_prefix = "decoder_" if self.is_decoder else ""
  537. raise ValueError(
  538. f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
  539. )
  540. elif input_ids is not None:
  541. input_shape = input_ids.size()
  542. input_ids = input_ids.view(-1, input_shape[-1])
  543. elif inputs_embeds is not None:
  544. input_shape = inputs_embeds.size()[:-1]
  545. else:
  546. err_msg_prefix = "decoder_" if self.is_decoder else ""
  547. raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
  548. if self.gradient_checkpointing and self.training:
  549. if use_cache:
  550. logger.warning_once(
  551. "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
  552. )
  553. use_cache = False
  554. if inputs_embeds is None:
  555. if self.embed_tokens is None:
  556. raise ValueError("You have to initialize the model with valid token embeddings")
  557. inputs_embeds = self.embed_tokens(input_ids)
  558. batch_size, seq_length = input_shape
  559. if use_cache is True:
  560. if not self.is_decoder:
  561. raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")
  562. if self.is_decoder:
  563. if use_cache and past_key_values is None:
  564. if self.config.is_encoder_decoder:
  565. past_key_values = EncoderDecoderCache(
  566. DynamicCache(config=self.config), DynamicCache(config=self.config)
  567. )
  568. else:
  569. past_key_values = DynamicCache(config=self.config)
  570. elif not self.is_decoder:
  571. # do not pass cache object down the line for encoder stack
  572. # it messes indexing later in decoder-stack because cache object is modified in-place
  573. past_key_values = None
  574. past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
  575. if attention_mask is None and not is_torchdynamo_compiling():
  576. # required mask seq length can be calculated via length of past cache
  577. mask_seq_length = past_key_values_length + seq_length
  578. attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
  579. if self.config.is_decoder:
  580. causal_mask = create_causal_mask(
  581. config=self.config,
  582. inputs_embeds=inputs_embeds,
  583. attention_mask=attention_mask,
  584. past_key_values=past_key_values,
  585. )
  586. else:
  587. causal_mask = attention_mask[:, None, None, :]
  588. causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
  589. causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
  590. # If a 2D or 3D attention mask is provided for the cross-attention
  591. # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
  592. if self.is_decoder and encoder_hidden_states is not None:
  593. encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
  594. encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
  595. if encoder_attention_mask is None:
  596. encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
  597. encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
  598. else:
  599. encoder_extended_attention_mask = None
  600. all_hidden_states = () if output_hidden_states else None
  601. all_attentions = () if output_attentions else None
  602. all_cross_attentions = () if (output_attentions and self.is_decoder) else None
  603. position_bias = None
  604. encoder_decoder_position_bias = None
  605. hidden_states = self.dropout(inputs_embeds)
  606. for i, layer_module in enumerate(self.block):
  607. if output_hidden_states:
  608. all_hidden_states = all_hidden_states + (hidden_states,)
  609. layer_outputs = layer_module(
  610. hidden_states,
  611. causal_mask,
  612. position_bias,
  613. encoder_hidden_states,
  614. encoder_extended_attention_mask,
  615. encoder_decoder_position_bias, # as a positional argument for gradient checkpointing
  616. past_key_values=past_key_values,
  617. use_cache=use_cache,
  618. output_attentions=output_attentions,
  619. )
  620. hidden_states = layer_outputs[0]
  621. # We share the position biases between the layers - the first layer store them
  622. # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
  623. # (cross-attention position bias), (cross-attention weights)
  624. position_bias = layer_outputs[1]
  625. if self.is_decoder and encoder_hidden_states is not None:
  626. encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]
  627. if output_attentions:
  628. all_attentions = all_attentions + (layer_outputs[2],)
  629. if self.is_decoder:
  630. all_cross_attentions = all_cross_attentions + (layer_outputs[4],)
  631. hidden_states = self.final_layer_norm(hidden_states)
  632. hidden_states = self.dropout(hidden_states)
  633. # Add last layer
  634. if output_hidden_states:
  635. all_hidden_states = all_hidden_states + (hidden_states,)
  636. if not return_dict:
  637. return tuple(
  638. v
  639. for v in [
  640. hidden_states,
  641. past_key_values,
  642. all_hidden_states,
  643. all_attentions,
  644. all_cross_attentions,
  645. ]
  646. if v is not None
  647. )
  648. return BaseModelOutputWithPastAndCrossAttentions(
  649. last_hidden_state=hidden_states,
  650. past_key_values=past_key_values,
  651. hidden_states=all_hidden_states,
  652. attentions=all_attentions,
  653. cross_attentions=all_cross_attentions,
  654. )
  655. class Pop2PianoConcatEmbeddingToMel(nn.Module):
  656. """Embedding Matrix for `composer` tokens."""
  657. def __init__(self, config):
  658. super().__init__()
  659. self.embedding = nn.Embedding(num_embeddings=config.composer_vocab_size, embedding_dim=config.d_model)
  660. def forward(self, feature, index_value, embedding_offset):
  661. index_shifted = index_value - embedding_offset
  662. composer_embedding = self.embedding(index_shifted).unsqueeze(1)
  663. inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
  664. return inputs_embeds
  665. @auto_docstring(
  666. custom_intro="""
  667. Pop2Piano Model with a `language modeling` head on top.
  668. """
  669. )
  670. class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixin):
  671. _tied_weights_keys = {
  672. "encoder.embed_tokens.weight": "shared.weight",
  673. "decoder.embed_tokens.weight": "shared.weight",
  674. }
  675. def __init__(self, config: Pop2PianoConfig):
  676. super().__init__(config)
  677. self.config = config
  678. self.model_dim = config.d_model
  679. self.shared = nn.Embedding(config.vocab_size, config.d_model)
  680. self.mel_conditioner = Pop2PianoConcatEmbeddingToMel(config)
  681. encoder_config = copy.deepcopy(config)
  682. encoder_config.is_decoder = False
  683. encoder_config.use_cache = False
  684. self.encoder = Pop2PianoStack(encoder_config)
  685. decoder_config = copy.deepcopy(config)
  686. decoder_config.is_decoder = True
  687. decoder_config.num_layers = config.num_decoder_layers
  688. self.decoder = Pop2PianoStack(decoder_config)
  689. self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
  690. # Initialize weights and apply final processing
  691. self.post_init()
  692. def get_input_embeddings(self):
  693. return self.shared
  694. def set_input_embeddings(self, new_embeddings):
  695. self.shared = new_embeddings
  696. self.encoder.set_input_embeddings(new_embeddings)
  697. self.decoder.set_input_embeddings(new_embeddings)
  698. def get_mel_conditioner_outputs(
  699. self,
  700. input_features: torch.FloatTensor,
  701. composer: str,
  702. generation_config: GenerationConfig,
  703. attention_mask: torch.FloatTensor | None = None,
  704. ):
  705. """
  706. This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
  707. control the type of MIDI token generated by the model.
  708. Args:
  709. input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  710. input features extracted from the feature extractor.
  711. composer (`str`):
  712. composer token which determines the type of MIDI tokens to be generated.
  713. generation_config (`~generation.GenerationConfig`):
  714. The generation is used to get the composer-feature_token pair.
  715. attention_mask (``, *optional*):
  716. For batched generation `input_features` are padded to have the same shape across all examples.
  717. `attention_mask` helps to determine which areas were padded and which were not.
  718. - 1 for tokens that are **not padded**,
  719. - 0 for tokens that are **padded**.
  720. """
  721. composer_to_feature_token = generation_config.composer_to_feature_token
  722. if composer not in composer_to_feature_token:
  723. raise ValueError(
  724. f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}"
  725. )
  726. composer_value = composer_to_feature_token[composer]
  727. composer_value = torch.tensor(composer_value, device=self.device)
  728. composer_value = composer_value.repeat(input_features.shape[0])
  729. embedding_offset = min(composer_to_feature_token.values())
  730. input_features = self.mel_conditioner(
  731. feature=input_features,
  732. index_value=composer_value,
  733. embedding_offset=embedding_offset,
  734. )
  735. if attention_mask is not None:
  736. input_features[~attention_mask[:, 0].bool()] = 0.0
  737. # since self.mel_conditioner adds a new array at the front of inputs_embeds we need to do the same for attention_mask to keep the shapes same
  738. attention_mask = torch.concatenate([attention_mask[:, 0].view(-1, 1), attention_mask], axis=1)
  739. return input_features, attention_mask
  740. return input_features, None
  741. @auto_docstring
  742. def forward(
  743. self,
  744. input_ids: torch.LongTensor | None = None,
  745. attention_mask: torch.FloatTensor | None = None,
  746. decoder_input_ids: torch.LongTensor | None = None,
  747. decoder_attention_mask: torch.BoolTensor | None = None,
  748. encoder_outputs: tuple[tuple[torch.Tensor]] | None = None,
  749. past_key_values: Cache | None = None,
  750. inputs_embeds: torch.FloatTensor | None = None,
  751. input_features: torch.FloatTensor | None = None,
  752. decoder_inputs_embeds: torch.FloatTensor | None = None,
  753. labels: torch.LongTensor | None = None,
  754. use_cache: bool | None = None,
  755. output_attentions: bool | None = None,
  756. output_hidden_states: bool | None = None,
  757. return_dict: bool | None = None,
  758. **kwargs,
  759. ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
  760. r"""
  761. input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
  762. Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings
  763. so you should be able to pad the inputs on both the right and the left. Indices can be obtained using
  764. [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
  765. [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining
  766. take a look a [Pop2Piano Training](./Pop2Piano#training).
  767. decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
  768. Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
  769. [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
  770. [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the
  771. starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
  772. `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
  773. decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
  774. Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
  775. be used by default.
  776. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
  777. Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
  778. config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
  779. labels in `[0, ..., config.vocab_size]`
  780. """
  781. use_cache = use_cache if use_cache is not None else self.config.use_cache
  782. return_dict = return_dict if return_dict is not None else self.config.return_dict
  783. if inputs_embeds is not None and input_features is not None:
  784. raise ValueError("Both `inputs_embeds` and `input_features` received! Please provide only one of them")
  785. elif input_features is not None and inputs_embeds is None:
  786. inputs_embeds = input_features
  787. # Encode if needed (training, first prediction pass)
  788. if encoder_outputs is None:
  789. # Convert encoder inputs in embeddings if needed
  790. encoder_outputs = self.encoder(
  791. input_ids=input_ids,
  792. attention_mask=attention_mask,
  793. inputs_embeds=inputs_embeds,
  794. output_attentions=output_attentions,
  795. output_hidden_states=output_hidden_states,
  796. return_dict=return_dict,
  797. )
  798. elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
  799. encoder_outputs = BaseModelOutput(
  800. last_hidden_state=encoder_outputs[0],
  801. hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
  802. attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
  803. )
  804. hidden_states = encoder_outputs[0]
  805. if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
  806. # get decoder inputs from shifting lm labels to the right
  807. decoder_input_ids = self._shift_right(labels)
  808. # Decode
  809. decoder_outputs = self.decoder(
  810. input_ids=decoder_input_ids,
  811. attention_mask=decoder_attention_mask,
  812. inputs_embeds=decoder_inputs_embeds,
  813. past_key_values=past_key_values,
  814. encoder_hidden_states=hidden_states,
  815. encoder_attention_mask=attention_mask,
  816. use_cache=use_cache,
  817. output_attentions=output_attentions,
  818. output_hidden_states=output_hidden_states,
  819. return_dict=return_dict,
  820. )
  821. sequence_output = decoder_outputs[0]
  822. if self.config.tie_word_embeddings:
  823. sequence_output = sequence_output * (self.model_dim**-0.5)
  824. lm_logits = self.lm_head(sequence_output)
  825. loss = None
  826. if labels is not None:
  827. loss_fct = CrossEntropyLoss(ignore_index=-100)
  828. loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
  829. if not return_dict:
  830. output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
  831. return ((loss,) + output) if loss is not None else output
  832. return Seq2SeqLMOutput(
  833. loss=loss,
  834. logits=lm_logits,
  835. past_key_values=decoder_outputs.past_key_values,
  836. decoder_hidden_states=decoder_outputs.hidden_states,
  837. decoder_attentions=decoder_outputs.attentions,
  838. cross_attentions=decoder_outputs.cross_attentions,
  839. encoder_last_hidden_state=encoder_outputs.last_hidden_state,
  840. encoder_hidden_states=encoder_outputs.hidden_states,
  841. encoder_attentions=encoder_outputs.attentions,
  842. )
  843. @torch.no_grad()
  844. def generate(
  845. self,
  846. input_features,
  847. attention_mask=None,
  848. composer="composer1",
  849. generation_config=None,
  850. **kwargs,
  851. ):
  852. """
  853. Generates token ids for midi outputs.
  854. <Tip warning={true}>
  855. Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
  856. model's default generation configuration. You can override any `generation_config` by passing the corresponding
  857. parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
  858. strategies and code examples, check out the [following guide](./generation_strategies).
  859. </Tip>
  860. Parameters:
  861. input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  862. This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
  863. attention_mask:
  864. For batched generation `input_features` are padded to have the same shape across all examples.
  865. `attention_mask` helps to determine which areas were padded and which were not.
  866. - 1 for tokens that are **not padded**,
  867. - 0 for tokens that are **padded**.
  868. composer (`str`, *optional*, defaults to `"composer1"`):
  869. This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
  870. `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
  871. `generation_config`. For an example please see
  872. https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
  873. generation_config (`~generation.GenerationConfig`, *optional*):
  874. The generation configuration to be used as base parametrization for the generation call. `**kwargs`
  875. passed to generate matching the attributes of `generation_config` will override them. If
  876. `generation_config` is not provided, the default will be used, which had the following loading
  877. priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
  878. configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
  879. default values, whose documentation should be checked to parameterize generation.
  880. kwargs:
  881. Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
  882. forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
  883. specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
  884. Return:
  885. [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
  886. or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
  887. Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
  888. [`~utils.ModelOutput`] types are:
  889. - [`~generation.GenerateEncoderDecoderOutput`],
  890. - [`~generation.GenerateBeamEncoderDecoderOutput`]
  891. """
  892. if generation_config is None:
  893. generation_config = self.generation_config
  894. generation_config.update(**kwargs)
  895. # check for composer_to_feature_token
  896. if not hasattr(generation_config, "composer_to_feature_token"):
  897. raise ValueError(
  898. "`composer_to_feature_token` was not found! Please refer to "
  899. "https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json"
  900. "and parse a dict like that."
  901. )
  902. if len(generation_config.composer_to_feature_token) != self.config.composer_vocab_size:
  903. raise ValueError(
  904. "config.composer_vocab_size must be same as the number of keys in "
  905. f"generation_config.composer_to_feature_token! "
  906. f"Found {self.config.composer_vocab_size} vs {len(generation_config.composer_to_feature_token)}."
  907. )
  908. # to control the variation of generated MIDI tokens we concatenate mel-conditioner tokens(which depends on composer_token)
  909. # at the front of input_features.
  910. input_features, attention_mask = self.get_mel_conditioner_outputs(
  911. input_features=input_features,
  912. attention_mask=attention_mask,
  913. composer=composer,
  914. generation_config=generation_config,
  915. )
  916. return super().generate(
  917. inputs=None,
  918. inputs_embeds=input_features,
  919. attention_mask=attention_mask,
  920. generation_config=generation_config,
  921. **kwargs,
  922. )
  923. def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
  924. return self._shift_right(labels)
# Names exported by a star-import of this module.
__all__ = ["Pop2PianoForConditionalGeneration", "Pop2PianoPreTrainedModel"]