modeling_outputs.py 106 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706
  1. # Copyright 2020 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from dataclasses import dataclass
  15. import torch
  16. from .cache_utils import Cache, EncoderDecoderCache
  17. from .utils import ModelOutput
  18. @dataclass
  19. class BaseModelOutput(ModelOutput):
  20. """
  21. Base class for model's outputs, with potential hidden states and attentions.
  22. Args:
  23. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  24. Sequence of hidden-states at the output of the last layer of the model.
  25. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  26. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  27. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  28. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  29. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  30. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  31. sequence_length)`.
  32. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  33. heads.
  34. """
  35. last_hidden_state: torch.FloatTensor | None = None
  36. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  37. attentions: tuple[torch.FloatTensor, ...] | None = None
  38. @dataclass
  39. class BaseModelOutputWithNoAttention(ModelOutput):
  40. """
  41. Base class for model's outputs, with potential hidden states.
  42. Args:
  43. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  44. Sequence of hidden-states at the output of the last layer of the model.
  45. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  46. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  47. one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  48. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  49. """
  50. last_hidden_state: torch.FloatTensor | None = None
  51. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  52. @dataclass
  53. class BaseModelOutputWithPooling(ModelOutput):
  54. """
  55. Base class for model's outputs that also contains a pooling of the last hidden states.
  56. Args:
  57. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  58. Sequence of hidden-states at the output of the last layer of the model.
  59. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  60. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  61. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  62. the classification token after processing through a linear layer and a tanh activation function. The linear
  63. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  64. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  65. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  66. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  67. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  68. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  69. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  70. sequence_length)`.
  71. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  72. heads.
  73. """
  74. last_hidden_state: torch.FloatTensor | None = None
  75. pooler_output: torch.FloatTensor | None = None
  76. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  77. attentions: tuple[torch.FloatTensor, ...] | None = None
  78. @dataclass
  79. class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
  80. """
  81. Base class for model's outputs that also contains a pooling of the last hidden states.
  82. Args:
  83. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  84. Sequence of hidden-states at the output of the last layer of the model.
  85. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  86. Last layer hidden-state after a pooling operation on the spatial dimensions.
  87. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  88. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  89. one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
  90. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  91. """
  92. last_hidden_state: torch.FloatTensor | None = None
  93. pooler_output: torch.FloatTensor | None = None
  94. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  95. @dataclass
  96. class BaseModelOutputWithPast(ModelOutput):
  97. """
  98. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  99. Args:
  100. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  101. Sequence of hidden-states at the output of the last layer of the model.
  102. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  103. hidden_size)` is output.
  104. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  105. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  106. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  107. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  108. input) to speed up sequential decoding.
  109. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  110. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  111. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  112. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  113. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  114. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  115. sequence_length)`.
  116. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  117. heads.
  118. """
  119. last_hidden_state: torch.FloatTensor | None = None
  120. past_key_values: Cache | None = None
  121. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  122. attentions: tuple[torch.FloatTensor, ...] | None = None
  123. @dataclass
  124. class BaseModelOutputWithCrossAttentions(ModelOutput):
  125. """
  126. Base class for model's outputs, with potential hidden states and attentions.
  127. Args:
  128. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  129. Sequence of hidden-states at the output of the last layer of the model.
  130. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  131. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  132. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  133. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  134. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  135. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  136. sequence_length)`.
  137. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  138. heads.
  139. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  140. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  141. sequence_length)`.
  142. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  143. weighted average in the cross-attention heads.
  144. """
  145. last_hidden_state: torch.FloatTensor | None = None
  146. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  147. attentions: tuple[torch.FloatTensor, ...] | None = None
  148. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  149. @dataclass
  150. class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
  151. """
  152. Base class for model's outputs that also contains a pooling of the last hidden states.
  153. Args:
  154. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  155. Sequence of hidden-states at the output of the last layer of the model.
  156. pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
  157. Last layer hidden-state of the first token of the sequence (classification token) after further processing
  158. through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
  159. the classification token after processing through a linear layer and a tanh activation function. The linear
  160. layer weights are trained from the next sentence prediction (classification) objective during pretraining.
  161. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  162. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  163. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  164. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  165. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  166. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  167. sequence_length)`.
  168. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  169. heads.
  170. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  171. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  172. sequence_length)`.
  173. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  174. weighted average in the cross-attention heads.
  175. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  176. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  177. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  178. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  179. input) to speed up sequential decoding.
  180. """
  181. last_hidden_state: torch.FloatTensor | None = None
  182. pooler_output: torch.FloatTensor | None = None
  183. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  184. past_key_values: Cache | None = None
  185. attentions: tuple[torch.FloatTensor, ...] | None = None
  186. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  187. @dataclass
  188. class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
  189. """
  190. Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
  191. Args:
  192. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  193. Sequence of hidden-states at the output of the last layer of the model.
  194. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  195. hidden_size)` is output.
  196. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  197. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  198. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  199. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  200. input) to speed up sequential decoding.
  201. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  202. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  203. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  204. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  205. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  206. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  207. sequence_length)`.
  208. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  209. heads.
  210. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
  211. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  212. sequence_length)`.
  213. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  214. weighted average in the cross-attention heads.
  215. """
  216. last_hidden_state: torch.FloatTensor | None = None
  217. past_key_values: Cache | None = None
  218. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  219. attentions: tuple[torch.FloatTensor, ...] | None = None
  220. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  221. @dataclass
  222. class MoECausalLMOutputWithPast(ModelOutput):
  223. """
  224. Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden
  225. states terms, to train a MoE model.
  226. Args:
  227. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  228. Language modeling loss (for next-token prediction).
  229. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  230. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  231. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  232. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  233. Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
  234. `past_key_values` input) to speed up sequential decoding.
  235. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  236. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  237. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  238. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  239. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  240. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  241. sequence_length)`.
  242. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  243. heads.
  244. z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
  245. z_loss for the sparse modules.
  246. aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
  247. aux_loss for the sparse modules.
  248. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
  249. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  250. Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
  251. modules.
  252. """
  253. loss: torch.FloatTensor | None = None
  254. logits: torch.FloatTensor | None = None
  255. past_key_values: Cache | None = None
  256. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  257. attentions: tuple[torch.FloatTensor, ...] | None = None
  258. z_loss: torch.FloatTensor | None = None
  259. aux_loss: torch.FloatTensor | None = None
  260. router_logits: tuple[torch.FloatTensor] | None = None
  261. @dataclass
  262. class MoEModelOutput(ModelOutput):
  263. """
  264. Base class for model's outputs, with potential hidden states and attentions.
  265. Args:
  266. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  267. Sequence of hidden-states at the output of the last layer of the model.
  268. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  269. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  270. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  271. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  272. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  273. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  274. sequence_length)`.
  275. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  276. heads.
  277. router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
  278. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  279. Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
  280. loss and the z_loss for Mixture of Experts models.
  281. """
  282. last_hidden_state: torch.FloatTensor | None = None
  283. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  284. attentions: tuple[torch.FloatTensor, ...] | None = None
  285. router_probs: tuple[torch.FloatTensor] | None = None
  286. router_logits: tuple[torch.FloatTensor] | None = None
  287. @dataclass
  288. class MoeModelOutputWithPast(ModelOutput):
  289. """
  290. Base class for model's outputs, with potential hidden states and attentions.
  291. Args:
  292. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  293. Sequence of hidden-states at the output of the last layer of the model.
  294. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  295. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  296. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
  297. `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
  298. input) to speed up sequential decoding.
  299. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  300. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  301. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  302. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  303. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  304. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  305. sequence_length)`.
  306. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  307. heads.
  308. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
  309. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  310. Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
  311. loss for Mixture of Experts models.
  312. """
  313. last_hidden_state: torch.FloatTensor | None = None
  314. past_key_values: Cache | None = None
  315. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  316. attentions: tuple[torch.FloatTensor, ...] | None = None
  317. router_logits: tuple[torch.FloatTensor] | None = None
  318. @dataclass
  319. class MoeCausalLMOutputWithPast(ModelOutput):
  320. """
  321. Base class for causal language model (or autoregressive) with mixture of experts outputs.
  322. Args:
  323. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  324. Language modeling loss (for next-token prediction).
  325. logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
  326. Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
  327. aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
  328. aux_loss for the sparse modules.
  329. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
  330. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
  331. Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
  332. loss for Mixture of Experts models.
  333. past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  334. It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  335. Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
  336. `past_key_values` input) to speed up sequential decoding.
  337. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  338. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  339. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  340. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
  341. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  342. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  343. sequence_length)`.
  344. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
  345. heads.
  346. """
  347. loss: torch.FloatTensor | None = None
  348. aux_loss: torch.FloatTensor | None = None
  349. logits: torch.FloatTensor | None = None
  350. past_key_values: Cache | None = None
  351. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  352. attentions: tuple[torch.FloatTensor, ...] | None = None
  353. router_logits: tuple[torch.FloatTensor] | None = None
  @dataclass
  class MoEModelOutputWithPastAndCrossAttentions(ModelOutput):
      """
      Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding) as
      well as the Mixture of Experts router terms (probabilities and logits) used to train a MoE model.

      Args:
          last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
              Sequence of hidden-states at the output of the last layer of the model.

              If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
              hidden_size)` is output.
          past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

              Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
              `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
              input) to speed up sequential decoding.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
              heads.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
              weighted average in the cross-attention heads.
          router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

              Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
              loss and the z_loss for Mixture of Experts models.
          router_logits (`tuple(torch.FloatTensor)`, *optional*):
              Tuple of `torch.FloatTensor` holding the router logits computed by the MoE routers.

              By analogy with `router_probs`, this is expected to hold one tensor per layer of shape `(batch_size,
              sequence_length, num_experts)`; confirm against the models that populate this field.
      """

      last_hidden_state: torch.FloatTensor | None = None
      past_key_values: Cache | None = None
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
      # One tensor per layer, so these are variable-length tuples like the other per-layer fields above.
      router_probs: tuple[torch.FloatTensor, ...] | None = None
      router_logits: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class Seq2SeqModelOutput(ModelOutput):
      """
      Base class for the outputs of an encoder-decoder model. Besides the decoder and encoder states, it also carries
      the pre-computed hidden states (`past_key_values`) that can speed up sequential decoding.

      Args:
          last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
              Hidden-states produced by the last layer of the model's decoder.

              When `past_key_values` is used, only the last hidden-state of the sequences is output, with shape
              `(batch_size, 1, hidden_size)`.
          past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              A [`~cache_utils.EncoderDecoderCache`] instance; see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache) for more details.

              It stores pre-computed hidden-states (key and values in the self-attention blocks and in the
              cross-attention blocks) that can be passed back as the `past_key_values` input to speed up sequential
              decoding.
          decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Decoder attention weights taken after the attention softmax; they are used to compute the weighted
              average in the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights of the decoder's cross-attention layer taken after the attention softmax; they are
              used to compute the weighted average in the cross-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
          encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
              Hidden-states produced by the last layer of the model's encoder.
          encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Encoder attention weights taken after the attention softmax; they are used to compute the weighted
              average in the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      # Decoder-side outputs.
      last_hidden_state: torch.FloatTensor | None = None
      past_key_values: EncoderDecoderCache | None = None
      decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
      # Encoder-side outputs.
      encoder_last_hidden_state: torch.FloatTensor | None = None
      encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class Seq2SeqMoEModelOutput(ModelOutput):
      """
      Base class for the outputs of an encoder-decoder Mixture of Experts model. Besides the decoder and encoder
      states and router logits, it also carries the pre-computed hidden states (`past_key_values`) that can speed up
      sequential decoding.

      Args:
          last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
              Sequence of hidden-states at the output of the last layer of the decoder of the model.

              If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
              hidden_size)` is output.
          past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

              Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
              blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
          decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
          decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
              self-attention heads.
          decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

              Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
              weighted average in the cross-attention heads.
          encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
              Sequence of hidden-states at the output of the last layer of the encoder of the model.
          encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
          encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
              self-attention heads.
          encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

              Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
              modules.
      """

      last_hidden_state: torch.FloatTensor | None = None
      past_key_values: EncoderDecoderCache | None = None
      decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      # One tensor per layer, so a variable-length tuple like the other per-layer fields.
      decoder_router_logits: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
      encoder_last_hidden_state: torch.FloatTensor | None = None
      encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      # One tensor per layer, so a variable-length tuple like the other per-layer fields.
      encoder_router_logits: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class CausalLMOutput(ModelOutput):
      """
      Output container for causal (i.e. autoregressive) language models.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the language modeling objective (next-token prediction).
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Scores predicted by the language modeling head: one score per vocabulary token, before SoftMax.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights taken after the attention softmax; they are used to compute the weighted average in
              the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Per-layer diagnostics.
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class CausalLMOutputWithPast(ModelOutput):
      """
      Output container for causal (i.e. autoregressive) language models that can also return a key/value cache.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the language modeling objective (next-token prediction).
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Scores predicted by the language modeling head: one score per vocabulary token, before SoftMax.
          past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              A [`~cache_utils.Cache`] instance; see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache) for more details.

              It stores pre-computed hidden-states (key and values in the self-attention blocks) that can be passed
              back as the `past_key_values` input to speed up sequential decoding.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights taken after the attention softmax; they are used to compute the weighted average in
              the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Cache that can be fed back as the `past_key_values` input.
      past_key_values: Cache | None = None
      # Per-layer diagnostics.
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class CausalLMOutputWithCrossAttentions(ModelOutput):
      """
      Output container for causal (i.e. autoregressive) language models that can also return a key/value cache and
      cross-attention weights.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the language modeling objective (next-token prediction).
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Scores predicted by the language modeling head: one score per vocabulary token, before SoftMax.
          past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              A [`~cache_utils.Cache`] instance; see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache) for more details.

              It stores pre-computed hidden-states (key and values in the attention blocks) that can be passed back
              as the `past_key_values` input to speed up sequential decoding.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights taken after the attention softmax; they are used to compute the weighted average in
              the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Cross-attention weights taken after the attention softmax; they are used to compute the weighted
              average in the cross-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Cache that can be fed back as the `past_key_values` input.
      past_key_values: Cache | None = None
      # Per-layer diagnostics.
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class SequenceClassifierOutputWithPast(ModelOutput):
      """
      Output container for sentence classification models that can also return a key/value cache.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the classification objective (a regression loss if config.num_labels==1).
          logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
              Scores of the classification head, before SoftMax (regression scores if config.num_labels==1).
          past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              A [`~cache_utils.Cache`] instance; see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache) for more details.

              It stores pre-computed hidden-states (key and values in the self-attention blocks) that can be passed
              back as the `past_key_values` input to speed up sequential decoding.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights taken after the attention softmax; they are used to compute the weighted average in
              the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Cache that can be fed back as the `past_key_values` input.
      past_key_values: Cache | None = None
      # Per-layer diagnostics.
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class MaskedLMOutput(ModelOutput):
      """
      Output container for masked language models.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the masked language modeling (MLM) objective.
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Scores predicted by the language modeling head: one score per vocabulary token, before SoftMax.
          hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights taken after the attention softmax; they are used to compute the weighted average in
              the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Per-layer diagnostics.
      hidden_states: tuple[torch.FloatTensor, ...] | None = None
      attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class Seq2SeqLMOutput(ModelOutput):
      """
      Output container for sequence-to-sequence language models.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Loss of the language modeling objective.
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Scores predicted by the language modeling head: one score per vocabulary token, before SoftMax.
          past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              A [`~cache_utils.EncoderDecoderCache`] instance; see our
              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache) for more details.

              It stores pre-computed hidden-states (key and values in the self-attention blocks and in the
              cross-attention blocks) that can be passed back as the `past_key_values` input to speed up sequential
              decoding.
          decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Decoder attention weights taken after the attention softmax; they are used to compute the weighted
              average in the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Attention weights of the decoder's cross-attention layer taken after the attention softmax; they are
              used to compute the weighted average in the cross-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
          encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
              Hidden-states produced by the last layer of the model's encoder.
          encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.

              Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`: one for the output
              of the embeddings (if the model has an embedding layer) + one for the output of each layer.
          encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Encoder attention weights taken after the attention softmax; they are used to compute the weighted
              average in the self-attention heads.

              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      past_key_values: EncoderDecoderCache | None = None
      # Decoder-side outputs.
      decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
      # Encoder-side outputs.
      encoder_last_hidden_state: torch.FloatTensor | None = None
      encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  @dataclass
  class Seq2SeqMoEOutput(ModelOutput):
      """
      Base class for sequence-to-sequence language models outputs that also carry the Mixture of Experts router logits
      and the per-side z_loss / auxiliary loss terms.

      Args:
          loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
              Language modeling loss.
          logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
              Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
          encoder_z_loss (`torch.FloatTensor`, *optional*):
              z_loss for the sparse modules of the encoder. When it is populated is decided by the model filling this
              output and is not specified here.
          decoder_z_loss (`torch.FloatTensor`, *optional*):
              z_loss for the sparse modules of the decoder. When it is populated is decided by the model filling this
              output and is not specified here.
          encoder_aux_loss (`torch.FloatTensor`, *optional*):
              Auxiliary loss for the sparse modules of the encoder. When it is populated is decided by the model
              filling this output and is not specified here.
          decoder_aux_loss (`torch.FloatTensor`, *optional*):
              Auxiliary loss for the sparse modules of the decoder. When it is populated is decided by the model
              filling this output and is not specified here.
          past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
              It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

              Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
              blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
          decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
          decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
              self-attention heads.
          decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

              Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
          cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
              weighted average in the cross-attention heads.
          encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
              Sequence of hidden-states at the output of the last layer of the encoder of the model.
          encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
              Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
              one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

              Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
          encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
              sequence_length)`.

              Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
              self-attention heads.
          encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
              Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

              Router logits of the encoder model, useful to compute the auxiliary loss and z_loss for Mixture of Experts
              models.
      """

      loss: torch.FloatTensor | None = None
      logits: torch.FloatTensor | None = None
      # Per-side Mixture of Experts loss terms.
      encoder_z_loss: torch.FloatTensor | None = None
      decoder_z_loss: torch.FloatTensor | None = None
      encoder_aux_loss: torch.FloatTensor | None = None
      decoder_aux_loss: torch.FloatTensor | None = None
      past_key_values: EncoderDecoderCache | None = None
      decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      # One tensor per layer, so a variable-length tuple like the other per-layer fields.
      decoder_router_logits: tuple[torch.FloatTensor, ...] | None = None
      cross_attentions: tuple[torch.FloatTensor, ...] | None = None
      encoder_last_hidden_state: torch.FloatTensor | None = None
      encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
      encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
      # One tensor per layer, so a variable-length tuple like the other per-layer fields.
      encoder_router_logits: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class NextSentencePredictorOutput(ModelOutput):
    """
    Base class for outputs of models predicting if two sentences are consecutive or not.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided):
            Next sequence prediction (classification) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class SequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class Seq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    past_key_values: EncoderDecoderCache | None = None
    decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
    cross_attentions: tuple[torch.FloatTensor, ...] | None = None
    encoder_last_hidden_state: torch.FloatTensor | None = None
    encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class MultipleChoiceModelOutput(ModelOutput):
    """
    Base class for outputs of multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            Classification scores (before SoftMax).

            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class TokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class QuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    start_logits: torch.FloatTensor | None = None
    end_logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: torch.FloatTensor | None = None
    start_logits: torch.FloatTensor | None = None
    end_logits: torch.FloatTensor | None = None
    past_key_values: EncoderDecoderCache | None = None
    decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
    cross_attentions: tuple[torch.FloatTensor, ...] | None = None
    encoder_last_hidden_state: torch.FloatTensor | None = None
    encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class SemanticSegmenterOutput(ModelOutput):
    """
    Base class for outputs of semantic segmentation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class ImageClassifierOutput(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class ImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models that do not carry an `attentions` field.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
            called feature maps) of the model at the output of each stage.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class DepthEstimatorOutput(ModelOutput):
    """
    Base class for outputs of depth estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # NOTE(review): the docstring's `loss` description ("Classification (or regression if
    # config.num_labels==1) loss") matches the classifier outputs word for word and may have been
    # copied — confirm what the depth estimation loss actually is before relying on it.
    loss: torch.FloatTensor | None = None
    predicted_depth: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class ImageSuperResolutionOutput(ModelOutput):
    """
    Base class for outputs of image super resolution models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Reconstruction loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed images, possibly upscaled.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    reconstruction: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class Wav2Vec2BaseModelOutput(ModelOutput):
    """
    Base class for models that have been trained with the Wav2Vec2 loss objective.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
            Sequence of extracted feature vectors of the last convolutional layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor | None = None
    extract_features: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class XVectorOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2ForXVector`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
            Classification hidden states before AMSoftmax.
        embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`):
            Utterance embeddings used for vector similarity-based retrieval.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    embeddings: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class BackboneOutput(ModelOutput):
    """
    Base class for outputs of backbones.

    Args:
        feature_maps (`tuple(torch.FloatTensor)` of shape `(batch_size, num_channels, height, width)`):
            Feature maps of the stages.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`,
            depending on the backbone.

            Hidden-states of the model at the output of each stage plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Only applicable if the backbone uses attention.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    feature_maps: tuple[torch.FloatTensor] | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
class BaseModelOutputWithPoolingAndProjection(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        projection_state (`tuple(torch.FloatTensor)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, config.project_dim)`.

            Text embeddings before the projection layer, used to mimic the last hidden state of the teacher encoder.
    """

    last_hidden_state: torch.FloatTensor | None = None
    pooler_output: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
    # NOTE(review): the docstring ties `projection_state` to `output_attentions=True`, the same
    # condition as `attentions`; this looks copied — confirm the real condition against the models
    # that populate this field.
    projection_state: tuple[torch.FloatTensor] | None = None
  1178. @dataclass
  1179. class Seq2SeqSpectrogramOutput(ModelOutput):
  1180. """
  1181. Base class for sequence-to-sequence spectrogram outputs.
  1182. Args:
  1183. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
  1184. Spectrogram generation loss.
  1185. spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
  1186. The predicted spectrogram.
  1187. past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  1188. It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  1189. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  1190. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  1191. decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1192. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1193. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1194. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  1195. decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1196. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1197. sequence_length)`.
  1198. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  1199. self-attention heads.
  1200. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1201. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1202. sequence_length)`.
  1203. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  1204. weighted average in the cross-attention heads.
  1205. encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  1206. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  1207. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1208. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1209. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1210. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  1211. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1212. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1213. sequence_length)`.
  1214. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  1215. self-attention heads.
  1216. """
  1217. loss: torch.FloatTensor | None = None
  1218. spectrogram: torch.FloatTensor | None = None
  1219. past_key_values: EncoderDecoderCache | None = None
  1220. decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1221. decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1222. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  1223. encoder_last_hidden_state: torch.FloatTensor | None = None
  1224. encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1225. encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1226. @dataclass
  1227. class Seq2SeqTSModelOutput(ModelOutput):
  1228. """
  1229. Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up
  1230. sequential decoding.
  1231. Args:
  1232. last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
  1233. Sequence of hidden-states at the output of the last layer of the decoder of the model.
  1234. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
  1235. hidden_size)` is output.
  1236. past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  1237. It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  1238. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  1239. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  1240. decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1241. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1242. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1243. Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
  1244. decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1245. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1246. sequence_length)`.
  1247. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  1248. self-attention heads.
  1249. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1250. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1251. sequence_length)`.
  1252. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  1253. weighted average in the cross-attention heads.
  1254. encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  1255. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  1256. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1257. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1258. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1259. Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
  1260. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1261. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1262. sequence_length)`.
  1263. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  1264. self-attention heads.
  1265. loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1266. Shift values of each time series' context window which is used to give the model inputs of the same
  1267. magnitude and then used to shift back to the original magnitude.
  1268. scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1269. Scaling values of each time series' context window which is used to give the model inputs of the same
  1270. magnitude and then used to rescale back to the original magnitude.
  1271. static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
  1272. Static features of each time series' in a batch which are copied to the covariates at inference time.
  1273. """
  1274. last_hidden_state: torch.FloatTensor | None = None
  1275. past_key_values: EncoderDecoderCache | None = None
  1276. decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1277. decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1278. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  1279. encoder_last_hidden_state: torch.FloatTensor | None = None
  1280. encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1281. encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1282. loc: torch.FloatTensor | None = None
  1283. scale: torch.FloatTensor | None = None
  1284. static_features: torch.FloatTensor | None = None
  1285. @dataclass
  1286. class Seq2SeqTSPredictionOutput(ModelOutput):
  1287. """
  1288. Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the
  1289. chosen distribution.
  1290. Args:
  1291. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided):
  1292. Distributional loss.
  1293. params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`):
  1294. Parameters of the chosen distribution.
  1295. past_key_values (`EncoderDecoderCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
  1296. It is a [`~cache_utils.EncoderDecoderCache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
  1297. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
  1298. blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
  1299. decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1300. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1301. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1302. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
  1303. decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1304. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1305. sequence_length)`.
  1306. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
  1307. self-attention heads.
  1308. cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1309. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1310. sequence_length)`.
  1311. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
  1312. weighted average in the cross-attention heads.
  1313. encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  1314. Sequence of hidden-states at the output of the last layer of the encoder of the model.
  1315. encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
  1316. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1317. one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
  1318. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
  1319. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
  1320. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
  1321. sequence_length)`.
  1322. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
  1323. self-attention heads.
  1324. loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1325. Shift values of each time series' context window which is used to give the model inputs of the same
  1326. magnitude and then used to shift back to the original magnitude.
  1327. scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
  1328. Scaling values of each time series' context window which is used to give the model inputs of the same
  1329. magnitude and then used to rescale back to the original magnitude.
  1330. static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
  1331. Static features of each time series' in a batch which are copied to the covariates at inference time.
  1332. """
  1333. loss: torch.FloatTensor | None = None
  1334. params: tuple[torch.FloatTensor, ...] | None = None
  1335. past_key_values: EncoderDecoderCache | None = None
  1336. decoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1337. decoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1338. cross_attentions: tuple[torch.FloatTensor, ...] | None = None
  1339. encoder_last_hidden_state: torch.FloatTensor | None = None
  1340. encoder_hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1341. encoder_attentions: tuple[torch.FloatTensor, ...] | None = None
  1342. loc: torch.FloatTensor | None = None
  1343. scale: torch.FloatTensor | None = None
  1344. static_features: torch.FloatTensor | None = None
  1345. @dataclass
  1346. class SampleTSPredictionOutput(ModelOutput):
  1347. """
  1348. Base class for time series model's predictions outputs that contains the sampled values from the chosen
  1349. distribution.
  1350. Args:
  1351. sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`):
  1352. Sampled values from the chosen distribution.
  1353. """
  1354. sequences: torch.FloatTensor | None = None
  1355. @dataclass
  1356. class MaskedImageModelingOutput(ModelOutput):
  1357. """
  1358. Base class for outputs of masked image completion / in-painting models.
  1359. Args:
  1360. loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
  1361. Reconstruction loss.
  1362. reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
  1363. Reconstructed / completed images.
  1364. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
  1365. when `config.output_hidden_states=True`):
  1366. Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
  1367. one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
  1368. (also called feature maps) of the model at the output of each stage.
  1369. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
  1370. `config.output_attentions=True`):
  1371. Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
  1372. sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
  1373. the self-attention heads.
  1374. """
  1375. loss: torch.FloatTensor | None = None
  1376. reconstruction: torch.FloatTensor | None = None
  1377. hidden_states: tuple[torch.FloatTensor, ...] | None = None
  1378. attentions: tuple[torch.FloatTensor, ...] | None = None