modeling_roberta.py 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271
  1. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  2. # This file was automatically generated from src/transformers/models/roberta/modular_roberta.py.
  3. # Do NOT edit this file manually as any edits will be overwritten by the generation of
  4. # the file from the modular. If any change should be done, please apply the change to the
  5. # modular_roberta.py file directly. One of our CI enforces this.
  6. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
  7. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  8. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
  9. #
  10. # Licensed under the Apache License, Version 2.0 (the "License");
  11. # you may not use this file except in compliance with the License.
  12. # You may obtain a copy of the License at
  13. #
  14. # http://www.apache.org/licenses/LICENSE-2.0
  15. #
  16. # Unless required by applicable law or agreed to in writing, software
  17. # distributed under the License is distributed on an "AS IS" BASIS,
  18. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19. # See the License for the specific language governing permissions and
  20. # limitations under the License.
  21. from collections.abc import Callable
  22. import torch
  23. import torch.nn as nn
  24. from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
  25. from ... import initialization as init
  26. from ...activations import ACT2FN, gelu
  27. from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
  28. from ...generation import GenerationMixin
  29. from ...masking_utils import create_bidirectional_mask, create_causal_mask
  30. from ...modeling_layers import GradientCheckpointingLayer
  31. from ...modeling_outputs import (
  32. BaseModelOutputWithPastAndCrossAttentions,
  33. BaseModelOutputWithPoolingAndCrossAttentions,
  34. CausalLMOutputWithCrossAttentions,
  35. MaskedLMOutput,
  36. MultipleChoiceModelOutput,
  37. QuestionAnsweringModelOutput,
  38. SequenceClassifierOutput,
  39. TokenClassifierOutput,
  40. )
  41. from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  42. from ...processing_utils import Unpack
  43. from ...pytorch_utils import apply_chunking_to_forward
  44. from ...utils import TransformersKwargs, auto_docstring, logging
  45. from ...utils.generic import can_return_tuple, merge_with_config_defaults
  46. from ...utils.output_capturing import capture_outputs
  47. from .configuration_roberta import RobertaConfig
  48. logger = logging.get_logger(__name__)
  49. class RobertaEmbeddings(nn.Module):
  50. """Construct the embeddings from word, position and token_type embeddings."""
  51. def __init__(self, config):
  52. super().__init__()
  53. self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
  54. self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
  55. self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
  56. self.dropout = nn.Dropout(config.hidden_dropout_prob)
  57. # position_ids (1, len position emb) is contiguous in memory and exported when serialized
  58. self.register_buffer(
  59. "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
  60. )
  61. self.register_buffer(
  62. "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
  63. )
  64. self.padding_idx = config.pad_token_id
  65. self.position_embeddings = nn.Embedding(
  66. config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
  67. )
  68. def forward(
  69. self,
  70. input_ids: torch.LongTensor | None = None,
  71. token_type_ids: torch.LongTensor | None = None,
  72. position_ids: torch.LongTensor | None = None,
  73. inputs_embeds: torch.FloatTensor | None = None,
  74. past_key_values_length: int = 0,
  75. ) -> torch.Tensor:
  76. if position_ids is None:
  77. if input_ids is not None:
  78. # Create the position ids from the input token ids. Any padded tokens remain padded.
  79. position_ids = self.create_position_ids_from_input_ids(
  80. input_ids, self.padding_idx, past_key_values_length
  81. )
  82. else:
  83. position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
  84. if input_ids is not None:
  85. input_shape = input_ids.size()
  86. else:
  87. input_shape = inputs_embeds.size()[:-1]
  88. batch_size, seq_length = input_shape
  89. # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
  90. # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
  91. # issue #5664
  92. if token_type_ids is None:
  93. if hasattr(self, "token_type_ids"):
  94. # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
  95. buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
  96. buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
  97. token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
  98. else:
  99. token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
  100. if inputs_embeds is None:
  101. inputs_embeds = self.word_embeddings(input_ids)
  102. token_type_embeddings = self.token_type_embeddings(token_type_ids)
  103. embeddings = inputs_embeds + token_type_embeddings
  104. position_embeddings = self.position_embeddings(position_ids)
  105. embeddings = embeddings + position_embeddings
  106. embeddings = self.LayerNorm(embeddings)
  107. embeddings = self.dropout(embeddings)
  108. return embeddings
  109. @staticmethod
  110. def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
  111. """
  112. We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
  113. Args:
  114. inputs_embeds: torch.Tensor
  115. Returns: torch.Tensor
  116. """
  117. input_shape = inputs_embeds.size()[:-1]
  118. sequence_length = input_shape[1]
  119. position_ids = torch.arange(
  120. padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
  121. )
  122. return position_ids.unsqueeze(0).expand(input_shape)
  123. @staticmethod
  124. def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
  125. """
  126. Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
  127. are ignored. This is modified from fairseq's `utils.make_positions`.
  128. Args:
  129. x: torch.Tensor x:
  130. Returns: torch.Tensor
  131. """
  132. # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
  133. mask = input_ids.ne(padding_idx).int()
  134. incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
  135. return incremental_indices.long() + padding_idx
  136. def eager_attention_forward(
  137. module: nn.Module,
  138. query: torch.Tensor,
  139. key: torch.Tensor,
  140. value: torch.Tensor,
  141. attention_mask: torch.Tensor | None,
  142. scaling: float | None = None,
  143. dropout: float = 0.0,
  144. **kwargs: Unpack[TransformersKwargs],
  145. ):
  146. if scaling is None:
  147. scaling = query.size(-1) ** -0.5
  148. # Take the dot product between "query" and "key" to get the raw attention scores.
  149. attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
  150. if attention_mask is not None:
  151. attn_weights = attn_weights + attention_mask
  152. attn_weights = nn.functional.softmax(attn_weights, dim=-1)
  153. attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
  154. attn_output = torch.matmul(attn_weights, value)
  155. attn_output = attn_output.transpose(1, 2).contiguous()
  156. return attn_output, attn_weights
  157. class RobertaSelfAttention(nn.Module):
  158. def __init__(self, config, is_causal=False, layer_idx=None):
  159. super().__init__()
  160. if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
  161. raise ValueError(
  162. f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
  163. f"heads ({config.num_attention_heads})"
  164. )
  165. self.config = config
  166. self.num_attention_heads = config.num_attention_heads
  167. self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
  168. self.all_head_size = self.num_attention_heads * self.attention_head_size
  169. self.scaling = self.attention_head_size**-0.5
  170. self.query = nn.Linear(config.hidden_size, self.all_head_size)
  171. self.key = nn.Linear(config.hidden_size, self.all_head_size)
  172. self.value = nn.Linear(config.hidden_size, self.all_head_size)
  173. self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
  174. self.is_decoder = config.is_decoder
  175. self.is_causal = is_causal
  176. self.layer_idx = layer_idx
  177. def forward(
  178. self,
  179. hidden_states: torch.Tensor,
  180. attention_mask: torch.FloatTensor | None = None,
  181. past_key_values: Cache | None = None,
  182. **kwargs: Unpack[TransformersKwargs],
  183. ) -> tuple[torch.Tensor]:
  184. input_shape = hidden_states.shape[:-1]
  185. hidden_shape = (*input_shape, -1, self.attention_head_size)
  186. # get all proj
  187. query_layer = self.query(hidden_states).view(*hidden_shape).transpose(1, 2)
  188. key_layer = self.key(hidden_states).view(*hidden_shape).transpose(1, 2)
  189. value_layer = self.value(hidden_states).view(*hidden_shape).transpose(1, 2)
  190. if past_key_values is not None:
  191. # decoder-only roberta can have a simple dynamic cache for example
  192. current_past_key_values = past_key_values
  193. if isinstance(past_key_values, EncoderDecoderCache):
  194. current_past_key_values = past_key_values.self_attention_cache
  195. # save all key/value_layer to cache to be re-used for fast auto-regressive generation
  196. key_layer, value_layer = current_past_key_values.update(key_layer, value_layer, self.layer_idx)
  197. attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
  198. self.config._attn_implementation, eager_attention_forward
  199. )
  200. attn_output, attn_weights = attention_interface(
  201. self,
  202. query_layer,
  203. key_layer,
  204. value_layer,
  205. attention_mask,
  206. dropout=0.0 if not self.training else self.dropout.p,
  207. scaling=self.scaling,
  208. **kwargs,
  209. )
  210. attn_output = attn_output.reshape(*input_shape, -1).contiguous()
  211. return attn_output, attn_weights
class RobertaCrossAttention(nn.Module):
    """
    Multi-head attention whose queries are projected from `hidden_states` and whose keys/values are
    projected from `encoder_hidden_states`, or read back from the cross-attention cache once it has
    been filled for this layer.
    """

    def __init__(self, config, is_causal=False, layer_idx=None):
        super().__init__()
        # The divisibility check is skipped when the config exposes an `embedding_size` attribute.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.config = config

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Score multiplier passed to the attention implementation: 1 / sqrt(head size).
        self.scaling = self.attention_head_size**-0.5

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.is_causal = is_causal
        # Index used to address this layer's slot in the cache.
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.FloatTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        past_key_values: EncoderDecoderCache | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor]:
        """
        Attend from `hidden_states` to the encoder states.

        Args:
            hidden_states: source of the queries.
            encoder_hidden_states: source of the keys/values; read only when the cache has not yet
                been filled for this layer (or no cache is given).
            attention_mask: mask handed unchanged to the attention implementation.
            past_key_values: optional cache holding a `cross_attention_cache` and an `is_updated`
                mapping keyed by layer index.
            **kwargs: forwarded to the attention implementation.

        Returns:
            `(attn_output, attn_weights)` as produced by the selected attention implementation,
            with the output reshaped back to the leading dimensions of `hidden_states`.
        """
        # determine input shapes
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        # get query proj
        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)

        # Falsy when there is no cache or when this layer has not been marked as filled.
        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
        if past_key_values is not None and is_updated:
            # reuse k,v, cross_attentions
            key_layer = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
            value_layer = past_key_values.cross_attention_cache.layers[self.layer_idx].values
        else:
            # Keys/values take their leading dimensions from the encoder states, not from the queries.
            kv_shape = (*encoder_hidden_states.shape[:-1], -1, self.attention_head_size)
            key_layer = self.key(encoder_hidden_states).view(kv_shape).transpose(1, 2)
            value_layer = self.value(encoder_hidden_states).view(kv_shape).transpose(1, 2)

            if past_key_values is not None:
                # save all states to the cache
                key_layer, value_layer = past_key_values.cross_attention_cache.update(
                    key_layer, value_layer, self.layer_idx
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                past_key_values.is_updated[self.layer_idx] = True

        # Look up the configured attention implementation, with the eager one as the fallback argument.
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )
        attn_output, attn_weights = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            attention_mask,
            # Dropout probability is only passed on while training.
            dropout=0.0 if not self.training else self.dropout.p,
            scaling=self.scaling,
            **kwargs,
        )
        # Merge the trailing dimensions back into a single feature dimension.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        return attn_output, attn_weights
  275. class RobertaSelfOutput(nn.Module):
  276. def __init__(self, config):
  277. super().__init__()
  278. self.dense = nn.Linear(config.hidden_size, config.hidden_size)
  279. self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
  280. self.dropout = nn.Dropout(config.hidden_dropout_prob)
  281. def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
  282. hidden_states = self.dense(hidden_states)
  283. hidden_states = self.dropout(hidden_states)
  284. hidden_states = self.LayerNorm(hidden_states + input_tensor)
  285. return hidden_states
  286. class RobertaAttention(nn.Module):
  287. def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False):
  288. super().__init__()
  289. self.is_cross_attention = is_cross_attention
  290. attention_class = RobertaCrossAttention if is_cross_attention else RobertaSelfAttention
  291. self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx)
  292. self.output = RobertaSelfOutput(config)
  293. def forward(
  294. self,
  295. hidden_states: torch.Tensor,
  296. attention_mask: torch.FloatTensor | None = None,
  297. encoder_hidden_states: torch.FloatTensor | None = None,
  298. encoder_attention_mask: torch.FloatTensor | None = None,
  299. past_key_values: Cache | None = None,
  300. **kwargs: Unpack[TransformersKwargs],
  301. ) -> tuple[torch.Tensor]:
  302. attention_mask = attention_mask if not self.is_cross_attention else encoder_attention_mask
  303. attention_output, attn_weights = self.self(
  304. hidden_states,
  305. encoder_hidden_states=encoder_hidden_states,
  306. attention_mask=attention_mask,
  307. past_key_values=past_key_values,
  308. **kwargs,
  309. )
  310. attention_output = self.output(attention_output, hidden_states)
  311. return attention_output, attn_weights
  312. class RobertaIntermediate(nn.Module):
  313. def __init__(self, config):
  314. super().__init__()
  315. self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
  316. if isinstance(config.hidden_act, str):
  317. self.intermediate_act_fn = ACT2FN[config.hidden_act]
  318. else:
  319. self.intermediate_act_fn = config.hidden_act
  320. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  321. hidden_states = self.dense(hidden_states)
  322. hidden_states = self.intermediate_act_fn(hidden_states)
  323. return hidden_states
  324. class RobertaOutput(nn.Module):
  325. def __init__(self, config):
  326. super().__init__()
  327. self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
  328. self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
  329. self.dropout = nn.Dropout(config.hidden_dropout_prob)
  330. def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
  331. hidden_states = self.dense(hidden_states)
  332. hidden_states = self.dropout(hidden_states)
  333. hidden_states = self.LayerNorm(hidden_states + input_tensor)
  334. return hidden_states
  335. class RobertaLayer(GradientCheckpointingLayer):
  336. def __init__(self, config, layer_idx=None):
  337. super().__init__()
  338. self.chunk_size_feed_forward = config.chunk_size_feed_forward
  339. self.seq_len_dim = 1
  340. self.attention = RobertaAttention(config, is_causal=config.is_decoder, layer_idx=layer_idx)
  341. self.is_decoder = config.is_decoder
  342. self.add_cross_attention = config.add_cross_attention
  343. if self.add_cross_attention:
  344. if not self.is_decoder:
  345. raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
  346. self.crossattention = RobertaAttention(
  347. config,
  348. is_causal=False,
  349. layer_idx=layer_idx,
  350. is_cross_attention=True,
  351. )
  352. self.intermediate = RobertaIntermediate(config)
  353. self.output = RobertaOutput(config)
  354. def forward(
  355. self,
  356. hidden_states: torch.Tensor,
  357. attention_mask: torch.FloatTensor | None = None,
  358. encoder_hidden_states: torch.FloatTensor | None = None,
  359. encoder_attention_mask: torch.FloatTensor | None = None,
  360. past_key_values: Cache | None = None,
  361. **kwargs: Unpack[TransformersKwargs],
  362. ) -> torch.Tensor:
  363. self_attention_output, _ = self.attention(
  364. hidden_states,
  365. attention_mask,
  366. past_key_values=past_key_values,
  367. **kwargs,
  368. )
  369. attention_output = self_attention_output
  370. if self.is_decoder and encoder_hidden_states is not None:
  371. if not hasattr(self, "crossattention"):
  372. raise ValueError(
  373. f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
  374. " by setting `config.add_cross_attention=True`"
  375. )
  376. cross_attention_output, _ = self.crossattention(
  377. self_attention_output,
  378. None, # attention_mask
  379. encoder_hidden_states,
  380. encoder_attention_mask,
  381. past_key_values=past_key_values,
  382. **kwargs,
  383. )
  384. attention_output = cross_attention_output
  385. layer_output = apply_chunking_to_forward(
  386. self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
  387. )
  388. return layer_output
  389. def feed_forward_chunk(self, attention_output):
  390. intermediate_output = self.intermediate(attention_output)
  391. layer_output = self.output(intermediate_output, attention_output)
  392. return layer_output
  393. @auto_docstring
  394. class RobertaPreTrainedModel(PreTrainedModel):
  395. config_class = RobertaConfig
  396. base_model_prefix = "roberta"
  397. supports_gradient_checkpointing = True
  398. _supports_flash_attn = True
  399. _supports_sdpa = True
  400. _supports_flex_attn = True
  401. _supports_attention_backend = True
  402. _can_record_outputs = {
  403. "hidden_states": RobertaLayer,
  404. "attentions": RobertaSelfAttention,
  405. "cross_attentions": RobertaCrossAttention,
  406. }
  407. @torch.no_grad()
  408. def _init_weights(self, module):
  409. """Initialize the weights"""
  410. super()._init_weights(module)
  411. if isinstance(module, RobertaLMHead):
  412. init.zeros_(module.bias)
  413. elif isinstance(module, RobertaEmbeddings):
  414. init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
  415. init.zeros_(module.token_type_ids)
  416. class RobertaEncoder(nn.Module):
  417. def __init__(self, config):
  418. super().__init__()
  419. self.config = config
  420. self.layer = nn.ModuleList([RobertaLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
  421. def forward(
  422. self,
  423. hidden_states: torch.Tensor,
  424. attention_mask: torch.FloatTensor | None = None,
  425. encoder_hidden_states: torch.FloatTensor | None = None,
  426. encoder_attention_mask: torch.FloatTensor | None = None,
  427. past_key_values: Cache | None = None,
  428. use_cache: bool | None = None,
  429. **kwargs: Unpack[TransformersKwargs],
  430. ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions:
  431. for i, layer_module in enumerate(self.layer):
  432. hidden_states = layer_module(
  433. hidden_states,
  434. attention_mask,
  435. encoder_hidden_states, # as a positional argument for gradient checkpointing
  436. encoder_attention_mask=encoder_attention_mask,
  437. past_key_values=past_key_values,
  438. **kwargs,
  439. )
  440. return BaseModelOutputWithPastAndCrossAttentions(
  441. last_hidden_state=hidden_states,
  442. past_key_values=past_key_values if use_cache else None,
  443. )
  444. class RobertaPooler(nn.Module):
  445. def __init__(self, config):
  446. super().__init__()
  447. self.dense = nn.Linear(config.hidden_size, config.hidden_size)
  448. self.activation = nn.Tanh()
  449. def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
  450. # We "pool" the model by simply taking the hidden state corresponding
  451. # to the first token.
  452. first_token_tensor = hidden_states[:, 0]
  453. pooled_output = self.dense(first_token_tensor)
  454. pooled_output = self.activation(pooled_output)
  455. return pooled_output
@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """
)
class RobertaModel(RobertaPreTrainedModel):
    # Module class names listed for the framework as not to be split (presumably across
    # devices -- confirm against `PreTrainedModel`).
    _no_split_modules = ["RobertaEmbeddings", "RobertaLayer"]

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.gradient_checkpointing = False

        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaEncoder(config)

        # `pooler` is None when the pooling layer is disabled; `forward` checks for that.
        self.pooler = RobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # The input embedding is the word-embedding table of the embeddings module.
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Replace the word-embedding table in place.
        self.embeddings.word_embeddings = value

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
        # Exactly one of `input_ids` / `inputs_embeds` must be given: the XOR is true both when
        # neither is passed and when both are passed.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        # Caching is only available on decoder configurations; otherwise it is forced off.
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        # Create a fresh cache when caching is requested and none was supplied. The
        # encoder-decoder variant is used when encoder states are passed or the config is
        # flagged as encoder-decoder.
        if use_cache and past_key_values is None:
            past_key_values = (
                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
                if encoder_hidden_states is not None or self.config.is_encoder_decoder
                else DynamicCache(config=self.config)
            )

        # Length already held in the cache; used to offset position ids derived from `input_ids`.
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        # Convert the user-supplied masks into the form the attention layers consume.
        attention_mask, encoder_attention_mask = self._create_attention_masks(
            attention_mask=attention_mask,
            encoder_attention_mask=encoder_attention_mask,
            embedding_output=embedding_output,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
        )

        # `position_ids` is not a named encoder parameter, so it travels on via the encoder's **kwargs.
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        # No pooled output when the model was built with `add_pooling_layer=False`.
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
        )

    def _create_attention_masks(
        self,
        attention_mask,
        encoder_attention_mask,
        embedding_output,
        encoder_hidden_states,
        past_key_values,
    ):
        # Build the self-attention mask (causal for decoder configs, bidirectional otherwise) and,
        # when an encoder mask is given, the bidirectional cross-attention mask.
        # Returns the pair (attention_mask, encoder_attention_mask).
        if self.config.is_decoder:
            attention_mask = create_causal_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
            )
        else:
            attention_mask = create_bidirectional_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=attention_mask,
            )

        # A missing encoder mask is passed through as None.
        if encoder_attention_mask is not None:
            encoder_attention_mask = create_bidirectional_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=encoder_attention_mask,
                encoder_hidden_states=encoder_hidden_states,
            )

        return attention_mask, encoder_attention_mask
  575. @auto_docstring(
  576. custom_intro="""
  577. RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
  578. """
  579. )
  580. class RobertaForCausalLM(RobertaPreTrainedModel, GenerationMixin):
  581. _tied_weights_keys = {
  582. "lm_head.decoder.weight": "roberta.embeddings.word_embeddings.weight",
  583. "lm_head.decoder.bias": "lm_head.bias",
  584. }
  585. def __init__(self, config):
  586. super().__init__(config)
  587. if not config.is_decoder:
  588. logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
  589. self.roberta = RobertaModel(config, add_pooling_layer=False)
  590. self.lm_head = RobertaLMHead(config)
  591. # Initialize weights and apply final processing
  592. self.post_init()
  593. def get_output_embeddings(self):
  594. return self.lm_head.decoder
  595. def set_output_embeddings(self, new_embeddings):
  596. self.lm_head.decoder = new_embeddings
  597. @can_return_tuple
  598. @auto_docstring
  599. def forward(
  600. self,
  601. input_ids: torch.LongTensor | None = None,
  602. attention_mask: torch.FloatTensor | None = None,
  603. token_type_ids: torch.LongTensor | None = None,
  604. position_ids: torch.LongTensor | None = None,
  605. inputs_embeds: torch.FloatTensor | None = None,
  606. encoder_hidden_states: torch.FloatTensor | None = None,
  607. encoder_attention_mask: torch.FloatTensor | None = None,
  608. labels: torch.LongTensor | None = None,
  609. past_key_values: tuple[tuple[torch.FloatTensor]] | None = None,
  610. use_cache: bool | None = None,
  611. logits_to_keep: int | torch.Tensor = 0,
  612. **kwargs: Unpack[TransformersKwargs],
  613. ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
  614. r"""
  615. token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  616. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  617. - 0 corresponds to a *sentence A* token,
  618. - 1 corresponds to a *sentence B* token.
  619. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  620. >= 2. All the value in this tensor should be always < type_vocab_size.
  621. [What are token type IDs?](../glossary#token-type-ids)
  622. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  623. Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
  624. `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
  625. ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
  626. Example:
  627. ```python
  628. >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
  629. >>> import torch
  630. >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
  631. >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
  632. >>> config.is_decoder = True
  633. >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
  634. >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
  635. >>> outputs = model(**inputs)
  636. >>> prediction_logits = outputs.logits
  637. ```"""
  638. if labels is not None:
  639. use_cache = False
  640. outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
  641. input_ids,
  642. attention_mask=attention_mask,
  643. token_type_ids=token_type_ids,
  644. position_ids=position_ids,
  645. inputs_embeds=inputs_embeds,
  646. encoder_hidden_states=encoder_hidden_states,
  647. encoder_attention_mask=encoder_attention_mask,
  648. past_key_values=past_key_values,
  649. use_cache=use_cache,
  650. return_dict=True,
  651. **kwargs,
  652. )
  653. hidden_states = outputs.last_hidden_state
  654. # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
  655. slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
  656. logits = self.lm_head(hidden_states[:, slice_indices, :])
  657. loss = None
  658. if labels is not None:
  659. loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
  660. return CausalLMOutputWithCrossAttentions(
  661. loss=loss,
  662. logits=logits,
  663. past_key_values=outputs.past_key_values,
  664. hidden_states=outputs.hidden_states,
  665. attentions=outputs.attentions,
  666. cross_attentions=outputs.cross_attentions,
  667. )
  668. @auto_docstring
  669. class RobertaForMaskedLM(RobertaPreTrainedModel):
  670. _tied_weights_keys = {
  671. "lm_head.decoder.weight": "roberta.embeddings.word_embeddings.weight",
  672. "lm_head.decoder.bias": "lm_head.bias",
  673. }
  674. def __init__(self, config):
  675. super().__init__(config)
  676. if config.is_decoder:
  677. logger.warning(
  678. "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
  679. "bi-directional self-attention."
  680. )
  681. self.roberta = RobertaModel(config, add_pooling_layer=False)
  682. self.lm_head = RobertaLMHead(config)
  683. # Initialize weights and apply final processing
  684. self.post_init()
  685. def get_output_embeddings(self):
  686. return self.lm_head.decoder
  687. def set_output_embeddings(self, new_embeddings):
  688. self.lm_head.decoder = new_embeddings
  689. @can_return_tuple
  690. @auto_docstring
  691. def forward(
  692. self,
  693. input_ids: torch.LongTensor | None = None,
  694. attention_mask: torch.FloatTensor | None = None,
  695. token_type_ids: torch.LongTensor | None = None,
  696. position_ids: torch.LongTensor | None = None,
  697. inputs_embeds: torch.FloatTensor | None = None,
  698. encoder_hidden_states: torch.FloatTensor | None = None,
  699. encoder_attention_mask: torch.FloatTensor | None = None,
  700. labels: torch.LongTensor | None = None,
  701. **kwargs: Unpack[TransformersKwargs],
  702. ) -> tuple[torch.Tensor] | MaskedLMOutput:
  703. r"""
  704. token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  705. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  706. - 0 corresponds to a *sentence A* token,
  707. - 1 corresponds to a *sentence B* token.
  708. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  709. >= 2. All the value in this tensor should be always < type_vocab_size.
  710. [What are token type IDs?](../glossary#token-type-ids)
  711. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  712. Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
  713. config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
  714. loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
  715. """
  716. outputs = self.roberta(
  717. input_ids,
  718. attention_mask=attention_mask,
  719. token_type_ids=token_type_ids,
  720. position_ids=position_ids,
  721. inputs_embeds=inputs_embeds,
  722. encoder_hidden_states=encoder_hidden_states,
  723. encoder_attention_mask=encoder_attention_mask,
  724. return_dict=True,
  725. **kwargs,
  726. )
  727. sequence_output = outputs[0]
  728. prediction_scores = self.lm_head(sequence_output)
  729. masked_lm_loss = None
  730. if labels is not None:
  731. # move labels to correct device
  732. labels = labels.to(prediction_scores.device)
  733. loss_fct = CrossEntropyLoss()
  734. masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
  735. return MaskedLMOutput(
  736. loss=masked_lm_loss,
  737. logits=prediction_scores,
  738. hidden_states=outputs.hidden_states,
  739. attentions=outputs.attentions,
  740. )
  741. class RobertaLMHead(nn.Module):
  742. """Roberta Head for masked language modeling."""
  743. def __init__(self, config):
  744. super().__init__()
  745. self.dense = nn.Linear(config.hidden_size, config.hidden_size)
  746. self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
  747. self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
  748. self.bias = nn.Parameter(torch.zeros(config.vocab_size))
  749. def forward(self, features, **kwargs):
  750. x = self.dense(features)
  751. x = gelu(x)
  752. x = self.layer_norm(x)
  753. # project back to size of vocabulary with bias
  754. x = self.decoder(x)
  755. return x
  756. @auto_docstring(
  757. custom_intro="""
  758. RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
  759. pooled output) e.g. for GLUE tasks.
  760. """
  761. )
  762. class RobertaForSequenceClassification(RobertaPreTrainedModel):
  763. def __init__(self, config):
  764. super().__init__(config)
  765. self.num_labels = config.num_labels
  766. self.config = config
  767. self.roberta = RobertaModel(config, add_pooling_layer=False)
  768. self.classifier = RobertaClassificationHead(config)
  769. # Initialize weights and apply final processing
  770. self.post_init()
  771. @can_return_tuple
  772. @auto_docstring
  773. def forward(
  774. self,
  775. input_ids: torch.LongTensor | None = None,
  776. attention_mask: torch.FloatTensor | None = None,
  777. token_type_ids: torch.LongTensor | None = None,
  778. position_ids: torch.LongTensor | None = None,
  779. inputs_embeds: torch.FloatTensor | None = None,
  780. labels: torch.LongTensor | None = None,
  781. **kwargs: Unpack[TransformersKwargs],
  782. ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
  783. r"""
  784. token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  785. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  786. - 0 corresponds to a *sentence A* token,
  787. - 1 corresponds to a *sentence B* token.
  788. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  789. >= 2. All the value in this tensor should be always < type_vocab_size.
  790. [What are token type IDs?](../glossary#token-type-ids)
  791. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
  792. Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
  793. config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
  794. `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
  795. """
  796. outputs = self.roberta(
  797. input_ids,
  798. attention_mask=attention_mask,
  799. token_type_ids=token_type_ids,
  800. position_ids=position_ids,
  801. inputs_embeds=inputs_embeds,
  802. return_dict=True,
  803. **kwargs,
  804. )
  805. sequence_output = outputs[0]
  806. logits = self.classifier(sequence_output)
  807. loss = None
  808. if labels is not None:
  809. # move labels to correct device
  810. labels = labels.to(logits.device)
  811. if self.config.problem_type is None:
  812. if self.num_labels == 1:
  813. self.config.problem_type = "regression"
  814. elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
  815. self.config.problem_type = "single_label_classification"
  816. else:
  817. self.config.problem_type = "multi_label_classification"
  818. if self.config.problem_type == "regression":
  819. loss_fct = MSELoss()
  820. if self.num_labels == 1:
  821. loss = loss_fct(logits.squeeze(), labels.squeeze())
  822. else:
  823. loss = loss_fct(logits, labels)
  824. elif self.config.problem_type == "single_label_classification":
  825. loss_fct = CrossEntropyLoss()
  826. loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  827. elif self.config.problem_type == "multi_label_classification":
  828. loss_fct = BCEWithLogitsLoss()
  829. loss = loss_fct(logits, labels)
  830. return SequenceClassifierOutput(
  831. loss=loss,
  832. logits=logits,
  833. hidden_states=outputs.hidden_states,
  834. attentions=outputs.attentions,
  835. )
  836. @auto_docstring
  837. class RobertaForMultipleChoice(RobertaPreTrainedModel):
  838. def __init__(self, config):
  839. super().__init__(config)
  840. self.roberta = RobertaModel(config)
  841. self.dropout = nn.Dropout(config.hidden_dropout_prob)
  842. self.classifier = nn.Linear(config.hidden_size, 1)
  843. # Initialize weights and apply final processing
  844. self.post_init()
  845. @can_return_tuple
  846. @auto_docstring
  847. def forward(
  848. self,
  849. input_ids: torch.LongTensor | None = None,
  850. token_type_ids: torch.LongTensor | None = None,
  851. attention_mask: torch.FloatTensor | None = None,
  852. labels: torch.LongTensor | None = None,
  853. position_ids: torch.LongTensor | None = None,
  854. inputs_embeds: torch.FloatTensor | None = None,
  855. **kwargs: Unpack[TransformersKwargs],
  856. ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
  857. r"""
  858. input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
  859. Indices of input sequence tokens in the vocabulary.
  860. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
  861. [`PreTrainedTokenizer.__call__`] for details.
  862. [What are input IDs?](../glossary#input-ids)
  863. token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
  864. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  865. - 0 corresponds to a *sentence A* token,
  866. - 1 corresponds to a *sentence B* token.
  867. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  868. >= 2. All the value in this tensor should be always < type_vocab_size.
  869. [What are token type IDs?](../glossary#token-type-ids)
  870. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
  871. Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
  872. num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
  873. `input_ids` above)
  874. position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
  875. Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
  876. config.max_position_embeddings - 1]`.
  877. [What are position IDs?](../glossary#position-ids)
  878. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
  879. Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
  880. is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
  881. model's internal embedding lookup matrix.
  882. """
  883. num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
  884. flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
  885. flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
  886. flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
  887. flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
  888. flat_inputs_embeds = (
  889. inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
  890. if inputs_embeds is not None
  891. else None
  892. )
  893. outputs = self.roberta(
  894. flat_input_ids,
  895. position_ids=flat_position_ids,
  896. token_type_ids=flat_token_type_ids,
  897. attention_mask=flat_attention_mask,
  898. inputs_embeds=flat_inputs_embeds,
  899. return_dict=True,
  900. **kwargs,
  901. )
  902. pooled_output = outputs[1]
  903. pooled_output = self.dropout(pooled_output)
  904. logits = self.classifier(pooled_output)
  905. reshaped_logits = logits.view(-1, num_choices)
  906. loss = None
  907. if labels is not None:
  908. # move labels to correct device
  909. labels = labels.to(reshaped_logits.device)
  910. loss_fct = CrossEntropyLoss()
  911. loss = loss_fct(reshaped_logits, labels)
  912. return MultipleChoiceModelOutput(
  913. loss=loss,
  914. logits=reshaped_logits,
  915. hidden_states=outputs.hidden_states,
  916. attentions=outputs.attentions,
  917. )
  918. @auto_docstring
  919. class RobertaForTokenClassification(RobertaPreTrainedModel):
  920. def __init__(self, config):
  921. super().__init__(config)
  922. self.num_labels = config.num_labels
  923. self.roberta = RobertaModel(config, add_pooling_layer=False)
  924. classifier_dropout = (
  925. config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
  926. )
  927. self.dropout = nn.Dropout(classifier_dropout)
  928. self.classifier = nn.Linear(config.hidden_size, config.num_labels)
  929. # Initialize weights and apply final processing
  930. self.post_init()
  931. @can_return_tuple
  932. @auto_docstring
  933. def forward(
  934. self,
  935. input_ids: torch.LongTensor | None = None,
  936. attention_mask: torch.FloatTensor | None = None,
  937. token_type_ids: torch.LongTensor | None = None,
  938. position_ids: torch.LongTensor | None = None,
  939. inputs_embeds: torch.FloatTensor | None = None,
  940. labels: torch.LongTensor | None = None,
  941. **kwargs: Unpack[TransformersKwargs],
  942. ) -> tuple[torch.Tensor] | TokenClassifierOutput:
  943. r"""
  944. token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  945. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  946. - 0 corresponds to a *sentence A* token,
  947. - 1 corresponds to a *sentence B* token.
  948. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  949. >= 2. All the value in this tensor should be always < type_vocab_size.
  950. [What are token type IDs?](../glossary#token-type-ids)
  951. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  952. Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
  953. """
  954. outputs = self.roberta(
  955. input_ids,
  956. attention_mask=attention_mask,
  957. token_type_ids=token_type_ids,
  958. position_ids=position_ids,
  959. inputs_embeds=inputs_embeds,
  960. return_dict=True,
  961. **kwargs,
  962. )
  963. sequence_output = outputs[0]
  964. sequence_output = self.dropout(sequence_output)
  965. logits = self.classifier(sequence_output)
  966. loss = None
  967. if labels is not None:
  968. # move labels to correct device
  969. labels = labels.to(logits.device)
  970. loss_fct = CrossEntropyLoss()
  971. loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  972. return TokenClassifierOutput(
  973. loss=loss,
  974. logits=logits,
  975. hidden_states=outputs.hidden_states,
  976. attentions=outputs.attentions,
  977. )
  978. class RobertaClassificationHead(nn.Module):
  979. """Head for sentence-level classification tasks."""
  980. def __init__(self, config):
  981. super().__init__()
  982. self.dense = nn.Linear(config.hidden_size, config.hidden_size)
  983. classifier_dropout = (
  984. config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
  985. )
  986. self.dropout = nn.Dropout(classifier_dropout)
  987. self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
  988. def forward(self, features, **kwargs):
  989. x = features[:, 0, :] # take <s> token (equiv. to [CLS])
  990. x = self.dropout(x)
  991. x = self.dense(x)
  992. x = torch.tanh(x)
  993. x = self.dropout(x)
  994. x = self.out_proj(x)
  995. return x
  996. @auto_docstring
  997. class RobertaForQuestionAnswering(RobertaPreTrainedModel):
  998. def __init__(self, config):
  999. super().__init__(config)
  1000. self.num_labels = config.num_labels
  1001. self.roberta = RobertaModel(config, add_pooling_layer=False)
  1002. self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
  1003. # Initialize weights and apply final processing
  1004. self.post_init()
  1005. @can_return_tuple
  1006. @auto_docstring
  1007. def forward(
  1008. self,
  1009. input_ids: torch.LongTensor | None = None,
  1010. attention_mask: torch.FloatTensor | None = None,
  1011. token_type_ids: torch.LongTensor | None = None,
  1012. position_ids: torch.LongTensor | None = None,
  1013. inputs_embeds: torch.FloatTensor | None = None,
  1014. start_positions: torch.LongTensor | None = None,
  1015. end_positions: torch.LongTensor | None = None,
  1016. **kwargs: Unpack[TransformersKwargs],
  1017. ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
  1018. r"""
  1019. token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  1020. Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
  1021. - 0 corresponds to a *sentence A* token,
  1022. - 1 corresponds to a *sentence B* token.
  1023. This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
  1024. >= 2. All the value in this tensor should be always < type_vocab_size.
  1025. [What are token type IDs?](../glossary#token-type-ids)
  1026. """
  1027. outputs = self.roberta(
  1028. input_ids,
  1029. attention_mask=attention_mask,
  1030. token_type_ids=token_type_ids,
  1031. position_ids=position_ids,
  1032. inputs_embeds=inputs_embeds,
  1033. return_dict=True,
  1034. **kwargs,
  1035. )
  1036. sequence_output = outputs[0]
  1037. logits = self.qa_outputs(sequence_output)
  1038. start_logits, end_logits = logits.split(1, dim=-1)
  1039. start_logits = start_logits.squeeze(-1).contiguous()
  1040. end_logits = end_logits.squeeze(-1).contiguous()
  1041. total_loss = None
  1042. if start_positions is not None and end_positions is not None:
  1043. # If we are on multi-GPU, split add a dimension
  1044. if len(start_positions.size()) > 1:
  1045. start_positions = start_positions.squeeze(-1)
  1046. if len(end_positions.size()) > 1:
  1047. end_positions = end_positions.squeeze(-1)
  1048. # sometimes the start/end positions are outside our model inputs, we ignore these terms
  1049. ignored_index = start_logits.size(1)
  1050. start_positions = start_positions.clamp(0, ignored_index)
  1051. end_positions = end_positions.clamp(0, ignored_index)
  1052. loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
  1053. start_loss = loss_fct(start_logits, start_positions)
  1054. end_loss = loss_fct(end_logits, end_positions)
  1055. total_loss = (start_loss + end_loss) / 2
  1056. return QuestionAnsweringModelOutput(
  1057. loss=total_loss,
  1058. start_logits=start_logits,
  1059. end_logits=end_logits,
  1060. hidden_states=outputs.hidden_states,
  1061. attentions=outputs.attentions,
  1062. )
# Public names exported by this module. Kept as a plain literal list of strings.
__all__ = [
    "RobertaForCausalLM",
    "RobertaForMaskedLM",
    "RobertaForMultipleChoice",
    "RobertaForQuestionAnswering",
    "RobertaForSequenceClassification",
    "RobertaForTokenClassification",
    "RobertaModel",
    "RobertaPreTrainedModel",
]