| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417 |
- # Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
- #
- # This code is based on Llama implementations in this library and Microsoft's
- # Differential Transformer implementations.
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import math
- import torch
- from torch import nn
- from ... import initialization as init
- from ...cache_utils import Cache, StaticCache
- from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
- from ...modeling_utils import PreTrainedModel
- from ...utils import logging
- from ..gemma.modeling_gemma import GemmaForCausalLM
- from ..llama.modeling_llama import (
- LlamaDecoderLayer,
- LlamaForQuestionAnswering,
- LlamaForSequenceClassification,
- LlamaForTokenClassification,
- LlamaModel,
- LlamaPreTrainedModel,
- LlamaRotaryEmbedding,
- apply_rotary_pos_emb,
- repeat_kv,
- )
- from ..mistral.modeling_mistral import MistralMLP
- from .configuration_diffllama import DiffLlamaConfig
# Module-level logger obtained from the library's `logging` helper.
logger = logging.get_logger(__name__)

# Checkpoint / config identifiers; presumably consumed by docstring tooling -- TODO confirm,
# they are not referenced anywhere else in the visible part of this file.
_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"
class DiffLlamaMLP(MistralMLP):
    """Feed-forward block of DiffLlama; inherits `MistralMLP` without any override."""
def lambda_init_fn(layer_idx):
    """Return the depth-dependent initial lambda for a decoder layer.

    Computes ``0.8 - 0.6 * exp(-0.3 * layer_idx)``: 0.2 for ``layer_idx == 0``,
    approaching 0.8 as ``layer_idx`` grows.

    Args:
        layer_idx: Numeric index of the layer. It is multiplied by a float, so
            a non-numeric value such as ``None`` raises ``TypeError``.

    Returns:
        The initial lambda value as a Python float.
    """
    decay = math.exp(-0.3 * layer_idx)
    return 0.8 - 0.6 * decay
class DiffLlamaRotaryEmbedding(LlamaRotaryEmbedding):
    """Rotary position embedding of DiffLlama; inherits `LlamaRotaryEmbedding` without any override."""
class DiffLlamaAttention(nn.Module):
    """Eager (explicit matmul + softmax) differential attention for DiffLlama.

    Softmax attention weights are computed for all `num_heads` heads. The two halves of the value
    heads are concatenated feature-wise (width `2 * head_dim`) and tiled along the head axis, so every
    attention head is applied to the widened values. The result is then split into the first and the
    second half of the heads and combined as `first - lambda_full * second`, where

        lambda_full = exp(sum(lambda_q1 * lambda_k1)) - exp(sum(lambda_q2 * lambda_k2)) + lambda_init

    The combined output is RMS-normalised over its last dimension (no learnable scale), multiplied by
    `(1 - lambda_init)` and projected back to `hidden_size` by `o_proj`.
    """

    def __init__(self, config: DiffLlamaConfig, layer_idx: int | None = None):
        """Build the projections, the learnable lambda vectors and the output normalisation.

        Args:
            config: Model configuration. Reads `attention_dropout`, `hidden_size`,
                `num_attention_heads`, optional `head_dim`, `num_key_value_heads`,
                `max_position_embeddings`, `attention_bias`, `lambda_std_dev` and `rms_norm_eps`.
            layer_idx: Index of the layer; used as the cache slot in `forward` and as the depth
                passed to `lambda_init_fn`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        # `max_position_embeddings` is stored but never read by this class; `is_causal` is not read
        # by this eager forward either, but the flash-attention subclass forwards it to the kernel.
        self.max_position_embeddings = config.max_position_embeddings
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        # NOTE(review): `lambda_init_fn` multiplies `layer_idx` by a float, so `layer_idx=None`
        # raises TypeError right here, i.e. earlier than the warning above suggests.
        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        # Normalises the `2 * head_dim` wide differential output; no learnable affine weight.
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run differential attention with explicit matmul/softmax.

        Args:
            hidden_states: 3-D input unpacked as `(batch, seq_len, hidden)`.
            position_embeddings: `(cos, sin)` pair handed to `apply_rotary_pos_emb`.
            attention_mask: Optional additive mask; added to the scaled scores before the softmax.
            position_ids: Accepted for interface compatibility; not read here.
            past_key_values: Optional cache; when given, keys/values are replaced by the result of
                `past_key_values.update(..., self.layer_idx)`.
            use_cache: Accepted for interface compatibility; not read here.
            **kwargs: Ignored.

        Returns:
            `(attn_output, attn_weights)`: the projected output of shape `(batch, seq_len, hidden_size)`
            and the post-softmax, post-dropout attention weights.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # presumably expands the key/value heads to `num_heads` -- `repeat_kv` is imported from Llama
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # Join the two halves of the value heads feature-wise (last dim becomes 2 * head_dim), then
        # tile along the head axis so the head count matches the attention weights again.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        # Scalar lambdas, accumulated in fp32 and cast back to the activation dtype.
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = torch.matmul(attn_weights, value_states)
        # Differential step: first half of the heads minus lambda-weighted second half.
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights
class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.

    The flash kernel is invoked twice, once per half of the value heads; the two results are concatenated
    feature-wise and then combined exactly like in the eager implementation.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, None]:
        """Run differential attention through `_flash_attention_forward`.

        Args:
            hidden_states: 3-D input unpacked as `(batch, seq_len, hidden)`.
            position_embeddings: `(cos, sin)` pair handed to `apply_rotary_pos_emb`.
            attention_mask: Optional mask forwarded unchanged to `_flash_attention_forward`.
            position_ids: Forwarded unchanged to `_flash_attention_forward`.
            past_key_values: Optional cache, updated with this layer's keys/values.
            use_cache: Accepted for interface compatibility; not read here.
            **kwargs: Ignored. Accepted so this backend takes the same call as the eager and SDPA
                implementations, which both accept extra keyword arguments.

        Returns:
            `(attn_output, None)`; attention weights are not available on this path.

        Raises:
            ValueError: If `past_key_values` is a `StaticCache`.
        """
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Rotary embedding and the cache update below operate on (batch, heads, seq, head_dim);
        # the tensors are transposed back to the flash layout afterwards.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (DiffLlamaRMSNorm handles it correctly)
        input_dtype = query_states.dtype
        # "mps" is mapped to "cpu" for the autocast queries below.
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled(device_type):
                target_dtype = torch.get_autocast_dtype(device_type)
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_is_quantized"):
                target_dtype = self.config.dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Split the value heads (dim 2 in the flash layout) in two halves and tile each half back to
        # the original number of value heads; the kernel runs once per half.
        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states,
            key_states,
            value_states1,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output2 = _flash_attention_forward(
            query_states,
            key_states,
            value_states2,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        # Concatenate the two passes feature-wise, then split along the head axis into the two head
        # groups that are subtracted from each other.
        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)

        # Scalar lambdas, accumulated in fp32 and cast back to the activation dtype.
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None
class DiffLlamaSdpaAttention(DiffLlamaAttention):
    """
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from DiffLlamaAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, None]:
        """Run differential attention through `scaled_dot_product_attention`.

        Args:
            hidden_states: 3-D input unpacked as `(batch, seq_len, hidden)`.
            position_embeddings: `(cos, sin)` pair handed to `apply_rotary_pos_emb`.
            attention_mask: Optional 4-D mask; its last dimension is cropped to the key length
                before being passed as `attn_mask`.
            position_ids: Accepted for interface compatibility; not read here.
            past_key_values: Optional cache, updated with this layer's keys/values.
            use_cache: Accepted for interface compatibility; not read here.
            **kwargs: Ignored.

        Returns:
            `(attn_output, None)`; attention weights are not available on this path.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # presumably expands the key/value heads to `num_heads` -- `repeat_kv` is imported from Llama
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # Join the two halves of the value heads feature-wise (last dim becomes 2 * head_dim), then
        # tile along the head axis so the head count matches the queries again.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        is_causal = causal_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        # Differential step: first half of the heads minus lambda-weighted second half.
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)

        # Scalar lambdas, accumulated in fp32 and cast back to the activation dtype.
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, None
# Maps an attention-implementation name to the attention class that implements it.
# `DiffLlamaDecoderLayer` indexes this table with `config._attn_implementation`.
DIFFLLAMA_ATTENTION_CLASSES = {
    "eager": DiffLlamaAttention,
    "flash_attention_2": DiffLlamaFlashAttention2,
    "sdpa": DiffLlamaSdpaAttention,
}
class DiffLlamaDecoderLayer(LlamaDecoderLayer):
    """Llama decoder layer whose `self_attn` is replaced by a DiffLlama attention module."""

    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
        """Build the parent layer, then swap in the attention class selected by the config.

        Args:
            config: Model configuration; `config._attn_implementation` is the key looked up in
                `DIFFLLAMA_ATTENTION_CLASSES` (an unknown key raises `KeyError`).
            layer_idx: Index of this layer, forwarded to the parent and to the attention module.
        """
        super().__init__(config, layer_idx)

        attention_cls = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
    """Base class for DiffLlama models; disables two backend flags and initialises the lambda vectors."""

    # Both capability flags are switched off for this architecture.
    _supports_flex_attn = False
    _supports_attention_backend = False

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialise `module`, adding the DiffLlama-specific lambda parameters.

        Delegates to `PreTrainedModel._init_weights` first. For `DiffLlamaAttention` modules
        (including subclasses) the four lambda vectors are then drawn from a normal distribution
        with mean 0 and standard deviation `config.lambda_std_dev`.
        """
        PreTrainedModel._init_weights(self, module)

        if not isinstance(module, DiffLlamaAttention):
            return

        std = self.config.lambda_std_dev
        # Same order as the parameters are declared, so the random stream is consumed identically.
        for param_name in ("lambda_q1", "lambda_k1", "lambda_q2", "lambda_k2"):
            init.normal_(getattr(module, param_name), 0, std)
class DiffLlamaModel(LlamaModel):
    """DiffLlama base model; inherits `LlamaModel` without any override."""
class DiffLlamaForCausalLM(GemmaForCausalLM):
    """DiffLlama causal language model; inherits `GemmaForCausalLM` without any override."""
class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
    """DiffLlama sequence classifier; inherits `LlamaForSequenceClassification` without any override."""
class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
    """DiffLlama question-answering model; inherits `LlamaForQuestionAnswering` without any override."""
class DiffLlamaForTokenClassification(LlamaForTokenClassification):
    """DiffLlama token classifier; inherits `LlamaForTokenClassification` without any override."""
# Public API of this module: the names exported by `from <module> import *`.
__all__ = [
    "DiffLlamaPreTrainedModel",
    "DiffLlamaModel",
    "DiffLlamaForCausalLM",
    "DiffLlamaForSequenceClassification",
    "DiffLlamaForQuestionAnswering",
    "DiffLlamaForTokenClassification",
]
|