modular_vaultgemma.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. # Copyright 2025 the HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import torch
  15. from huggingface_hub.dataclasses import strict
  16. from ...cache_utils import Cache
  17. from ...utils import auto_docstring
  18. from ..gemma2.configuration_gemma2 import Gemma2Config
  19. from ..gemma2.modeling_gemma2 import Gemma2Attention, Gemma2DecoderLayer, Gemma2ForCausalLM, Gemma2MLP, Gemma2RMSNorm
# Configuration class for VaultGemma. It subclasses Gemma2Config and, in this file, only
# supplies the argument documentation below and the `use_bidirectional_attention` marker.
@auto_docstring(checkpoint="google/vaultgemma-1b")
@strict
class VaultGemmaConfig(Gemma2Config):
    r"""
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        scaling factor used on the attention scores
    final_logit_softcapping (`float`, *optional*, defaults to 30.0):
        scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
        scaling factor when applying tanh softcapping on the attention scores.

    ```python
    >>> from transformers import VaultGemmaModel, VaultGemmaConfig
    >>> # Initializing a VaultGemma configuration with default values
    >>> configuration = VaultGemmaConfig()
    >>> # Initializing a model from that configuration
    >>> model = VaultGemmaModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # NOTE(review): the attribute is bound to an AttributeError *instance*, not a bool.
    # Presumably this is the modular-converter marker for "drop the attribute inherited
    # from Gemma2Config" rather than a usable value -- confirm against the converter docs.
    use_bidirectional_attention = AttributeError()
class VaultGemmaRMSNorm(Gemma2RMSNorm):
    # No attribute or method is overridden: everything comes from Gemma2RMSNorm.
    # NOTE(review): presumably declared only so a VaultGemma-named class exists -- confirm.
    pass
class VaultGemmaMLP(Gemma2MLP):
    # No attribute or method is overridden: everything comes from Gemma2MLP.
    # NOTE(review): presumably declared only so a VaultGemma-named class exists -- confirm.
    pass
  44. class VaultGemmaAttention(Gemma2Attention):
  45. """Multi-headed attention from 'Attention Is All You Need' paper"""
  46. def __init__(self, config: VaultGemmaConfig, layer_idx: int):
  47. super().__init__()
  48. self.is_causal = True
  49. class VaultGemmaDecoderLayer(Gemma2DecoderLayer):
  50. def __init__(self, **super_kwargs):
  51. super().__init__(**super_kwargs)
  52. del self.post_attention_layernorm
  53. del self.post_feedforward_layernorm
  54. def forward(
  55. self,
  56. hidden_states: torch.Tensor,
  57. position_embeddings: tuple[torch.Tensor, torch.Tensor],
  58. attention_mask: torch.Tensor | None = None,
  59. position_ids: torch.LongTensor | None = None,
  60. past_key_values: Cache | None = None,
  61. **kwargs,
  62. ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
  63. residual = hidden_states
  64. hidden_states = self.input_layernorm(hidden_states)
  65. # Self Attention
  66. hidden_states, _ = self.self_attn(
  67. hidden_states=hidden_states,
  68. position_embeddings=position_embeddings,
  69. attention_mask=attention_mask,
  70. position_ids=position_ids,
  71. past_key_values=past_key_values,
  72. **kwargs,
  73. )
  74. hidden_states = residual + hidden_states
  75. residual = hidden_states
  76. hidden_states = self.pre_feedforward_layernorm(hidden_states)
  77. hidden_states = self.mlp(hidden_states)
  78. hidden_states = residual + hidden_states
  79. return hidden_states
class VaultGemmaForCausalLM(Gemma2ForCausalLM):
    # No attribute or method is overridden: everything comes from Gemma2ForCausalLM.
    # NOTE(review): presumably declared only so a VaultGemma-named class exists -- confirm.
    pass
# Public names exported by this module.
# NOTE(review): "VaultGemmaModel" and "VaultGemmaPreTrainedModel" are not defined in the
# visible part of this file, which is why they carry `noqa: F822` (undefined name in
# `__all__`). Presumably they are produced elsewhere, e.g. generated from the parent
# model's definitions -- confirm before removing the markers.
__all__ = [
    "VaultGemmaConfig",
    "VaultGemmaForCausalLM",
    "VaultGemmaModel",  # noqa: F822
    "VaultGemmaPreTrainedModel",  # noqa: F822
]