masking_utils.py 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608
  1. # Copyright 2025 HuggingFace Inc. team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import itertools
  15. from collections.abc import Callable
  16. import torch
  17. import torch.nn.functional as F
  18. from .cache_utils import Cache
  19. from .configuration_utils import PreTrainedConfig
  20. from .utils import is_torch_xpu_available, logging
  21. from .utils.deprecation import deprecate_kwarg
  22. from .utils.generic import GeneralInterface, is_flash_attention_requested
  23. from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_tracing
# Flex attention is optional: its block-mask helpers are only imported when the availability check passes.
if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
    from torch.nn.attention.flex_attention import BlockMask, create_block_mask
else:
    # Register a fake type to avoid crashing for annotations and `isinstance` checks
    BlockMask = torch.Tensor

# Version / backend flags, evaluated once at import time and reused by the mask builders below.
_is_torch_greater_or_equal_than_2_5 = is_torch_greater_or_equal("2.5", accept_dev=True)
_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
_is_torch_xpu_available = is_torch_xpu_available()

# `TransformGetItemToIndex` is only imported on torch>=2.6; the vmap-based mask creation relies on it.
if _is_torch_greater_or_equal_than_2_6:
    from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex

# Module-level logger
logger = logging.get_logger(__name__)
  36. def and_masks(*mask_functions: Callable) -> Callable:
  37. """Returns a mask function that is the intersection of provided mask functions"""
  38. if not all(callable(arg) for arg in mask_functions):
  39. raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}")
  40. def and_mask(batch_idx, head_idx, q_idx, kv_idx):
  41. result = q_idx.new_ones((), dtype=torch.bool)
  42. for mask in mask_functions:
  43. result = result & mask(batch_idx, head_idx, q_idx, kv_idx).to(result.device)
  44. return result
  45. return and_mask
  46. def or_masks(*mask_functions: Callable) -> Callable:
  47. """Returns a mask function that is the union of provided mask functions"""
  48. if not all(callable(arg) for arg in mask_functions):
  49. raise RuntimeError(f"All inputs should be callable mask_functions: {mask_functions}")
  50. def or_mask(batch_idx, head_idx, q_idx, kv_idx):
  51. result = q_idx.new_zeros((), dtype=torch.bool)
  52. for mask in mask_functions:
  53. result = result | mask(batch_idx, head_idx, q_idx, kv_idx).to(result.device)
  54. return result
  55. return or_mask
  56. def causal_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  57. """
  58. This creates a basic lower-diagonal causal mask.
  59. """
  60. return kv_idx <= q_idx
  61. def bidirectional_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  62. """
  63. This creates a full bidirectional mask.
  64. NOTE: It is important to keep an index-based version for non-vmap expansion.
  65. """
  66. return q_idx >= 0
  67. def sliding_window_overlay(sliding_window: int) -> Callable:
  68. """
  69. This is an overlay depicting a sliding window pattern. Add it on top of a causal mask for a proper sliding
  70. window mask.
  71. """
  72. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  73. return kv_idx > q_idx - sliding_window
  74. return inner_mask
  75. def chunked_overlay(chunk_size: int, left_padding: torch.Tensor) -> Callable:
  76. """
  77. This is an overlay depicting a chunked attention pattern. Add it on top of a causal mask for a proper chunked
  78. attention mask.
  79. """
  80. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  81. return (kv_idx - left_padding[batch_idx]) // chunk_size == (q_idx - left_padding[batch_idx]) // chunk_size
  82. return inner_mask
  83. def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
  84. """
  85. This return the mask_function function to create a sliding window mask.
  86. """
  87. return and_masks(sliding_window_overlay(sliding_window), causal_mask_function)
  88. def sliding_window_bidirectional_overlay(sliding_window: int) -> Callable:
  89. """
  90. This is an overlay depicting a bidirectional sliding window pattern.
  91. """
  92. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  93. """A token can attend to any other token if their absolute distance is within
  94. the (inclusive) sliding window size (distance <= sliding_window)."""
  95. return abs(q_idx - kv_idx) <= sliding_window
  96. return inner_mask
  97. def sliding_window_bidirectional_mask_function(sliding_window: int) -> Callable:
  98. """
  99. This return the mask_function function to create a bidirectional sliding window mask.
  100. """
  101. return and_masks(sliding_window_bidirectional_overlay(sliding_window), bidirectional_mask_function)
  102. def chunked_causal_mask_function(chunk_size: int, left_padding: torch.Tensor) -> Callable:
  103. """
  104. This return the mask_function function to create a chunked attention mask.
  105. """
  106. return and_masks(chunked_overlay(chunk_size, left_padding), causal_mask_function)
  107. def padding_mask_function(padding_mask: torch.Tensor) -> Callable:
  108. """
  109. This return the mask_function function corresponding to a 2D padding mask.
  110. """
  111. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  112. # Note that here the mask should ALWAYS be at least of the max `kv_index` size in the dimension 1. This is because
  113. # we cannot pad it here in the mask_function as we don't know the final size, and we cannot try/except, as it is not
  114. # vectorizable on accelerator devices
  115. return padding_mask[batch_idx, kv_idx]
  116. return inner_mask
  117. def packed_sequence_mask_function(packed_sequence_mask: torch.Tensor) -> Callable:
  118. """
  119. This return the mask_function function corresponding to a 2D packed sequence mask.
  120. """
  121. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  122. return packed_sequence_mask[batch_idx, q_idx] == packed_sequence_mask[batch_idx, kv_idx]
  123. return inner_mask
  124. def add_offsets_to_mask_function(mask_function: Callable, q_offset: int, kv_offset: int) -> Callable:
  125. """
  126. This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
  127. not start and end indices.
  128. """
  129. def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
  130. return mask_function(batch_idx, head_idx, q_idx + q_offset, kv_idx + kv_offset)
  131. return inner_mask
  132. def prepare_padding_mask(attention_mask: torch.Tensor | None, kv_length: int, kv_offset: int) -> torch.Tensor | None:
  133. """
  134. From the 2D attention mask, prepare the correct padding mask to use by potentially padding it.
  135. """
  136. local_padding_mask = attention_mask
  137. if attention_mask is not None:
  138. # Pad it if necessary
  139. if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
  140. local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
  141. return local_padding_mask
  142. def _can_skip_causal_mask_xpu(
  143. padding_mask: torch.Tensor | None,
  144. query_length: int,
  145. kv_length: int,
  146. local_attention_size: int | None,
  147. ) -> bool:
  148. """
  149. XPU-specific logic for determining if we can skip causal mask creation.
  150. For XPU devices, we have special handling:
  151. - Single query tokens (query_length == 1) use the same logic as CUDA
  152. - Multi-query tokens can skip if padding_mask is provided and correctly structured
  153. The mask must have all True values in the query window and all False after
  154. """
  155. if is_tracing(padding_mask):
  156. return False
  157. # Check local attention constraint (same as CUDA)
  158. if local_attention_size is not None and kv_length >= local_attention_size:
  159. return False
  160. if padding_mask is None:
  161. # Without padding mask, can skip if single query token or full causal attention
  162. return query_length == 1 or kv_length == query_length
  163. # XPU allows skipping under additional conditions when padding_mask is provided
  164. if query_length == 1:
  165. # Single query token: skip only if no padding tokens present
  166. return padding_mask.all()
  167. # XPU-specific: check if query window is all True and rest is all False
  168. # This allows XPU to optimize the 1st token in static cache
  169. return padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any()
  170. def _ignore_causal_mask_sdpa(
  171. padding_mask: torch.Tensor | None,
  172. query_length: int,
  173. kv_length: int,
  174. kv_offset: int,
  175. local_attention_size: int | None = None,
  176. ) -> bool:
  177. """
  178. Detects whether the causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.
  179. In case no token is masked in the 2D `padding_mask` argument, if `query_length == 1` or
  180. `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
  181. allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
  182. passed).
  183. """
  184. if padding_mask is not None and padding_mask.shape[-1] > kv_length:
  185. mask_indices = torch.arange(kv_length, device=padding_mask.device)
  186. mask_indices += kv_offset
  187. padding_mask = padding_mask[:, mask_indices]
  188. if _is_torch_xpu_available:
  189. # XPU devices have special handling for mask skipping:
  190. # - Single query tokens use the same logic as CUDA
  191. # - Multi-query tokens can skip if padding_mask is provided and correctly structured
  192. # (all True in query window, all False after)
  193. return _can_skip_causal_mask_xpu(padding_mask, query_length, kv_length, local_attention_size)
  194. # When using `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
  195. # hard-coded to the forward. If a user exports a model with query_length > 1, the exported model will hard-code `is_causal=True`
  196. # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set
  197. # `ignore_causal_mask = True` if we are not tracing
  198. if (
  199. not is_tracing(padding_mask)
  200. # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108
  201. and (query_length == 1 or kv_length == query_length)
  202. # in this case we need to add special patterns to the mask so cannot be skipped otherwise
  203. and (local_attention_size is None or kv_length < local_attention_size)
  204. # In this case, we need to add padding to the mask, so cannot be skipped otherwise
  205. and (padding_mask is None or padding_mask.all())
  206. ):
  207. return True
  208. return False
  209. def _can_skip_bidirectional_mask_xpu(
  210. padding_mask: torch.Tensor | None,
  211. kv_length: int,
  212. local_attention_size: int | None,
  213. ) -> bool:
  214. """
  215. XPU-specific logic for determining if we can skip bidirectional mask creation.
  216. For XPU devices, we have special handling:
  217. - Skip if no padding and no local attention constraint
  218. """
  219. if is_tracing(padding_mask):
  220. return False
  221. # Check local attention constraint (same as CUDA)
  222. if local_attention_size is not None and kv_length >= local_attention_size:
  223. return False
  224. if padding_mask is None:
  225. # Without padding mask, can always skip for full bidirectional attention
  226. return True
  227. # Skip only if no padding tokens present
  228. return padding_mask.all()
  229. def _ignore_bidirectional_mask_sdpa(
  230. padding_mask: torch.Tensor | None,
  231. kv_length: int,
  232. local_attention_size: int | None = None,
  233. ) -> bool:
  234. """
  235. Detects whether the bidirectional mask can be ignored in case PyTorch's SDPA is used.
  236. In case no token is masked in the 2D `padding_mask` argument and no local attention constraint applies
  237. (i.e. `local_attention_size` is None or `kv_length < local_attention_size`), we skip mask creation,
  238. allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
  239. passed).
  240. """
  241. if _is_torch_xpu_available:
  242. # XPU devices have special handling for mask skipping:
  243. # - Skip if no padding and no local attention constraint
  244. return _can_skip_bidirectional_mask_xpu(padding_mask, kv_length, local_attention_size)
  245. # When using `torch.export` or `torch.onnx.dynamo_export`, we need to avoid to check the contents of the mask;
  246. # otherwise, we will encounter dynamic control flows
  247. if (
  248. not is_tracing(padding_mask)
  249. and (padding_mask is None or padding_mask.all())
  250. # in this case we need to add special patterns to the mask so cannot be skipped otherwise
  251. and (local_attention_size is None or kv_length < local_attention_size)
  252. ):
  253. return True
  254. return False
  255. def _vmap_expansion_sdpa(mask_function: Callable) -> Callable:
  256. """
  257. Used to vmap our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
  258. Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
  259. functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
  260. """
  261. # We vmap the function over all 4 dimensions, broadcasting [b_idx, h_idx, q_idx, kv_idx]
  262. dimensions = [(None, None, None, 0), (None, None, 0, None), (None, 0, None, None), (0, None, None, None)]
  263. for dims in dimensions:
  264. mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
  265. return mask_function
  266. def _non_vmap_expansion_sdpa(
  267. batch_indices: torch.Tensor, head_indices: torch.Tensor, q_indices: torch.Tensor, kv_indices: torch.Tensor
  268. ):
  269. """
  270. Used to broadcast our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
  271. Allows the usage of any index-based mask function without relying on vmap.
  272. NOTE: This is limited to index based functions only and is not guaranteed to work otherwise.
  273. Reference:
  274. - https://github.com/huggingface/optimum-onnx/blob/c123e8f4fab61b54a8e0e31ce74462bcacca576e/optimum/exporters/onnx/model_patcher.py#L362-L365
  275. """
  276. batch_indices = batch_indices[:, None, None, None]
  277. head_indices = head_indices[None, :, None, None]
  278. q_indices = q_indices[None, None, :, None]
  279. kv_indices = kv_indices[None, None, None, :]
  280. return batch_indices, head_indices, q_indices, kv_indices
  281. def sdpa_mask(
  282. batch_size: int,
  283. q_length: int,
  284. kv_length: int,
  285. q_offset: int = 0,
  286. kv_offset: int = 0,
  287. mask_function: Callable = causal_mask_function,
  288. attention_mask: torch.Tensor | None = None,
  289. local_size: int | None = None,
  290. allow_is_causal_skip: bool = True,
  291. allow_is_bidirectional_skip: bool = False,
  292. allow_torch_fix: bool = True,
  293. use_vmap: bool = False,
  294. device: torch.device | str = "cpu",
  295. **kwargs,
  296. ) -> torch.Tensor | None:
  297. """
  298. Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
  299. the element should take part in the attention computation, and False that it should not.
  300. This function can only be used with torch>=2.5, as the context manager is otherwise not available.
  301. Args:
  302. batch_size (`int`):
  303. The batch size of the input sequence.
  304. q_length (`int`):
  305. The size that the query states will have during the attention computation.
  306. kv_length (`int`):
  307. The size that the key and value states will have during the attention computation.
  308. kv_offset (`int`, optional):
  309. An optional offset to indicate at which first position the key and values states will refer to.
  310. q_offset (`int`, optional):
  311. An optional offset to indicate at which first position the query states will refer to.
  312. mask_function (`Callable`):
  313. The mask factory function describing the mask pattern.
  314. attention_mask (`torch.Tensor`, optional):
  315. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
  316. local_size (`int`, optional):
  317. The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
  318. to try to skip mask creation if possible.
  319. allow_is_causal_skip (`bool`, optional):
  320. Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
  321. `torch.sdpa` instead. Default to `True`.
  322. allow_is_bidirectional_skip (`bool`, optional):
  323. Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
  324. i.e. full attention without any padding. Default to `False`.
  325. allow_torch_fix (`bool`, optional):
  326. Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
  327. versions. We need an arg to skip it when using eager. By default `True`.
  328. use_vmap (`bool`, optional):
  329. Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
  330. index-based (for the cost of speed performance). By default `False`.
  331. device (`torch.device` or `str`, optional):
  332. An optional device to create the mask on.
  333. ## Creating a simple causal mask:
  334. To create the following causal mask:
  335. 0 ■ ⬚ ⬚ ⬚ ⬚
  336. 1 ■ ■ ⬚ ⬚ ⬚
  337. 2 ■ ■ ■ ⬚ ⬚
  338. 3 ■ ■ ■ ■ ⬚
  339. 4 ■ ■ ■ ■ ■
  340. You can do
  341. ```python
  342. >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5)
  343. >>> tensor([[[[ True, False, False, False, False],
  344. [ True, True, False, False, False],
  345. [ True, True, True, False, False],
  346. [ True, True, True, True, False],
  347. [ True, True, True, True, True]]]])
  348. ```
  349. ## Creating a sliding window mask:
  350. To create the following sliding window mask (`sliding_window=3`):
  351. 0 ■ ⬚ ⬚ ⬚ ⬚
  352. 1 ■ ■ ⬚ ⬚ ⬚
  353. 2 ■ ■ ■ ⬚ ⬚
  354. 3 ⬚ ■ ■ ■ ⬚
  355. 4 ⬚ ⬚ ■ ■ ■
  356. You can do
  357. ```python
  358. >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5, mask_function=sliding_window_causal_mask_function(3))
  359. >>> tensor([[[[ True, False, False, False, False],
  360. [ True, True, False, False, False],
  361. [ True, True, True, False, False],
  362. [False, True, True, True, False],
  363. [False, False, True, True, True]]]])
  364. ```
  365. ## Creating a chunked attention mask
  366. To create the following chunked attention mask (`chunk_size=3`):
  367. 0 ■ ⬚ ⬚ ⬚ ⬚
  368. 1 ■ ■ ⬚ ⬚ ⬚
  369. 2 ■ ■ ■ ⬚ ⬚
  370. 3 ⬚ ⬚ ⬚ ■ ⬚
  371. 4 ⬚ ⬚ ⬚ ■ ■
  372. You can do
  373. ```python
  374. >>> sdpa_mask(batch_size=1, q_length=5, kv_length=5, mask_function=chunked_causal_mask_function(3, torch.zeros(1, dtype=int)))
  375. >>> tensor([[[[ True, False, False, False, False],
  376. [ True, True, False, False, False],
  377. [ True, True, True, False, False],
  378. [False, False, False, True, False],
  379. [False, False, False, True, True]]]])
  380. ```
  381. """
  382. # For BC on `cache_positions` that used to be an arg at the position of `q_length`
  383. if isinstance(q_length, torch.Tensor):
  384. logger.warning_once(
  385. "`cache_position` is deprecated as an arg, and will be removed in Transformers v5.6. Please use `q_length` and "
  386. "`q_offset` instead, similarly to `kv_length` and `kv_offset`"
  387. )
  388. q_length, q_offset = q_length.shape[0], q_length[0].to(device)
  389. # Potentially pad the 2D mask
  390. padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
  391. # Under specific conditions, we can avoid materializing the mask
  392. # 1. Causal masks can rely on the `is_causal` argument
  393. # 2. Bidirectional do not need any further processing (no bias)
  394. if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, kv_offset, local_size):
  395. return None
  396. if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask, kv_length, local_size):
  397. return None
  398. # Potentially add the padding 2D mask
  399. if padding_mask is not None:
  400. mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
  401. batch_arange = torch.arange(batch_size, device=device)
  402. head_arange = torch.arange(1, device=device)
  403. q_arange = torch.arange(q_length, device=device) + q_offset
  404. kv_arange = torch.arange(kv_length, device=device) + kv_offset
  405. # Actual mask creation
  406. # Option 1: Fast non-vmap mask creation (default)
  407. if not use_vmap:
  408. # Apply mask function element-wise through broadcasting
  409. attention_mask = mask_function(*_non_vmap_expansion_sdpa(batch_arange, head_arange, q_arange, kv_arange))
  410. # Expand the mask to match batch size and query length if they weren't used in the mask function
  411. attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
  412. # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
  413. elif _is_torch_greater_or_equal_than_2_6:
  414. # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
  415. # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it
  416. # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for k and kv indices
  417. with TransformGetItemToIndex():
  418. attention_mask = _vmap_expansion_sdpa(mask_function)(batch_arange, head_arange, q_arange, kv_arange)
  419. # Option 3: Error out since it indicates that the user did something custom, which they shouldn't have (torch<2.6)
  420. else:
  421. raise ValueError(
  422. "The vmap functionality for mask creation is only supported from torch>=2.6. "
  423. "Please update your torch version or use `use_vmap=False` with index-based masks."
  424. )
  425. # Due to a bug in versions of torch<2.5, we need to update the mask in case a query is not attending to any
  426. # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
  427. if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
  428. attention_mask = attention_mask | torch.all(~attention_mask, dim=-1, keepdim=True)
  429. return attention_mask
  430. def eager_mask(
  431. batch_size: int,
  432. q_length: int,
  433. kv_length: int,
  434. q_offset: int = 0,
  435. kv_offset: int = 0,
  436. mask_function: Callable = causal_mask_function,
  437. attention_mask: torch.Tensor | None = None,
  438. dtype: torch.dtype = torch.float32,
  439. allow_is_bidirectional_skip: bool = False,
  440. use_vmap: bool = False,
  441. device: torch.device | str = "cpu",
  442. **kwargs,
  443. ) -> torch.Tensor:
  444. """
  445. Create a 4D float mask of shape `(batch_size, 1, query_length, kv_length)` where a value of 0 indicates that
  446. the element should take part in the attention computation, and -inf (minimum value for the given `dtype`) that
  447. it should not.
  448. Args:
  449. batch_size (`int`):
  450. The batch size of the input sequence.
  451. q_length (`int`):
  452. The size that the query states will have during the attention computation.
  453. kv_length (`int`):
  454. The size that the key and value states will have during the attention computation.
  455. q_offset (`int`, optional):
  456. An optional offset to indicate at which first position the query states will refer to.
  457. kv_offset (`int`, optional):
  458. An optional offset to indicate at which first position the key and values states will refer to.
  459. mask_function (`Callable`):
  460. The mask factory function describing the mask pattern.
  461. attention_mask (`torch.Tensor`, optional):
  462. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
  463. dtype (`torch.dtype`, optional):
  464. The dtype to use for the mask. By default, `torch.float32`.
  465. allow_is_bidirectional_skip (`bool`, optional):
  466. Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
  467. i.e. full attention without any padding. Default to `False`.
  468. use_vmap (`bool`, optional):
  469. Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
  470. index-based (for the cost of speed performance). By default `False`.
  471. device (`torch.device` or `str`, optional):
  472. An optional device to create the mask on.
  473. """
  474. # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
  475. _ = kwargs.pop("allow_is_causal_skip", None)
  476. _ = kwargs.pop("allow_torch_fix", None)
  477. mask = sdpa_mask(
  478. batch_size=batch_size,
  479. q_length=q_length,
  480. kv_length=kv_length,
  481. q_offset=q_offset,
  482. kv_offset=kv_offset,
  483. mask_function=mask_function,
  484. attention_mask=attention_mask,
  485. allow_is_causal_skip=False,
  486. allow_is_bidirectional_skip=allow_is_bidirectional_skip,
  487. allow_torch_fix=False,
  488. use_vmap=use_vmap,
  489. device=device,
  490. **kwargs,
  491. )
  492. # only bidirectional masks can be skipped, otherwise we convert bool -> float
  493. if mask is not None:
  494. min_dtype = torch.finfo(dtype).min
  495. # we need 0s where the tokens should be taken into account, and -inf otherwise (mask is already of boolean type)
  496. mask = torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), min_dtype)
  497. return mask
  498. def flash_attention_mask(
  499. batch_size: int,
  500. q_length: int,
  501. kv_length: int,
  502. q_offset: int = 0,
  503. kv_offset: int = 0,
  504. mask_function: Callable = causal_mask_function,
  505. attention_mask: torch.Tensor | None = None,
  506. **kwargs,
  507. ):
  508. """
  509. Create the attention mask necessary to use FA2. Since FA2 is un-padded by definition, here we simply return
  510. `None` if the mask is fully causal, or we return the 2D mask which will then be used to extract the seq_lens.
  511. We just slice it in case of sliding window.
  512. Args:
  513. batch_size (`int`):
  514. The batch size of the input sequence.
  515. q_length (`int`):
  516. The size that the query states will have during the attention computation.
  517. kv_length (`int`):
  518. The size that the key and value states will have during the attention computation.
  519. q_offset (`int`, optional):
  520. An optional offset to indicate at which first position the query states will refer to.
  521. kv_offset (`int`, optional):
  522. An optional offset to indicate at which first position the key and values states will refer to.
  523. mask_function (`Callable`):
  524. The mask factory function describing the mask pattern.
  525. attention_mask (`torch.Tensor`, optional):
  526. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
  527. """
  528. if attention_mask is not None:
  529. # Here we need to slice from the right if using sliding or chunked (for full attention, this is equivalent to doing nothing)
  530. attention_mask = attention_mask[:, -kv_length:]
  531. # We only return an actual mask if there is at least 1 padding token, otherwise we return `None` and use `is_causal` in FA2
  532. # (note that the attention_mask is a boolean dtype here)
  533. if attention_mask.all():
  534. attention_mask = None
  535. return attention_mask
  536. def flex_attention_mask(
  537. batch_size: int,
  538. q_length: int,
  539. kv_length: int,
  540. q_offset: int = 0,
  541. kv_offset: int = 0,
  542. mask_function: Callable = causal_mask_function,
  543. attention_mask: torch.Tensor | None = None,
  544. device: torch.device | str = "cpu",
  545. **kwargs,
  546. ) -> BlockMask:
  547. """
  548. Create a 4D block mask which is a compressed representation of the full 4D block causal mask. BlockMask is essential
  549. for performant computation of flex attention. See: https://pytorch.org/blog/flexattention/
  550. Args:
  551. batch_size (`int`):
  552. The batch size of the input sequence.
  553. q_length (`int`):
  554. The size that the query states will have during the attention computation.
  555. kv_length (`int`):
  556. The size that the key and value states will have during the attention computation.
  557. q_offset (`int`, optional):
  558. An optional offset to indicate at which first position the query states will refer to.
  559. kv_offset (`int`, optional):
  560. An optional offset to indicate at which first position the key and values states will refer to.
  561. mask_function (`Callable`):
  562. The mask factory function describing the mask pattern.
  563. attention_mask (`torch.Tensor`, optional):
  564. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
  565. device (`torch.device` or `str`, optional):
  566. An optional device to create the mask on.
  567. """
  568. # For BC on `cache_positions` that used to be an arg at the position of `q_length`
  569. if isinstance(q_length, torch.Tensor):
  570. logger.warning_once(
  571. "`cache_position` is deprecated as an arg, and will be removed in Transformers v5.6. Please use `q_length` and "
  572. "`q_offset` instead, similarly to `kv_length` and `kv_offset`"
  573. )
  574. q_length, q_offset = q_length.shape[0], q_length[0].to(device)
  575. # Potentially add the padding 2D mask
  576. if attention_mask is not None:
  577. # Older torch (2.5.x) cannot handle sequences not in multiples of 128 (default block size)
  578. # Hence we pad to multiples of this as a minimum to ensure this
  579. pad_len = ((attention_mask.shape[1] // flex_default_block_size) + 1) * flex_default_block_size
  580. pad_len = pad_len - attention_mask.shape[1]
  581. if not _is_torch_greater_or_equal_than_2_6 and pad_len > 0:
  582. attention_mask = torch.nn.functional.pad(attention_mask, value=0, pad=(0, pad_len))
  583. padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
  584. mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
  585. # Add the offsets on top (because flex interface only allows length, not start and end indices)
  586. mask_function = add_offsets_to_mask_function(mask_function, q_offset, kv_offset)
  587. # Finally create the block mask
  588. block_mask = create_block_mask(
  589. mask_mod=mask_function,
  590. B=batch_size,
  591. H=None,
  592. Q_LEN=q_length,
  593. KV_LEN=kv_length,
  594. device=device,
  595. _compile=_is_torch_greater_or_equal_than_2_6,
  596. )
  597. return block_mask
  598. class AttentionMaskInterface(GeneralInterface):
  599. # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if
  600. # a new instance is created (in order to locally override a given function)
  601. _global_mapping = {
  602. "sdpa": sdpa_mask,
  603. "eager": eager_mask,
  604. "flash_attention_2": flash_attention_mask,
  605. "flash_attention_3": flash_attention_mask,
  606. "flash_attention_4": flash_attention_mask,
  607. "flex_attention": flex_attention_mask,
  608. }
# Module-level `AttentionMaskInterface` instance, shared by all models which do not need to overwrite any of the
# registered mask creation functions. It is indexed with `config._attn_implementation` to pick the function to use.
ALL_MASK_ATTENTION_FUNCTIONS: AttentionMaskInterface = AttentionMaskInterface()
  611. def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor | None:
  612. """
  613. Find the indices of the sequence to which each new query token in the sequence belongs when using packed
  614. tensor format (i.e. several sequences packed in the same batch dimension).
  615. Args:
  616. position_ids (`torch.Tensor`)
  617. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  618. Returns:
  619. A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we
  620. pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]].
  621. If the there is only one sequence in each batch item (and we don't compile), then we return `None` indicating
  622. no packed sequences. This is the same as [[0, 0, 0, 0, 0, 0]] for the example above.
  623. """
  624. # What separate different sequences is when 2 consecutive positions_ids are separated by more than 1. So
  625. # taking the diff (by prepending the first value - 1 to keep correct indexing) and applying cumsum to the result
  626. # gives exactly the sequence indices
  627. # Note that we assume that a single sequence cannot span several batch dimensions, i.e. 1 single sequence
  628. # cannot be part of the end of the first batch dim and the start of the 2nd one for example
  629. first_dummy_value = position_ids[:, :1] - 1 # We just need the diff on this first value to be 1
  630. position_diff = torch.diff(position_ids, prepend=first_dummy_value, dim=-1)
  631. packed_sequence_mask = (position_diff != 1).cumsum(-1)
  632. # Sadly this is a dynamic control flow, so we cannot enable this check on anything compile related
  633. if not is_tracing(packed_sequence_mask) and (packed_sequence_mask[:, -1] == 0).all():
  634. return None
  635. return packed_sequence_mask
  636. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  637. def _preprocess_mask_arguments(
  638. config: PreTrainedConfig,
  639. inputs_embeds: torch.Tensor,
  640. attention_mask: torch.Tensor | BlockMask | None,
  641. past_key_values: Cache | None,
  642. position_ids: torch.Tensor | None,
  643. layer_idx: int | None,
  644. encoder_hidden_states: torch.Tensor | None = None,
  645. ) -> tuple[bool, torch.Tensor | BlockMask | None, int, int]:
  646. """
  647. Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the
  648. key-value length and offsets, and if we should early exit or not.
  649. Args:
  650. config (`PreTrainedConfig`):
  651. The model config.
  652. inputs_embeds (`torch.Tensor`):
  653. The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
  654. batch size, query length and dtype.
  655. attention_mask (`torch.Tensor`, optional):
  656. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
  657. It can also be an already prepared 4D mask, in which case it is returned as-is.
  658. past_key_values (`Cache`, optional):
  659. The past key values, if we use a cache.
  660. position_ids (`torch.Tensor`, optional)
  661. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  662. layer_idx (`int`, optional):
  663. If `past_key_values` is not None, this is the layer index of the cache from which to get the key-value
  664. length and offset. Indeed, for hybrid caches, different layers may return different lengths.
  665. encoder_hidden_states (`torch.Tensor`, optional):
  666. The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
  667. `inputs_embeds` to infer the kv length.
  668. Returns:
  669. early_exit (`bool`):
  670. Whether we should early exit mask creation, and return the mask as-is.
  671. attention_mask (`torch.Tensor` or `BlockMask` or `None`):
  672. The attention mask to either return immediately, or to use in downstream mask creation.
  673. packed_sequence_mask (`torch.Tensor`, optional):
  674. In case we detected packed sequence format, this is a tensor where each similar integer indicates that
  675. the tokens belong to the same sequence.
  676. q_length (`int`):
  677. The size that the query states will have during the attention computation.
  678. kv_length (`int`):
  679. The size that the key and value states will have during the attention computation.
  680. q_offset (`int`, optional):
  681. An optional offset to indicate at which first position the query states will refer to.
  682. kv_offset (`int`):
  683. An offset to indicate at which first position the key and values states will refer to.
  684. """
  685. # If the mask is already 4D, simply return as-is (it was already prepared, or it is custom)
  686. if isinstance(attention_mask, (torch.Tensor, BlockMask)) and len(attention_mask.shape) == 4:
  687. return True, attention_mask, None, None, None, None, None
  688. # For TGI/vLLM backends, or other custom attention without equivalent mask creation: we don't need a mask!
  689. # Note: it's not ideal to check the `_global_mapping` attribute instead of the object itself, however otherwise
  690. # full graph dynamo tracing (i.e. torch.export or compile with `fullgraph=True`) will fail on Python<3.11
  691. # with `torch._dynamo.exc.Unsupported: 'inline in skipfiles:Mapping.__contains__ | __contains__, skipped
  692. # according trace_rules.lookup SKIP_DIRS'` -- can be removed when we require Python>=3.11
  693. if config._attn_implementation not in ALL_MASK_ATTENTION_FUNCTIONS._global_mapping:
  694. return True, None, None, None, None, None, None
  695. # Move the mask to correct device, and potentially switch dtype for efficiency
  696. if attention_mask is not None and attention_mask.ndim == 2:
  697. attention_mask = attention_mask.to(device=inputs_embeds.device, dtype=torch.bool)
  698. q_length = inputs_embeds.shape[1]
  699. # If using a cache, it can give all information about mask sizes based on seen tokens
  700. if past_key_values is not None:
  701. q_offset = past_key_values.get_seq_length()
  702. # To avoid graph breaks, StaticLayer return a tensor instead of int -> this has no impact on the ops, but we
  703. # need the correct device
  704. q_offset = q_offset.to(inputs_embeds.device) if isinstance(q_offset, torch.Tensor) else q_offset
  705. kv_length, kv_offset = past_key_values.get_mask_sizes(q_length, layer_idx)
  706. # Otherwise, we infer based on our input
  707. else:
  708. q_offset = 0
  709. # 1. Rely on input directly
  710. if attention_mask is None:
  711. # For encoder-decoders, use encoder_hidden_states to infer kv_length if provided
  712. kv_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else q_length
  713. kv_offset = 0
  714. # 2. Rely on the mask instead - needed for special cases like prefix tuning in PEFT
  715. #
  716. # This is a very unique and special case where an encoder utilizes a cache and expects its length
  717. # to be accounted for (usually, they should never use a cache). In general, the mask should always
  718. # match with the input sizes nonetheless (i.e. it does not affect others).
  719. # Conclusion: "prefix tuning is evil"
  720. else:
  721. kv_length, kv_offset = attention_mask.shape[-1], 0
  722. # We check the position_ids for potential packed sequence format (only if the 2D attention mask is explicitly None,
  723. # and we don't have past_key_values, i.e. generally a training setup)
  724. packed_sequence_mask = None
  725. if position_ids is not None and attention_mask is None and past_key_values is None:
  726. batch_size = inputs_embeds.shape[0]
  727. # The position ids are sometimes just unsqueezed, without being expanded
  728. if batch_size != position_ids.shape[0]:
  729. position_ids = position_ids.expand(batch_size, -1)
  730. packed_sequence_mask = find_packed_sequence_indices(position_ids)
  731. return False, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset
  732. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  733. def create_causal_mask(
  734. config: PreTrainedConfig,
  735. inputs_embeds: torch.Tensor,
  736. attention_mask: torch.Tensor | None,
  737. cache_position: torch.Tensor | None = None, # not used anymore but kept for BC
  738. *,
  739. past_key_values: Cache | None,
  740. position_ids: torch.Tensor | None = None,
  741. or_mask_function: Callable | None = None,
  742. and_mask_function: Callable | None = None,
  743. ) -> torch.Tensor | BlockMask | None:
  744. """
  745. Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
  746. has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
  747. to what is needed in the `modeling_xxx.py` files).
  748. Args:
  749. config (`PreTrainedConfig`):
  750. The model config.
  751. inputs_embeds (`torch.Tensor`):
  752. The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
  753. batch size, query length and dtype.
  754. attention_mask (`torch.Tensor`, optional):
  755. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
  756. It can also be an already prepared 4D mask, in which case it is returned as-is.
  757. cache_position (`torch.Tensor`):
  758. Deprecated and unused.
  759. past_key_values (`Cache`, optional):
  760. The past key values, if we use a cache.
  761. position_ids (`torch.Tensor`, optional)
  762. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  763. or_mask_function (`Callable`, optional):
  764. An optional mask function to combine with the causal mask function (by doing the union of both). This is
  765. useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
  766. and_mask_function (`Callable`, optional):
  767. An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
  768. useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
  769. """
  770. # Power feature: if `is_causal` is False, then fallback to bi-directional mask for bi-directional attention.
  771. # It allows to use decoder-only models with bi-directional attention as well
  772. if not getattr(config, "is_causal", True):
  773. return create_bidirectional_mask(
  774. config,
  775. inputs_embeds,
  776. attention_mask,
  777. past_key_values=past_key_values,
  778. or_mask_function=or_mask_function,
  779. and_mask_function=and_mask_function,
  780. )
  781. # If we have an hybrid cache structure, here we want to create the mask for the full layers
  782. if hasattr(past_key_values, "is_sliding") and False in past_key_values.is_sliding:
  783. layer_idx = past_key_values.is_sliding.index(False)
  784. else:
  785. layer_idx = 0
  786. early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
  787. _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
  788. )
  789. if early_exit:
  790. return attention_mask
  791. batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
  792. mask_factory_function = causal_mask_function
  793. mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
  794. # Defaulting to using non-vmap based mask creations except when detecting
  795. # users passing custom mask functions (as we cannot guarantee that they
  796. # are properly index-based as required by our implementation).
  797. use_vmap = False
  798. # Do not allow skip if we are compiling (this is to match BC)
  799. # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
  800. if _is_torch_xpu_available:
  801. # Do not allow skip if we are compiling for decoding, but for prefill, we still allow skip to optimization the perf of 1st token generation
  802. allow_is_causal_skip = not (getattr(past_key_values, "is_compileable", False) and q_length == 1)
  803. else:
  804. allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
  805. # Allow slight deviations from causal mask
  806. # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
  807. # padding mask, etc) as the resulting mask may otherwise not be correct!
  808. if or_mask_function is not None:
  809. if not _is_torch_greater_or_equal_than_2_6:
  810. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  811. mask_factory_function = or_masks(mask_factory_function, or_mask_function)
  812. allow_is_causal_skip = False
  813. use_vmap = True
  814. if and_mask_function is not None:
  815. if not _is_torch_greater_or_equal_than_2_6:
  816. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  817. mask_factory_function = and_masks(mask_factory_function, and_mask_function)
  818. allow_is_causal_skip = False
  819. use_vmap = True
  820. # If we detected packing format
  821. if packed_sequence_mask is not None:
  822. mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
  823. allow_is_causal_skip = False
  824. # We now create the mask
  825. causal_mask = mask_interface(
  826. batch_size=batch_size,
  827. q_length=q_length,
  828. kv_length=kv_length,
  829. q_offset=q_offset,
  830. kv_offset=kv_offset,
  831. mask_function=mask_factory_function,
  832. attention_mask=attention_mask,
  833. allow_is_causal_skip=allow_is_causal_skip, # additional kwarg for sdpa
  834. dtype=dtype, # Additional kwarg for eager
  835. config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface
  836. use_vmap=use_vmap, # Short-circuit to non-vmap expansions for the mask
  837. device=device,
  838. )
  839. return causal_mask
  840. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  841. def create_bidirectional_mask(
  842. config: PreTrainedConfig,
  843. inputs_embeds: torch.Tensor,
  844. attention_mask: torch.Tensor | None,
  845. encoder_hidden_states: torch.Tensor | None = None,
  846. past_key_values: Cache | None = None,
  847. or_mask_function: Callable | None = None,
  848. and_mask_function: Callable | None = None,
  849. ) -> torch.Tensor | BlockMask | None:
  850. """
  851. Create a standard bidirectional mask based on the attention implementation used (stored in the config).
  852. Args:
  853. config (`PreTrainedConfig`):
  854. The model config.
  855. inputs_embeds (`torch.Tensor`):
  856. The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
  857. such as the batch size, query length, dtype, and device.
  858. past_key_values (`Cache`, optional):
  859. The past key values, if we use a cache.
  860. attention_mask (`torch.Tensor`, optional):
  861. The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
  862. It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
  863. in which case it is returned as-is.
  864. encoder_hidden_states (`torch.Tensor`, optional):
  865. The input embeddings of shape (batch_size, kv_length, hidden_dim). If provided, it is used instead of
  866. `inputs_embeds` to infer the batch size, kv length and dtype.
  867. or_mask_function (`Callable`, optional):
  868. An optional mask function to combine with the base mask function (by doing the union of both). This is
  869. useful to easily overlay another mask on top, for example for image tokens handling.
  870. and_mask_function (`Callable`, optional):
  871. An optional mask function to combine with the base mask function (by doing the intersection of both). This is
  872. useful to easily overlay another mask on top, for example for image tokens handling.
  873. """
  874. # We ignore a few irrelevant arguments at the end as we do not have a (growing) cache here
  875. early_exit, attention_mask, _, q_length, kv_length, q_offset, kv_offset = _preprocess_mask_arguments(
  876. config, inputs_embeds, attention_mask, past_key_values, None, 0, encoder_hidden_states
  877. )
  878. if early_exit:
  879. return attention_mask
  880. embeds = encoder_hidden_states if encoder_hidden_states is not None else inputs_embeds
  881. batch_size, dtype, device = embeds.shape[0], embeds.dtype, embeds.device
  882. mask_factory_function = bidirectional_mask_function
  883. mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
  884. # Allow skipping the mask creation except we have additional masking operators (and/or masks)
  885. allow_is_bidirectional_skip = True
  886. # Defaulting to using non-vmap based mask creations except when detecting
  887. # users passing custom mask functions (as we cannot guarantee that they
  888. # are properly index-based as required by our implementation).
  889. use_vmap = False
  890. # Allow slight deviations from the base mask
  891. # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
  892. # padding mask, etc) as the resulting mask may otherwise not be correct!
  893. if or_mask_function is not None:
  894. if not _is_torch_greater_or_equal_than_2_6:
  895. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  896. mask_factory_function = or_masks(mask_factory_function, or_mask_function)
  897. allow_is_bidirectional_skip = False
  898. use_vmap = True
  899. if and_mask_function is not None:
  900. if not _is_torch_greater_or_equal_than_2_6:
  901. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  902. mask_factory_function = and_masks(mask_factory_function, and_mask_function)
  903. allow_is_bidirectional_skip = False
  904. use_vmap = True
  905. # We now create the mask
  906. attention_mask = mask_interface(
  907. batch_size=batch_size,
  908. q_length=q_length,
  909. kv_length=kv_length,
  910. q_offset=q_offset,
  911. kv_offset=kv_offset,
  912. mask_function=mask_factory_function,
  913. attention_mask=attention_mask,
  914. # Additional kwargs for sdpa
  915. allow_is_causal_skip=False,
  916. allow_is_bidirectional_skip=allow_is_bidirectional_skip,
  917. dtype=dtype, # Additional kwarg for eager
  918. config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface
  919. use_vmap=use_vmap, # Short-circuit to non-vmap expansions for the mask
  920. device=device,
  921. )
  922. return attention_mask
  923. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  924. def create_sliding_window_causal_mask(
  925. config: PreTrainedConfig,
  926. inputs_embeds: torch.Tensor,
  927. attention_mask: torch.Tensor | None,
  928. cache_position: torch.Tensor | None = None, # not used anymore but kept for BC
  929. *,
  930. past_key_values: Cache | None,
  931. position_ids: torch.Tensor | None = None,
  932. or_mask_function: Callable | None = None,
  933. and_mask_function: Callable | None = None,
  934. ) -> torch.Tensor | BlockMask | None:
  935. """
  936. Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
  937. of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this
  938. function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
  939. `modeling_xxx.py` files).
  940. Args:
  941. config (`PreTrainedConfig`):
  942. The model config.
  943. inputs_embeds (`torch.Tensor`):
  944. The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
  945. batch size, query length and dtype.
  946. attention_mask (`torch.Tensor`, optional):
  947. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
  948. It can also be an already prepared 4D mask, in which case it is returned as-is.
  949. cache_position (`torch.Tensor`):
  950. Deprecated and unused.
  951. past_key_values (`Cache`, optional):
  952. The past key values, if we use a cache.
  953. position_ids (`torch.Tensor`, optional)
  954. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  955. or_mask_function (`Callable`, optional):
  956. An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
  957. useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
  958. and_mask_function (`Callable`, optional):
  959. An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
  960. useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
  961. """
  962. # Power feature: if `is_causal` is False, then fallback to bi-directional mask for bi-directional attention
  963. # It allows to use decoder-only models with bi-directional attention as well
  964. if not getattr(config, "is_causal", True):
  965. return create_bidirectional_sliding_window_mask(
  966. config,
  967. inputs_embeds,
  968. attention_mask,
  969. past_key_values=past_key_values,
  970. or_mask_function=or_mask_function,
  971. and_mask_function=and_mask_function,
  972. )
  973. # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
  974. if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
  975. layer_idx = past_key_values.is_sliding.index(True)
  976. else:
  977. layer_idx = 0
  978. early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
  979. _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
  980. )
  981. if early_exit:
  982. return attention_mask
  983. sliding_window = getattr(config, "sliding_window", None)
  984. if sliding_window is None:
  985. raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")
  986. batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
  987. mask_factory_function = sliding_window_causal_mask_function(sliding_window)
  988. mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
  989. # Defaulting to using non-vmap based mask creations except when detecting
  990. # users passing custom mask functions (as we cannot guarantee that they
  991. # are properly index-based as required by our implementation).
  992. use_vmap = False
  993. # Do not allow skip if we are compiling (this is to match BC)
  994. # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
  995. allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
  996. # Allow slight deviations from causal mask
  997. # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
  998. # padding mask, etc) as the resulting mask may otherwise not be correct!
  999. if or_mask_function is not None:
  1000. if not _is_torch_greater_or_equal_than_2_6:
  1001. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1002. mask_factory_function = or_masks(mask_factory_function, or_mask_function)
  1003. allow_is_causal_skip = False
  1004. use_vmap = True
  1005. if and_mask_function is not None:
  1006. if not _is_torch_greater_or_equal_than_2_6:
  1007. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1008. mask_factory_function = and_masks(mask_factory_function, and_mask_function)
  1009. allow_is_causal_skip = False
  1010. use_vmap = True
  1011. # If we detected packing format
  1012. if packed_sequence_mask is not None:
  1013. mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
  1014. allow_is_causal_skip = False
  1015. # We now create the mask
  1016. causal_mask = mask_interface(
  1017. batch_size=batch_size,
  1018. q_length=q_length,
  1019. kv_length=kv_length,
  1020. q_offset=q_offset,
  1021. kv_offset=kv_offset,
  1022. mask_function=mask_factory_function,
  1023. attention_mask=attention_mask,
  1024. allow_is_causal_skip=allow_is_causal_skip, # additional kwarg for sdpa
  1025. local_size=sliding_window, # Additional kwarg for sdpa
  1026. dtype=dtype, # Additional kwarg for eager
  1027. config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface
  1028. use_vmap=use_vmap, # Short-circuit to non-vmap expansions for the mask
  1029. device=device,
  1030. )
  1031. return causal_mask
  1032. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  1033. def create_bidirectional_sliding_window_mask(
  1034. config: PreTrainedConfig,
  1035. inputs_embeds: torch.Tensor,
  1036. attention_mask: torch.Tensor | None,
  1037. past_key_values: Cache | None = None,
  1038. or_mask_function: Callable | None = None,
  1039. and_mask_function: Callable | None = None,
  1040. ) -> torch.Tensor | BlockMask | None:
  1041. """
  1042. Create a standard bidirectional sliding window mask based on the attention implementation used (stored in the config).
  1043. Args:
  1044. config (`PreTrainedConfig`):
  1045. The model config.
  1046. inputs_embeds (`torch.Tensor`):
  1047. The input embeddings of shape (batch_size, query_length, hidden_dim). This is only used to infer metadata
  1048. such as the batch size, query length, dtype, and device.
  1049. past_key_values (`Cache`, optional):
  1050. The past key values, if we use a cache.
  1051. attention_mask (`torch.Tensor`, optional):
  1052. The 2D attention mask corresponding to padded tokens of shape (batch_size, kv_length).
  1053. It can also be an already prepared 4D mask of shape (batch_size, 1, query_length, kv_length),
  1054. in which case it is returned as-is.
  1055. or_mask_function (`Callable`, optional):
  1056. An optional mask function to combine with the base mask function (by doing the union of both). This is
  1057. useful to easily overlay another mask on top, for example for image tokens handling.
  1058. and_mask_function (`Callable`, optional):
  1059. An optional mask function to combine with the base mask function (by doing the intersection of both). This is
  1060. useful to easily overlay another mask on top, for example for image tokens handling.
  1061. """
  1062. # We ignore a few irrelevant arguments at the end as we do not have a (growing) cache here
  1063. early_exit, attention_mask, _, q_length, kv_length, q_offset, kv_offset = _preprocess_mask_arguments(
  1064. config, inputs_embeds, attention_mask, past_key_values, None, 0
  1065. )
  1066. if early_exit:
  1067. return attention_mask
  1068. sliding_window = getattr(config, "sliding_window", None)
  1069. if sliding_window is None:
  1070. raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")
  1071. batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
  1072. mask_factory_function = sliding_window_bidirectional_mask_function(sliding_window)
  1073. mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
  1074. use_vmap = False
  1075. allow_is_bidirectional_skip = True
  1076. if or_mask_function is not None:
  1077. if not _is_torch_greater_or_equal_than_2_6:
  1078. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1079. mask_factory_function = or_masks(mask_factory_function, or_mask_function)
  1080. allow_is_bidirectional_skip = False
  1081. use_vmap = True
  1082. if and_mask_function is not None:
  1083. if not _is_torch_greater_or_equal_than_2_6:
  1084. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1085. mask_factory_function = and_masks(mask_factory_function, and_mask_function)
  1086. allow_is_bidirectional_skip = False
  1087. use_vmap = True
  1088. attention_mask = mask_interface(
  1089. batch_size=batch_size,
  1090. q_length=q_length,
  1091. kv_length=kv_length,
  1092. q_offset=q_offset,
  1093. kv_offset=kv_offset,
  1094. mask_function=mask_factory_function,
  1095. attention_mask=attention_mask,
  1096. allow_is_causal_skip=False,
  1097. allow_is_bidirectional_skip=allow_is_bidirectional_skip,
  1098. local_size=sliding_window, # Additional kwarg for sdpa
  1099. dtype=dtype, # Additional kwarg for eager
  1100. config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface
  1101. use_vmap=use_vmap, # Short-circuit to non-vmap expansions for the mask
  1102. device=device,
  1103. )
  1104. return attention_mask
  1105. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  1106. def create_chunked_causal_mask(
  1107. config: PreTrainedConfig,
  1108. inputs_embeds: torch.Tensor,
  1109. attention_mask: torch.Tensor | None,
  1110. cache_position: torch.Tensor | None = None, # not used anymore but kept for BC
  1111. *,
  1112. past_key_values: Cache | None,
  1113. position_ids: torch.Tensor | None = None,
  1114. or_mask_function: Callable | None = None,
  1115. and_mask_function: Callable | None = None,
  1116. ) -> torch.Tensor | BlockMask | None:
  1117. """
  1118. Create a chunked attention causal mask based on the attention implementation used (stored in the config). This type
  1119. of attention pattern was mostly democratized by Llama4. If `past_key_values` has an hybrid cache structure, this
  1120. function will return the mask corresponding to one of the "chunked_attention" layers (to align to what is needed in the
  1121. `modeling_xxx.py` files).
  1122. Args:
  1123. config (`PreTrainedConfig`):
  1124. The model config.
  1125. inputs_embeds (`torch.Tensor`):
  1126. The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
  1127. batch size, query length and dtype.
  1128. attention_mask (`torch.Tensor`, optional):
  1129. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
  1130. It can also be an already prepared 4D mask, in which case it is returned as-is.
  1131. cache_position (`torch.Tensor`):
  1132. Deprecated and unused.
  1133. past_key_values (`Cache`, optional):
  1134. The past key values, if we use a cache.
  1135. position_ids (`torch.Tensor`, optional)
  1136. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  1137. or_mask_function (`Callable`, optional):
  1138. An optional mask function to combine with the chunked causal mask function (by doing the union of both). This is
  1139. useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
  1140. and_mask_function (`Callable`, optional):
  1141. An optional mask function to combine with the chunked causal mask function (by doing the intersection of both). This is
  1142. useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
  1143. """
  1144. # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
  1145. if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
  1146. layer_idx = past_key_values.is_sliding.index(True)
  1147. else:
  1148. layer_idx = 0
  1149. early_exit, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset = (
  1150. _preprocess_mask_arguments(config, inputs_embeds, attention_mask, past_key_values, position_ids, layer_idx)
  1151. )
  1152. if early_exit:
  1153. return attention_mask
  1154. chunk_size = getattr(config, "attention_chunk_size", None)
  1155. if chunk_size is None:
  1156. raise ValueError("Could not find an `attention_chunk_size` argument in the config, or it is not set")
  1157. # Raise if using chunked attention on context too large with FA
  1158. if is_flash_attention_requested(config) and kv_length + kv_offset > chunk_size:
  1159. raise ValueError(
  1160. "Flash attention cannot handle chunked attention, and the key-value length is larger than the chunk size so the "
  1161. "chunked pattern cannot be respected. You should use another `attn_implementation` when instantiating the model"
  1162. )
  1163. batch_size, dtype, device = inputs_embeds.shape[0], inputs_embeds.dtype, inputs_embeds.device
  1164. # For chunked attention and batched inputs, we need to take the number of left padding tokens into account
  1165. # to start the chunk from the actual start of the sequence for the padded sequence
  1166. if attention_mask is not None:
  1167. # Only count the left padding tokens, not all of them
  1168. left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
  1169. else:
  1170. left_padding_tokens = torch.zeros(batch_size, device=device, dtype=int)
  1171. mask_factory_function = chunked_causal_mask_function(chunk_size, left_padding_tokens)
  1172. mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
  1173. # Defaulting to using non-vmap based mask creations except when detecting
  1174. # users passing custom mask functions (as we cannot guarantee that they
  1175. # are properly index-based as required by our implementation).
  1176. use_vmap = False
  1177. # Do not allow skip if we are compiling (this is to match BC)
  1178. # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
  1179. allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
  1180. # Allow slight deviations from causal mask
  1181. # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
  1182. # padding mask, etc) as the resulting mask may otherwise not be correct!
  1183. if or_mask_function is not None:
  1184. if not _is_torch_greater_or_equal_than_2_6:
  1185. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1186. mask_factory_function = or_masks(mask_factory_function, or_mask_function)
  1187. allow_is_causal_skip = False
  1188. use_vmap = True
  1189. if and_mask_function is not None:
  1190. if not _is_torch_greater_or_equal_than_2_6:
  1191. raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
  1192. mask_factory_function = and_masks(mask_factory_function, and_mask_function)
  1193. allow_is_causal_skip = False
  1194. use_vmap = True
  1195. # If we detected packing format
  1196. if packed_sequence_mask is not None:
  1197. mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
  1198. allow_is_causal_skip = False
  1199. # We now create the mask
  1200. causal_mask = mask_interface(
  1201. batch_size=batch_size,
  1202. q_length=q_length,
  1203. kv_length=kv_length,
  1204. q_offset=q_offset,
  1205. kv_offset=kv_offset,
  1206. mask_function=mask_factory_function,
  1207. attention_mask=attention_mask,
  1208. allow_is_causal_skip=allow_is_causal_skip, # additional kwarg for sdpa
  1209. local_size=chunk_size, # Additional kwarg for sdpa
  1210. dtype=dtype, # Additional kwarg for eager
  1211. config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface
  1212. use_vmap=use_vmap, # Short-circuit to non-vmap expansions for the mask
  1213. device=device,
  1214. )
  1215. return causal_mask
# Maps a layer pattern name to the function creating the corresponding mask. `create_masks_for_generate` looks up
# every distinct entry of the config's `layer_types` in this table, so each pattern used there needs a key here.
LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING = {
    "full_attention": create_causal_mask,
    "sliding_attention": create_sliding_window_causal_mask,
    "chunked_attention": create_chunked_causal_mask,
}
  1221. @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
  1222. def create_masks_for_generate(
  1223. config: PreTrainedConfig,
  1224. inputs_embeds: torch.Tensor,
  1225. attention_mask: torch.Tensor | None,
  1226. past_key_values: Cache | None,
  1227. position_ids: torch.Tensor | None = None,
  1228. or_mask_function: Callable | None = None,
  1229. and_mask_function: Callable | None = None,
  1230. **kwargs,
  1231. ):
  1232. """
  1233. This function mimics how we create the masks in the `modeling_xxx.py` files, and is used in places like `generate`
  1234. in order to easily create the masks in advance, when we compile the forwards with Static caches.
  1235. Args:
  1236. config (`PreTrainedConfig`):
  1237. The model config.
  1238. inputs_embeds (`torch.Tensor`):
  1239. The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
  1240. batch size, query length and dtype.
  1241. attention_mask (`torch.Tensor`, optional):
  1242. The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
  1243. It can also be an already prepared 4D mask, in which case it is returned as-is.
  1244. past_key_values (`Cache`, optional):
  1245. The past key values, if we use a cache.
  1246. position_ids (`torch.Tensor`, optional)
  1247. A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
  1248. or_mask_function (`Callable`, optional):
  1249. An optional mask function to combine with the other mask function (by doing the union of both). This is
  1250. useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
  1251. and_mask_function (`Callable`, optional):
  1252. An optional mask function to combine with the other mask function (by doing the intersection of both). This is
  1253. useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
  1254. """
  1255. # The attribute reside in the text config for composite models
  1256. effective_config = config.get_text_config()
  1257. # Prepare the mask args
  1258. mask_kwargs = {
  1259. "config": effective_config,
  1260. "inputs_embeds": inputs_embeds,
  1261. "attention_mask": attention_mask,
  1262. "past_key_values": past_key_values,
  1263. "position_ids": position_ids,
  1264. "or_mask_function": or_mask_function,
  1265. "and_mask_function": and_mask_function,
  1266. }
  1267. # If the attribute exist, we need several masks
  1268. if hasattr(effective_config, "layer_types"):
  1269. causal_masks = {}
  1270. for layer_pattern in set(effective_config.layer_types):
  1271. causal_masks[layer_pattern] = LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING[layer_pattern](**mask_kwargs)
  1272. return causal_masks
  1273. # In this case, all layers are sliding
  1274. elif getattr(effective_config, "sliding_window", None) is not None:
  1275. return create_sliding_window_causal_mask(**mask_kwargs)
  1276. # In this case, all layers are chunked
  1277. elif getattr(effective_config, "attention_chunk_size", None) is not None:
  1278. return create_chunked_causal_mask(**mask_kwargs)
  1279. # All layers use standard causal attention
  1280. return create_causal_mask(**mask_kwargs)
# Below are utilities to pretty-print the different masks

# ANSI escape sequences used to colour terminal output (`RESET` restores the default rendering)
GREEN = "\033[92m"
YELLOW = "\033[93m"
RESET = "\033[0m"

# Module-level glyphs. NOTE(review): `get_style` returns its own glyph sets, and in the code visible here only
# `BLACK_SQUARE` is read (to build `YELLOW_SQUARE`/`GREEN_SQUARE`) - confirm whether the others are still used.
BLACK_SQUARE = "■"
WHITE_SQUARE = "⬚"
GREY_SQUARE = "∙"
LOW_TRIANGLE = "⬕"
UPPER_TRIANGLE = "⬔"
  1291. def get_style(style):
  1292. if style == "majong":
  1293. BLACK_SQUARE = "🀞" # Full block (represents "on" or active)
  1294. BLACK_SQUARE = "🀙" # Full block (represents "on" or active)
  1295. WHITE_SQUARE = "🀆" # "▒" # Light shade (represents "off" or inactive)
  1296. LOW_TRIANGLE = "🀛" # Lower left triangle (stylized indication)
  1297. UPPER_TRIANGLE = "🀛" # Upper left triangle (stylized indication)
  1298. else:
  1299. BLACK_SQUARE = "█" # Full block (represents "on" or active)
  1300. WHITE_SQUARE = "░" # "▒" # Light shade (represents "off" or inactive)
  1301. LOW_TRIANGLE = "▙" # Lower left triangle (stylized indication))
  1302. UPPER_TRIANGLE = "▜" # Upper left triangle (stylized indication)
  1303. return BLACK_SQUARE, WHITE_SQUARE, LOW_TRIANGLE, UPPER_TRIANGLE
  1304. # LOW_TRIANGLE = UPPER_TRIANGLE = "⟍" # Upper right triangle (stylized indication)
# Coloured variants of the module-level `BLACK_SQUARE`: colour escape sequence + glyph + reset sequence
YELLOW_SQUARE = f"{YELLOW}{BLACK_SQUARE}{RESET}"
GREEN_SQUARE = f"{GREEN}{BLACK_SQUARE}{RESET}"
  1307. def tensor_to_mask_visual(original_tensor: torch.Tensor, grid_size=(20, 40), style="majong") -> str:
  1308. BLACK_SQUARE, WHITE_SQUARE, LOW_TRIANGLE, UPPER_TRIANGLE = get_style(style)
  1309. h, w = original_tensor.shape
  1310. max_h, max_w = grid_size
  1311. if not (h < max_h and w < max_w):
  1312. # Preserve aspect ratio within max grid size
  1313. aspect_ratio = 2 * w / h
  1314. if aspect_ratio > 1:
  1315. w = max_w
  1316. h = min(max_h, max(1, round(max_w / aspect_ratio)))
  1317. else:
  1318. h = max_h
  1319. w = max(1, round(max_h * aspect_ratio))
  1320. # Step 1: Rescale tensor by average pooling
  1321. tensor = original_tensor.unsqueeze(0).unsqueeze(0) # Add batch and channel dimensions
  1322. tensor = F.adaptive_avg_pool2d(tensor, output_size=(h, w))[0, 0] # Remove extra dims
  1323. else:
  1324. tensor = original_tensor
  1325. # Step 3: Build the string representation
  1326. result = []
  1327. for i in range(h):
  1328. row = ""
  1329. for j in range(w):
  1330. if tensor[i, j] == 1:
  1331. row += BLACK_SQUARE
  1332. elif tensor[i, j] == 0:
  1333. row += WHITE_SQUARE
  1334. else:
  1335. if j > 0:
  1336. if tensor[i, j - 1] == 1:
  1337. row += LOW_TRIANGLE
  1338. elif tensor[i, j - 1] == 0:
  1339. row += UPPER_TRIANGLE
  1340. else:
  1341. row += BLACK_SQUARE if tensor[i, j] == 1 else WHITE_SQUARE
  1342. else:
  1343. row += (
  1344. BLACK_SQUARE
  1345. if tensor[i, j] == 1
  1346. else (
  1347. WHITE_SQUARE
  1348. if tensor[i, j] == 0
  1349. else (UPPER_TRIANGLE if tensor[i, j + 1] == 1 else LOW_TRIANGLE)
  1350. )
  1351. )
  1352. result.append(row)
  1353. return "\n".join(result)
  1354. class AttentionMask(torch.Tensor):
  1355. def __new__(cls, data, style=None):
  1356. # Create a new instance of AttentionMask as a Tensor
  1357. cls.style = style
  1358. return torch.Tensor._make_subclass(cls, data, require_grad=False)
  1359. def __init__(self, data):
  1360. # You can initialize any additional metadata here if needed
  1361. pass
  1362. def to_string(self, grid_size=(20, 40), limit=4):
  1363. """Returns a string representation of the block mask."""
  1364. dense_mask = self
  1365. *batch_dims, num_rows, num_cols = dense_mask.shape
  1366. total_vis = []
  1367. for idx, batch_idx in enumerate(itertools.product(*[range(i) for i in batch_dims])):
  1368. if idx == limit:
  1369. total_vis.append("...")
  1370. total_vis.append("To print out more, set AttentionMask.to_string(limit=N)")
  1371. total_vis.append("You can also index (AttentionMask[batch, head]) to choose a specific batch or head")
  1372. break
  1373. block_vis = tensor_to_mask_visual(dense_mask[batch_idx], grid_size=grid_size, style=self.style)
  1374. total_vis.append(block_vis)
  1375. total_vis.append(f"torch.Tensor(shape={tuple(self.shape)}, dtype={self.dtype})")
  1376. return "\n".join(total_vis)
  1377. def __repr__(self):
  1378. return self.to_string()
  1379. def __str__(self):
  1380. return self.to_string()
  1381. @classmethod
  1382. def from_tensor(cls, tensor: torch.Tensor, style: str | None = None) -> "AttentionMask":
  1383. res = cls(tensor)
  1384. res.style = style
  1385. return res