configuration_utils.py 93 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805
  1. # Copyright 2022 The HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Generation configuration class and utilities."""
  15. import copy
  16. import json
  17. import os
  18. from abc import ABC, abstractmethod
  19. from collections.abc import Callable
  20. from dataclasses import dataclass, is_dataclass
  21. from typing import TYPE_CHECKING, Any, Optional, Union
  22. from huggingface_hub import create_repo
  23. from .. import __version__
  24. from ..utils import (
  25. GENERATION_CONFIG_NAME,
  26. ExplicitEnum,
  27. PushToHubMixin,
  28. cached_file,
  29. extract_commit_hash,
  30. is_torch_available,
  31. logging,
  32. )
  33. if TYPE_CHECKING:
  34. import torch
  35. from ..configuration_utils import PreTrainedConfig
  36. from ..modeling_utils import PreTrainedModel
  37. logger = logging.get_logger(__name__)
  38. METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")
  39. STATIC_CACHE_IMPLEMENTATIONS = ("static", "offloaded_static")
  40. DYNAMIC_CACHE_IMPLEMENTATIONS = ("dynamic", "dynamic_full", "offloaded", "quantized")
  41. # All the following are redundant and deprecated, but kept for BC
  42. DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS = (
  43. "sliding_window",
  44. "hybrid",
  45. "hybrid_chunked",
  46. "offloaded_hybrid",
  47. "offloaded_hybrid_chunked",
  48. )
  49. ALL_STATIC_CACHE_IMPLEMENTATIONS = STATIC_CACHE_IMPLEMENTATIONS + DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS
  50. ALL_CACHE_IMPLEMENTATIONS = ALL_STATIC_CACHE_IMPLEMENTATIONS + DYNAMIC_CACHE_IMPLEMENTATIONS
  51. if is_torch_available():
  52. from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
  53. class GenerationMode(ExplicitEnum):
  54. """
  55. Possible generation modes, downstream of the [`~generation.GenerationMixin.generate`] method.
  56. """
  57. # Non-beam methods
  58. CONTRASTIVE_SEARCH = "contrastive_search"
  59. GREEDY_SEARCH = "greedy_search"
  60. SAMPLE = "sample"
  61. ASSISTED_GENERATION = "assisted_generation"
  62. DOLA_GENERATION = "dola_generation"
  63. # Beam methods
  64. BEAM_SEARCH = "beam_search"
  65. BEAM_SAMPLE = "beam_sample"
  66. CONSTRAINED_BEAM_SEARCH = "constrained_beam_search"
  67. GROUP_BEAM_SEARCH = "group_beam_search"
  68. class GenerationConfig(PushToHubMixin):
  69. # no-format
  70. """
  71. Class that holds a configuration for a generation task. A `generate` call supports the following generation methods
  72. for text-decoder, text-to-text, speech-to-text, and vision-to-text models:
  73. - *greedy decoding* if `num_beams=1` and `do_sample=False`
  74. - *multinomial sampling* if `num_beams=1` and `do_sample=True`
  75. - *beam-search decoding* if `num_beams>1` and `do_sample=False`
  76. - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
  77. - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`
  78. To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
  79. <Tip>
  80. A large number of these flags control the logits or the stopping criteria of the generation. Make sure you check
  81. the [generate-related classes](https://huggingface.co/docs/transformers/internal/generation_utils) for a full
  82. description of the possible manipulations, as well as examples of their usage.
  83. </Tip>
  84. Note: the configuration fields that are still `None` will be overridden by `GenerationConfig._get_default_generation_params()`
  85. during the generation loop. If you want to use different values for these fields, make sure to explicitly set them in the
  86. generation config.
  87. Args:
  88. > Parameters that control the length of the output
  89. max_length (`int`, *optional*):
  90. `max_new_tokens` is recommended for controlling how many tokens the model generates.
  91. `max_length` remains for backward compatibility.
  92. max_new_tokens (`int`, *optional*):
  93. The maximum number of tokens to generate, ignoring the number of tokens in the prompt.
  94. min_length (`int`, *optional*):
  95. The minimum length of the sequence to be generated. Corresponds to the length of the input prompt +
  96. `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.
  97. min_new_tokens (`int`, *optional*):
  98. The minimum number of tokens to generate, ignoring the number of tokens in the prompt.
  99. early_stopping (`bool` or `str`, *optional*):
  100. Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
  101. `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
  102. heuristic is applied and the generation stops when it is very unlikely to find better candidates;
  103. `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
  104. beam search algorithm).
  105. max_time (`float`, *optional*):
  106. The maximum amount of time you allow the computation to run for in seconds. generation will still finish
  107. the current pass after allocated time has been passed.
  108. stop_strings (`str or list[str]`, *optional*):
  109. A string or a list of strings that should terminate generation if the model outputs them.
  110. > Parameters that control the generation strategy used
  111. do_sample (`bool`):
  112. Whether or not to use sampling ; use greedy decoding otherwise.
  113. num_beams (`int`, *optional*):
  114. Number of beams for beam search. 1 means no beam search.
  115. > Parameters that control the cache
  116. use_cache (`bool`):
  117. Whether or not the model should use the past last key/values attentions (if applicable to the model) to
  118. speed up decoding.
  119. cache_implementation (`str`, *optional*):
  120. Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
  121. - `"dynamic"`: [`DynamicCache`]
  122. - `"static"`: [`StaticCache`]
  123. - `"offloaded"`: [`DynamicCache(offloaded=True)`]
  124. - `"offloaded_static"`: [`StaticCache(offloaded=True)`]
  125. - `"quantized"`: [`QuantizedCache`]
  126. If none is specified, we will use the default cache for the model (which is often [`DynamicCache`]). See
  127. our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
  128. cache_config (`dict`, *optional*, default to `None`):
  129. Arguments used in the key-value cache class can be passed in `cache_config`.
  130. > Parameters for manipulation of the model output logits
  131. temperature (`float`, *optional*):
  132. The value used to modulate the next token probabilities. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0.
  133. top_k (`int`, *optional*):
  134. The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50.
  135. top_p (`float`, *optional*):
  136. If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
  137. `top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
  138. min_p (`float`, *optional*):
  139. Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
  140. value between 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in
  141. the 0.99-0.8 range (use the opposite of normal `top_p` values).
  142. top_h (`float`, *optional*):
  143. Entropy budget scaling factor, which controls how much of the distribution’s entropy is preserved when sampling.
  144. Must be a value between 0 and 1. At each step, tokens are sorted by probability, and the smallest prefix of tokens
  145. is kept whose *renormalized* entropy is less than or equal to `top_h` times the entropy of the full distribution.
  146. Smaller values (e.g., 0.2–0.5) lead to more focused, deterministic outputs, while values closer to 1.0 allow more
  147. randomness and diversity. Typical values are in the 0.3–0.6 range.
  148. typical_p (`float`, *optional*):
  149. Local typicality measures how similar the conditional probability of predicting a target token next is to
  150. the expected conditional probability of predicting a random token next, given the partial text already
  151. generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
  152. add up to `typical_p` or higher are kept for generation. See [this
  153. paper](https://huggingface.co/papers/2202.00666) for more details.
  154. epsilon_cutoff (`float`, *optional*):
  155. If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
  156. `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the
  157. size of the model. See [Truncation Sampling as Language Model
  158. Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
  159. eta_cutoff (`float`, *optional*):
  160. Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between
  161. 0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) *
  162. exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token
  163. probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3,
  164. depending on the size of the model. See [Truncation Sampling as Language Model
  165. Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
  166. repetition_penalty (`float`, *optional*):
  167. The parameter for repetition penalty. 1.0 means no penalty. See [this
  168. paper](https://huggingface.co/papers/1909.05858) for more details.
  169. encoder_repetition_penalty (`float`, *optional*):
  170. The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
  171. original input. 1.0 means no penalty.
  172. length_penalty (`float`, *optional*):
  173. Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
  174. the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
  175. likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
  176. `length_penalty` < 0.0 encourages shorter sequences.
  177. no_repeat_ngram_size (`int`, *optional*):
  178. If set to int > 0, all ngrams of that size can only occur once.
  179. bad_words_ids (`list[list[int]]`, *optional*):
  180. List of list of token ids that are not allowed to be generated. Check
  181. [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
  182. renormalize_logits (`bool`):
  183. Whether to renormalize the logits after applying all the logits processors (including the custom
  184. ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
  185. are normalized but some logit processors break the normalization.
  186. forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
  187. The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
  188. multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
  189. language token.
  190. forced_eos_token_id (`int` or `list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
  191. The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
  192. list to set multiple *end-of-sequence* tokens.
  193. remove_invalid_values (`bool`):
  194. Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash.
  195. Note that using `remove_invalid_values` can slow down generation.
  196. exponential_decay_length_penalty (`tuple(int, float)`, *optional*):
  197. This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
  198. generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
  199. penalty starts and `decay_factor` represents the factor of exponential decay
  200. suppress_tokens (`list[int]`, *optional*):
  201. A list of tokens that will be suppressed at generation. The `SuppressTokens` logit processor will set their
  202. log probs to `-inf` so that they are not sampled.
  203. begin_suppress_tokens (`list[int]`, *optional*):
  204. A list of tokens that will be suppressed at the beginning of the generation. The `SuppressBeginTokens` logit
  205. processor will set their log probs to `-inf` so that they are not sampled.
  206. sequence_bias (`dict[tuple[int], float]`, *optional*):
  207. Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
  208. sequence being selected, while negative biases do the opposite. Check
  209. [`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
  210. token_healing (`bool`):
  211. Heal tail tokens of prompts by replacing them with their appropriate extensions.
  212. This enhances the quality of completions for prompts affected by greedy tokenization bias.
  213. guidance_scale (`float`, *optional*):
  214. The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
  215. Higher guidance scale encourages the model to generate samples that are more closely linked to the input
  216. prompt, usually at the expense of poorer quality.
  217. watermarking_config (`BaseWatermarkingConfig` or `dict`, *optional*):
  218. Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green"
  219. tokens. See the docs of [`SynthIDTextWatermarkingConfig`] and [`WatermarkingConfig`] for more
  220. details. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally.
  221. > Parameters that define the output variables of generate
  222. num_return_sequences (`int`, *optional*):
  223. The number of independently computed returned sequences for each element in the batch.
  224. output_attentions (`bool`):
  225. Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
  226. tensors for more details.
  227. output_hidden_states (`bool`):
  228. Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
  229. more details.
  230. output_scores (`bool`):
  231. Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
  232. output_logits (`bool`):
  233. Whether or not to return the unprocessed prediction logit scores. See `logits` under returned tensors for
  234. more details.
  235. return_dict_in_generate (`bool`):
  236. Whether or not to return a [`~utils.ModelOutput`], as opposed to returning exclusively the generated
  237. sequence. This flag must be set to `True` to return the generation cache (when `use_cache` is `True`)
  238. or optional outputs (see flags starting with `output_`)
  239. > Special tokens that can be used at generation time
  240. pad_token_id (`int`, *optional*):
  241. The id of the *padding* token.
  242. bos_token_id (`int`, *optional*):
  243. The id of the *beginning-of-sequence* token.
  244. eos_token_id (`Union[int, list[int]]`, *optional*):
  245. The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
  246. > Generation parameters exclusive to encoder-decoder models
  247. encoder_no_repeat_ngram_size (`int`, *optional*):
  248. If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
  249. `decoder_input_ids`.
  250. decoder_start_token_id (`int` or `list[int]`, *optional*):
  251. If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length
  252. `batch_size`. Indicating a list enables different start ids for each element in the batch
  253. (e.g. multilingual models with different target languages in one batch)
  254. > Generation parameters exclusive to assistant generation
  255. is_assistant (`bool`):
  256. Whether the model is an assistant (draft) model.
  257. num_assistant_tokens (`int`, *optional*):
  258. Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
  259. checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
  260. more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
  261. model requires lots of corrections, lower speed-ups are reached.
  262. num_assistant_tokens_schedule (`str`, *optional*):
  263. Defines the schedule at which max assistant tokens shall be changed during inference.
  264. - `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else
  265. reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model.
  266. - `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call.
  267. - `"constant"`: `num_assistant_tokens` stays unchanged during generation
  268. assistant_confidence_threshold (`float`, *optional*):
  269. The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower
  270. than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
  271. (defined by `num_assistant_tokens`) is not yet reached. The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes, biased towards avoiding false negatives.
  272. `assistant_confidence_threshold` value is persistent over multiple generation calls with the same assistant model.
  273. It is an unsupervised version of the dynamic speculation lookahead
  274. from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models <https://huggingface.co/papers/2405.04304>.
  275. prompt_lookup_num_tokens (`int`, *optional*):
  276. The number of tokens to be output as candidate tokens.
  277. max_matching_ngram_size (`int`, *optional*):
  278. The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided.
  279. assistant_early_exit(`int`, *optional*):
  280. If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
  281. models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
  282. assistant_lookbehind(`int`, *optional*):
  283. If set to a positive integer, the re-encoding process will additionally consider the last `assistant_lookbehind` assistant tokens
  284. to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
  285. See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
  286. target_lookbehind(`int`, *optional*):
  287. If set to a positive integer, the re-encoding process will additionally consider the last `target_lookbehind` target tokens
  288. to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
  289. See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
  290. > Parameters related to performances and compilation
  291. compile_config (CompileConfig, *optional*):
  292. If using a compilable cache, this controls how `generate` will `compile` the forward pass for faster
  293. inference.
  294. disable_compile (`bool`):
  295. Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
  296. specific criteria are met, including using a compilable cache. Please open an issue if you find the
  297. need to use this flag.
  298. """
  299. extra_output_flags = ("output_attentions", "output_hidden_states", "output_scores", "output_logits")
  300. # Tensor versions of token IDs, set by _prepare_special_tokens() at generation time
  301. _bos_token_tensor: "torch.Tensor | None"
  302. _eos_token_tensor: "torch.Tensor | None"
  303. _pad_token_tensor: "torch.Tensor | None"
  304. _decoder_start_token_tensor: "torch.Tensor | None"
  305. # Hash to detect whether the instance was modified after loading
  306. _original_object_hash: int | None
  307. def __init__(self, **kwargs):
  308. # Parameters that control the length of the output
  309. self.max_length = kwargs.pop("max_length", None)
  310. self.max_new_tokens = kwargs.pop("max_new_tokens", None)
  311. self.min_length = kwargs.pop("min_length", None)
  312. self.min_new_tokens = kwargs.pop("min_new_tokens", None)
  313. self.early_stopping = kwargs.pop("early_stopping", None)
  314. self.max_time = kwargs.pop("max_time", None)
  315. self.stop_strings = kwargs.pop("stop_strings", None)
  316. # Parameters that control the generation strategy used
  317. self.do_sample = kwargs.pop("do_sample", None)
  318. self.num_beams = kwargs.pop("num_beams", None)
  319. # Parameters that control the cache
  320. self.use_cache = kwargs.pop("use_cache", None)
  321. self.cache_implementation = kwargs.pop("cache_implementation", None)
  322. self.cache_config = kwargs.pop("cache_config", None)
  323. # Parameters for manipulation of the model output logits
  324. self.temperature = kwargs.pop("temperature", None)
  325. self.top_k = kwargs.pop("top_k", None)
  326. self.top_p = kwargs.pop("top_p", None)
  327. self.min_p = kwargs.pop("min_p", None)
  328. self.top_h = kwargs.pop("top_h", None)
  329. self.typical_p = kwargs.pop("typical_p", None)
  330. self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", None)
  331. self.eta_cutoff = kwargs.pop("eta_cutoff", None)
  332. self.repetition_penalty = kwargs.pop("repetition_penalty", None)
  333. self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", None)
  334. self.length_penalty = kwargs.pop("length_penalty", None)
  335. self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", None)
  336. self.bad_words_ids = kwargs.pop("bad_words_ids", None)
  337. self.renormalize_logits = kwargs.pop("renormalize_logits", None)
  338. self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
  339. self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
  340. self.remove_invalid_values = kwargs.pop("remove_invalid_values", None)
  341. self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
  342. self.suppress_tokens = kwargs.pop("suppress_tokens", None)
  343. self.begin_suppress_tokens = kwargs.pop("begin_suppress_tokens", None)
  344. self.sequence_bias = kwargs.pop("sequence_bias", None)
  345. self.token_healing = kwargs.pop("token_healing", None)
  346. self.guidance_scale = kwargs.pop("guidance_scale", None)
  347. self.watermarking_config = kwargs.pop("watermarking_config", None)
  348. if isinstance(self.watermarking_config, dict):
  349. self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config)
  350. # Parameters that define the output variables of `generate`
  351. self.num_return_sequences = kwargs.pop("num_return_sequences", None)
  352. self.output_attentions = kwargs.pop("output_attentions", None)
  353. self.output_hidden_states = kwargs.pop("output_hidden_states", None)
  354. self.output_scores = kwargs.pop("output_scores", None)
  355. self.output_logits = kwargs.pop("output_logits", None)
  356. self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", None)
  357. # Special tokens that can be used at generation time
  358. self.pad_token_id = kwargs.pop("pad_token_id", None)
  359. self.bos_token_id = kwargs.pop("bos_token_id", None)
  360. self.eos_token_id = kwargs.pop("eos_token_id", None)
  361. # Generation parameters exclusive to encoder-decoder models
  362. self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", None)
  363. self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
  364. # Assistant generation
  365. self.is_assistant = kwargs.pop("is_assistant", None)
  366. self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", None)
  367. self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", None)
  368. self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", None)
  369. self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
  370. self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
  371. self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
  372. self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", None)
  373. self.target_lookbehind = kwargs.pop("target_lookbehind", None)
  374. # Performance
  375. self.compile_config = kwargs.pop("compile_config", None)
  376. self.disable_compile = kwargs.pop("disable_compile", None)
  377. self.continuous_batching_config = kwargs.pop("continuous_batching_config", None)
  378. # Deprecated (moved to the Hub). TODO remove for v5
  379. self.low_memory = kwargs.pop("low_memory", None)
  380. self.penalty_alpha = kwargs.pop("penalty_alpha", None)
  381. self.dola_layers = kwargs.pop("dola_layers", None)
  382. self.diversity_penalty = kwargs.pop("diversity_penalty", None)
  383. self.num_beam_groups = kwargs.pop("num_beam_groups", None)
  384. self.constraints = kwargs.pop("constraints", None)
  385. self.force_words_ids = kwargs.pop("force_words_ids", None)
  386. self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)
  387. # Common attributes
  388. self._commit_hash = kwargs.pop("_commit_hash", None)
  389. self._from_model_config = kwargs.pop("_from_model_config", None)
  390. self.transformers_version = kwargs.pop("transformers_version", None)
  391. # Additional attributes without default values
  392. if not self._from_model_config:
  393. # we don't want to copy values from the model config if we're initializing
  394. # a `GenerationConfig` from a model's default configuration file
  395. for key, value in kwargs.items():
  396. try:
  397. setattr(self, key, value)
  398. except AttributeError as err:
  399. logger.error(f"Can't set {key} with value {value} for {self}")
  400. raise err
  401. else:
  402. # Ensure backward compatibility for models that use `forced_bos_token_id` within their config
  403. if kwargs.get("force_bos_token_to_be_generated", False):
  404. self.forced_bos_token_id = self.bos_token_id
  405. logger.warning_once(
  406. f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
  407. )
  408. # Validate the values of the attributes
  409. self.validate()
  410. def __hash__(self):
  411. return hash(self.to_json_string(ignore_metadata=True))
  412. def __eq__(self, other):
  413. if not isinstance(other, GenerationConfig):
  414. return False
  415. self_without_metadata = self.to_json_string(use_diff=False, ignore_metadata=True)
  416. other_without_metadata = other.to_json_string(use_diff=False, ignore_metadata=True)
  417. return self_without_metadata == other_without_metadata
  418. def __repr__(self):
  419. return f"{self.__class__.__name__} {self.to_json_string(ignore_metadata=True)}"
  def get_generation_mode(self, assistant_model: Optional["PreTrainedModel"] = None) -> GenerationMode:
      """
      Returns the generation mode triggered by the [`GenerationConfig`] instance.

      Arg:
          assistant_model (`PreTrainedModel`, *optional*):
              The assistant model to be used for assisted generation. If set, the generation mode will be
              assisted generation.

      Returns:
          `GenerationMode`: The generation mode triggered by the instance.
      """
      # TODO joao: find out a way of not depending on external fields (e.g. `assistant_model`), then make this a
      # property and part of the `__repr__`

      # Step 1: derive the base decoding mode from the flags stored on the instance.
      if self.constraints is not None or self.force_words_ids is not None:
          generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
      elif self.num_beams is None or self.num_beams == 1:
          # Single beam: sample, contrastive search or plain greedy search.
          if self.do_sample is True:
              generation_mode = GenerationMode.SAMPLE
          else:
              contrastive_flags_set = (
                  self.top_k is not None
                  and self.top_k > 1
                  and self.penalty_alpha is not None
                  and self.penalty_alpha > 0
              )
              if contrastive_flags_set:
                  generation_mode = GenerationMode.CONTRASTIVE_SEARCH
              else:
                  generation_mode = GenerationMode.GREEDY_SEARCH
      elif self.num_beam_groups is not None and self.num_beam_groups > 1:
          generation_mode = GenerationMode.GROUP_BEAM_SEARCH
      elif self.do_sample is True:
          generation_mode = GenerationMode.BEAM_SAMPLE
      else:
          generation_mode = GenerationMode.BEAM_SEARCH

      # Only these base modes can be extended by assisted or DoLa generation.
      extendable_modes = ("greedy_search", "sample")

      # Step 2: assisted generation may extend some generation modes
      assisted_requested = (
          assistant_model is not None
          or self.prompt_lookup_num_tokens is not None
          or self.assistant_early_exit is not None
      )
      if assisted_requested and generation_mode in extendable_modes:
          generation_mode = GenerationMode.ASSISTED_GENERATION
      elif assisted_requested:
          logger.warning(
              "You've set `assistant_model`, which triggers assisted generate. Currently, assisted generate "
              "is only supported with Greedy Search and Sample. However, the base decoding mode (based on "
              f"current flags) is {generation_mode} -- some of the set flags will be ignored."
          )

      # Step 3: DoLa generation may extend some generation modes
      # TODO joao, manuel: remove this in v4.62.0
      dola_requested = self.dola_layers is not None
      if dola_requested and generation_mode in extendable_modes:
          generation_mode = GenerationMode.DOLA_GENERATION
      elif dola_requested:
          logger.warning(
              "You've set `dola_layers`, which triggers DoLa generate. Currently, DoLa generate "
              "is only supported with Greedy Search and Sample. However, the base decoding mode (based on "
              f"current flags) is {generation_mode} -- some of the set flags will be ignored."
          )
      return generation_mode
  480. @staticmethod
  481. def _get_default_generation_params() -> dict[str, Any]:
  482. return {
  483. "max_length": 20,
  484. "min_length": 0,
  485. "do_sample": False,
  486. "use_cache": True,
  487. "early_stopping": False,
  488. "num_beams": 1,
  489. "temperature": 1.0,
  490. "top_k": 50,
  491. "top_p": 1.0,
  492. "typical_p": 1.0,
  493. "repetition_penalty": 1.0,
  494. "length_penalty": 1.0,
  495. "no_repeat_ngram_size": 0,
  496. "encoder_no_repeat_ngram_size": 0,
  497. "bad_words_ids": None,
  498. "num_return_sequences": 1,
  499. "output_scores": False,
  500. "return_dict_in_generate": False,
  501. "forced_bos_token_id": None,
  502. "forced_eos_token_id": None,
  503. "remove_invalid_values": False,
  504. "exponential_decay_length_penalty": None,
  505. "suppress_tokens": None,
  506. "begin_suppress_tokens": None,
  507. "epsilon_cutoff": 0.0,
  508. "eta_cutoff": 0.0,
  509. "encoder_repetition_penalty": 1.0,
  510. "num_assistant_tokens": 20,
  511. "num_assistant_tokens_schedule": "constant",
  512. "assistant_confidence_threshold": 0.4,
  513. "assistant_lookbehind": 10,
  514. "target_lookbehind": 10,
  515. # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0
  516. "num_beam_groups": 1,
  517. "diversity_penalty": 0.0,
  518. }
  def validate(self, strict=False):
      """
      Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence
      of parameterization that can be detected as incorrect from the configuration instance alone.

      Note that some parameters not validated here are best validated at generate runtime, as they may depend on
      other inputs and/or the model, such as parameters related to the generation length.

      Args:
          strict (bool): If True, raise an exception for any issues found. If False, only log issues.

      Raises:
          ValueError: For parameterization that is always rejected (invalid `early_stopping`, non-positive
              `max_new_tokens`, unknown `cache_implementation`, wrong `compile_config` type, unsupported
              `num_return_sequences`, or `generate`-only arguments stored on the config), and, when `strict` is
              True, for any collected minor issue.
      """
      minor_issues = {}  # format: {attribute_name: issue_description}

      # 1. Validation of individual attributes
      # 1.1. Decoding attributes
      if self.early_stopping not in {None, True, False, "never"}:
          raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.")
      if self.max_new_tokens is not None and self.max_new_tokens <= 0:
          raise ValueError(f"`max_new_tokens` must be greater than 0, but is {self.max_new_tokens}.")
      if self.pad_token_id is not None and self.pad_token_id < 0:
          # Only negative ids are flagged here, so the message says "non-negative" (0 is accepted).
          minor_issues["pad_token_id"] = (
              f"`pad_token_id` should be non-negative but got {self.pad_token_id}. This will cause errors when batch "
              "generating, if there is padding. Please set `pad_token_id` explicitly as "
              "`model.generation_config.pad_token_id=PAD_TOKEN_ID` to avoid errors in generation"
          )

      # 1.2. Cache attributes
      # "paged" re-routes to continuous batching and so it is a valid cache implementation. But we do not want to
      # test it with `generate` as the others would be, so we cannot add it to ALL_CACHE_IMPLEMENTATIONS
      valid_cache_implementations = ALL_CACHE_IMPLEMENTATIONS + ("paged",)
      if self.cache_implementation is not None and self.cache_implementation not in valid_cache_implementations:
          raise ValueError(
              f"Invalid `cache_implementation` ({self.cache_implementation}). Choose one of: "
              f"{valid_cache_implementations}"
          )

      # 1.3. Performance attributes
      if self.compile_config is not None and not isinstance(self.compile_config, CompileConfig):
          raise ValueError(
              f"You provided `compile_config` as an instance of {type(self.compile_config)}, but it must be an "
              "instance of `CompileConfig`."
          )

      # 1.4. Watermarking attributes
      if self.watermarking_config is not None:
          self.watermarking_config.validate()

      # 2. Validation of attribute combinations
      # 2.1. detect sampling-only parameterization when not in sampling mode
      # Note that we check `is not True` on purpose. Boolean fields can also be `None` so we
      # have to be explicit. Value of `None` is same as having `False`, i.e. the default value
      if self.do_sample is not True:
          greedy_wrong_parameter_msg = (
              "`do_sample` is not set to `True`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
              "used in sample-based generation modes. You should set `do_sample=True` or unset `{flag_name}`."
          )
          if self.temperature is not None and self.temperature != 1.0:
              minor_issues["temperature"] = greedy_wrong_parameter_msg.format(
                  flag_name="temperature", flag_value=self.temperature
              )
          if self.top_p is not None and self.top_p != 1.0:
              minor_issues["top_p"] = greedy_wrong_parameter_msg.format(flag_name="top_p", flag_value=self.top_p)
          if self.min_p is not None:
              minor_issues["min_p"] = greedy_wrong_parameter_msg.format(flag_name="min_p", flag_value=self.min_p)
          if self.top_h is not None:
              minor_issues["top_h"] = greedy_wrong_parameter_msg.format(flag_name="top_h", flag_value=self.top_h)
          if self.typical_p is not None and self.typical_p != 1.0:
              minor_issues["typical_p"] = greedy_wrong_parameter_msg.format(
                  flag_name="typical_p", flag_value=self.typical_p
              )
          if self.top_k is not None and self.top_k != 50:
              minor_issues["top_k"] = greedy_wrong_parameter_msg.format(flag_name="top_k", flag_value=self.top_k)
          if self.epsilon_cutoff is not None and self.epsilon_cutoff != 0.0:
              minor_issues["epsilon_cutoff"] = greedy_wrong_parameter_msg.format(
                  flag_name="epsilon_cutoff", flag_value=self.epsilon_cutoff
              )
          if self.eta_cutoff is not None and self.eta_cutoff != 0.0:
              minor_issues["eta_cutoff"] = greedy_wrong_parameter_msg.format(
                  flag_name="eta_cutoff", flag_value=self.eta_cutoff
              )

      # 2.2. detect beam-only parameterization when not in beam mode
      if self.num_beams is None or self.num_beams == 1:
          single_beam_wrong_parameter_msg = (
              "`num_beams` is set to {num_beams}. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
              "in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`."
          )
          if self.early_stopping is not None and self.early_stopping is not False:
              minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format(
                  num_beams=self.num_beams, flag_name="early_stopping", flag_value=self.early_stopping
              )
          if self.length_penalty is not None and self.length_penalty != 1.0:
              minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format(
                  num_beams=self.num_beams, flag_name="length_penalty", flag_value=self.length_penalty
              )

      # 2.3. check `num_return_sequences`
      if self.num_return_sequences is not None and self.num_return_sequences > 1:
          if self.num_beams is None or self.num_beams == 1:
              if not self.do_sample:
                  raise ValueError(
                      "Greedy methods (do_sample != True) without beam search do not support "
                      f"`num_return_sequences` different than 1 (got {self.num_return_sequences})."
                  )
          elif (
              self.num_beams is not None
              and self.num_return_sequences is not None
              and self.num_return_sequences > self.num_beams
          ):
              raise ValueError(
                  f"`num_return_sequences` ({self.num_return_sequences}) has to be smaller or equal to `num_beams` "
                  f"({self.num_beams})."
              )

      # 2.4. check cache-related arguments
      if self.use_cache is False:
          # In this case, all cache-related arguments should be unset. However, since `use_cache=False` is often
          # passed to `generate` directly to hot-fix cache issues, let's raise a warning instead of an error
          # (otherwise a user might need to overwrite several parameters).
          no_cache_warning = (
              "You have not set `use_cache` to `True`, but {cache_arg} is set to {cache_arg_value}. "
              "{cache_arg} will have no effect."
          )
          for arg_name in ("cache_implementation", "cache_config"):
              if getattr(self, arg_name) is not None:
                  minor_issues[arg_name] = no_cache_warning.format(
                      cache_arg=arg_name, cache_arg_value=getattr(self, arg_name)
                  )

      # 2.5. other incorrect combinations
      if self.return_dict_in_generate is not True:
          for extra_output_flag in self.extra_output_flags:
              if getattr(self, extra_output_flag) is True:
                  minor_issues[extra_output_flag] = (
                      f"`return_dict_in_generate` is NOT set to `True`, but `{extra_output_flag}` is. When "
                      f"`return_dict_in_generate` is not `True`, `{extra_output_flag}` is ignored."
                  )

      # 3. Check common issue: passing `generate` arguments inside the generation config
      generate_arguments = (
          "logits_processor",
          "stopping_criteria",
          "prefix_allowed_tokens_fn",
          "synced_gpus",
          "assistant_model",
          "streamer",
          "negative_prompt_ids",
          "negative_prompt_attention_mask",
      )
      for arg in generate_arguments:
          if hasattr(self, arg):
              raise ValueError(
                  f"Argument `{arg}` is not a valid argument of `GenerationConfig`. It should be passed to "
                  "`generate()` (or a pipeline) directly."
              )

      # Finally, handle caught minor issues. With default parameterization, we will throw a minimal warning.
      if len(minor_issues) > 0:
          # Full list of issues with potential fixes
          info_message = "\n".join(
              f"- `{attribute_name}`: {issue_description}"
              for attribute_name, issue_description in minor_issues.items()
          )
          info_message += (
              "\nIf you're using a pretrained model, note that some of these attributes may be set through the "
              "model's `generation_config.json` file."
          )

          if strict:
              raise ValueError("GenerationConfig is invalid: \n" + info_message)
          else:
              attributes_with_issues = list(minor_issues.keys())
              warning_message = (
                  f"The following generation flags are not valid and may be ignored: {attributes_with_issues}."
              )
              # Point to the detailed listing only when the verbosity level would hide the info log below.
              if logging.get_verbosity() >= logging.WARNING:
                  warning_message += " Set `TRANSFORMERS_VERBOSITY=info` for more details."
              logger.warning_once(warning_message)
              logger.info_once(info_message)
  684. def save_pretrained(
  685. self,
  686. save_directory: str | os.PathLike,
  687. config_file_name: str | os.PathLike | None = None,
  688. push_to_hub: bool = False,
  689. **kwargs,
  690. ):
  691. r"""
  692. Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the
  693. [`~GenerationConfig.from_pretrained`] class method.
  694. Args:
  695. save_directory (`str` or `os.PathLike`):
  696. Directory where the configuration JSON file will be saved (will be created if it does not exist).
  697. config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`):
  698. Name of the generation configuration JSON file to be saved in `save_directory`.
  699. push_to_hub (`bool`, *optional*, defaults to `False`):
  700. Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
  701. repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
  702. namespace).
  703. kwargs (`dict[str, Any]`, *optional*):
  704. Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
  705. """
  706. # At save time, validate the instance enforcing strictness -- if any warning/exception would be thrown, we
  707. # refuse to save the instance.
  708. # This strictness is enforced to prevent bad configurations from being saved and re-used.
  709. try:
  710. self.validate(strict=True)
  711. except ValueError as exc:
  712. raise ValueError(str(exc) + "\n\nFix these issues to save the configuration.")
  713. config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME
  714. if os.path.isfile(save_directory):
  715. raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
  716. os.makedirs(save_directory, exist_ok=True)
  717. if push_to_hub:
  718. commit_message = kwargs.pop("commit_message", None)
  719. repo_id = kwargs.pop("repo_id", str(save_directory).split(os.path.sep)[-1])
  720. repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
  721. files_timestamps = self._get_files_timestamps(save_directory)
  722. output_config_file = os.path.join(save_directory, config_file_name)
  723. self.to_json_file(output_config_file, use_diff=True, keys_to_pop=["compile_config"])
  724. logger.info(f"Configuration saved in {output_config_file}")
  725. if push_to_hub:
  726. self._upload_modified_files(
  727. save_directory,
  728. repo_id,
  729. files_timestamps,
  730. commit_message=commit_message,
  731. token=kwargs.get("token"),
  732. )
  @classmethod
  def from_pretrained(
      cls,
      pretrained_model_name: str | os.PathLike,
      config_file_name: str | os.PathLike | None = None,
      cache_dir: str | os.PathLike | None = None,
      force_download: bool = False,
      local_files_only: bool = False,
      token: str | bool | None = None,
      revision: str = "main",
      **kwargs,
  ) -> "GenerationConfig":
      r"""
      Instantiate a [`GenerationConfig`] from a generation configuration file.

      Args:
          pretrained_model_name (`str` or `os.PathLike`):
              This can be either:

              - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
                huggingface.co.
              - a path to a *directory* containing a configuration file saved using the
                [`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
          config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`):
              Name of the generation configuration JSON file to be loaded from `pretrained_model_name`.
          cache_dir (`str` or `os.PathLike`, *optional*):
              Path to a directory in which a downloaded pretrained model configuration should be cached if the
              standard cache should not be used.
          force_download (`bool`, *optional*, defaults to `False`):
              Whether or not to force to (re-)download the configuration files and override the cached versions if
              they exist.
          local_files_only (`bool`, *optional*, defaults to `False`):
              Forwarded as-is to `cached_file` when the configuration file has to be resolved.
          proxies (`dict[str, str]`, *optional*):
              A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
              'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. Read from `kwargs`.
          token (`str` or `bool`, *optional*):
              The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
              the token generated when running `hf auth login` (stored in `~/.huggingface`).
          revision (`str`, *optional*, defaults to `"main"`):
              The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
              git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
              identifier allowed by git.

              <Tip>

              To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

              </Tip>

          return_unused_kwargs (`bool`, *optional*, defaults to `False`):
              If `False`, then this function returns just the final configuration object.

              If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
              dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
              part of `kwargs` which has not been used to update `config` and is otherwise ignored. Read from
              `kwargs`.
          subfolder (`str`, *optional*, defaults to `""`):
              In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
              specify the folder name here. Read from `kwargs`.
          kwargs (`dict[str, Any]`, *optional*):
              The values in kwargs of any keys which are configuration attributes will be used to override the loaded
              values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
              by the `return_unused_kwargs` keyword parameter.

      Returns:
          [`GenerationConfig`]: The configuration object instantiated from this pretrained model.

      Raises:
          OSError: If the configuration file cannot be resolved or is not a valid JSON file. The underlying
              exception is attached as the explicit cause when this method builds the `OSError` itself.

      Examples:

      ```python
      >>> from transformers import GenerationConfig

      >>> # Download configuration from huggingface.co and cache.
      >>> generation_config = GenerationConfig.from_pretrained("openai-community/gpt2")

      >>> # E.g. config was saved using *save_pretrained('./test/saved_model/')*
      >>> generation_config.save_pretrained("./test/saved_model/")
      >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/")

      >>> # You can also specify configuration names to your generation configuration file
      >>> generation_config.save_pretrained("./test/saved_model/", config_file_name="my_configuration.json")
      >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/", "my_configuration.json")

      >>> # If you'd like to try a minor variation to an existing configuration, you can also pass generation
      >>> # arguments to `.from_pretrained()`. Be mindful that typos and unused arguments will be ignored
      >>> generation_config, unused_kwargs = GenerationConfig.from_pretrained(
      ...     "openai-community/gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True
      ... )
      >>> generation_config.top_k
      1

      >>> unused_kwargs
      {'foo': False}
      ```"""
      config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME

      # Loading options that travel through `kwargs` rather than through named parameters.
      proxies = kwargs.pop("proxies", None)
      subfolder = kwargs.pop("subfolder", "")
      from_pipeline = kwargs.pop("_from_pipeline", None)
      from_auto_class = kwargs.pop("_from_auto", False)
      commit_hash = kwargs.pop("_commit_hash", None)

      user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
      if from_pipeline is not None:
          user_agent["using_pipeline"] = from_pipeline

      config_path = str(os.path.join(pretrained_model_name, config_file_name))

      is_local = os.path.exists(config_path)
      if os.path.isfile(os.path.join(subfolder, config_path)):
          # Special case when config_path is a local file
          resolved_config_file = config_path
          is_local = True
      else:
          try:
              # Load from local folder or from cache or download from model Hub and cache
              resolved_config_file = cached_file(
                  pretrained_model_name,
                  config_file_name,
                  cache_dir=cache_dir,
                  force_download=force_download,
                  proxies=proxies,
                  local_files_only=local_files_only,
                  token=token,
                  user_agent=user_agent,
                  revision=revision,
                  subfolder=subfolder,
                  _commit_hash=commit_hash,
              )
              commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
          except OSError:
              # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
              # the original exception.
              raise
          except Exception as err:
              # For any other exception, we throw a generic error, keeping the original one as the cause.
              raise OSError(
                  f"Can't load the configuration of '{pretrained_model_name}'. If you were trying to load it"
                  " from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
                  f" name. Otherwise, make sure '{pretrained_model_name}' is the correct path to a directory"
                  f" containing a {config_file_name} file"
              ) from err

      try:
          # Load config dict
          config_dict = cls._dict_from_json_file(resolved_config_file)
          config_dict["_commit_hash"] = commit_hash
      except (json.JSONDecodeError, UnicodeDecodeError) as err:
          raise OSError(
              f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file."
          ) from err

      if is_local:
          logger.info(f"loading configuration file {resolved_config_file}")
      else:
          logger.info(f"loading configuration file {config_file_name} from cache at {resolved_config_file}")

      if kwargs.get("_from_model_config", False):
          return cls.from_model_config(config_dict)
      elif kwargs.get("return_unused_kwargs") is True:
          config, unused_kwargs = cls.from_dict(config_dict, **kwargs)
          config._original_object_hash = hash(config)  # Hash to detect whether the instance was modified
          return config, unused_kwargs
      else:
          config = cls.from_dict(config_dict, **kwargs)
          config._original_object_hash = hash(config)  # Hash to detect whether the instance was modified
          return config
  876. @classmethod
  877. def _dict_from_json_file(cls, json_file: str | os.PathLike):
  878. with open(json_file, "r", encoding="utf-8") as reader:
  879. text = reader.read()
  880. return json.loads(text)
  @classmethod
  def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "GenerationConfig":
      """
      Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.

      Args:
          config_dict (`dict[str, Any]`):
              Dictionary that will be used to instantiate the configuration object.
          kwargs (`dict[str, Any]`):
              Additional parameters from which to initialize the configuration object. They take precedence over
              `config_dict` entries with the same key, except for `_commit_hash` (see below).

      Returns:
          [`GenerationConfig`]: The configuration object instantiated from those parameters. When
          `return_unused_kwargs=True` is passed in `kwargs`, a `(config, unused_kwargs)` tuple is returned instead.
      """
      want_unused = kwargs.pop("return_unused_kwargs", False)

      # Those arguments may be passed along for our internal telemetry.
      # We remove them so they don't appear in `return_unused_kwargs`.
      for telemetry_key in ("_from_auto", "_from_pipeline"):
          kwargs.pop(telemetry_key, None)

      # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
      if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
          kwargs["_commit_hash"] = config_dict["_commit_hash"]

      # The line below allows model-specific config to be loaded as well through kwargs, with safety checks.
      # See https://github.com/huggingface/transformers/pull/21269
      init_arguments = {**config_dict, **kwargs}
      config = cls(**init_arguments)
      unused_kwargs = config.update(**kwargs)

      logger.info(f"Generate config {config}")
      return (config, unused_kwargs) if want_unused else config
  def dict_dtype_to_str(self, d: dict[str, Any]) -> None:
      """
      Checks whether the passed dictionary and its nested dicts have a *dtype* key and if it's not None,
      converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
      string, which can then be stored in the json format.

      The dictionary is modified in place. A dtype whose string form contains no `"."` is stored as that string
      unchanged instead of raising `IndexError`.

      Args:
          d (`dict[str, Any]`):
              Dictionary to convert in place; nested dictionaries found among its values are converted too.
      """
      dtype = d.get("dtype")
      if dtype is not None and not isinstance(dtype, str):
          pieces = str(dtype).split(".")
          # Keep the historical choice (second dotted component) and fall back to the whole string when the
          # representation carries no module prefix.
          d["dtype"] = pieces[1] if len(pieces) > 1 else pieces[0]
      for value in d.values():
          if isinstance(value, dict):
              self.dict_dtype_to_str(value)
  def to_diff_dict(self) -> dict[str, Any]:
      """
      Removes all attributes from config which correspond to the default config attributes for better readability and
      serializes to a Python dictionary.

      Returns:
          `dict[str, Any]`: Dictionary of the attributes of this configuration instance that are absent from, or
          differ from, a default `GenerationConfig()`; `transformers_version` is always kept.
      """
      current = self.to_dict()

      # Reference values come from a freshly constructed default config.
      defaults = GenerationConfig().to_dict()

      # only serialize values that differ from the default config
      serializable_config_dict = {
          name: setting
          for name, setting in current.items()
          if name not in defaults or name == "transformers_version" or setting != defaults[name]
      }

      self.dict_dtype_to_str(serializable_config_dict)
      return serializable_config_dict
  def to_dict(self) -> dict[str, Any]:
      """
      Serializes this instance to a Python dictionary.

      Returns:
          `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
      """
      # Work on a deep copy so the instance's own `__dict__` is left untouched.
      output = copy.deepcopy(self.__dict__)

      # Fields to ignore at serialization time
      for ignored_field in ("_commit_hash", "_original_object_hash"):
          output.pop(ignored_field, None)

      # Transformers version when serializing this file
      output["transformers_version"] = __version__

      self.dict_dtype_to_str(output)
      return output
  954. def to_json_string(
  955. self, use_diff: bool = True, ignore_metadata: bool = False, keys_to_pop: list[str] | None = None
  956. ) -> str:
  957. """
  958. Serializes this instance to a JSON string.
  959. Args:
  960. use_diff (`bool`, *optional*, defaults to `True`):
  961. If set to `True`, only the difference between the config instance and the default `GenerationConfig()`
  962. is serialized to JSON string.
  963. ignore_metadata (`bool`, *optional*, defaults to `False`):
  964. Whether to ignore the metadata fields present in the instance
  965. keys_to_pop (`list[str]`, *optional*):
  966. Keys to pop from the config dictionary before serializing
  967. Returns:
  968. `str`: String containing all the attributes that make up this configuration instance in JSON format.
  969. """
  970. if use_diff is True:
  971. config_dict = self.to_diff_dict()
  972. else:
  973. config_dict = self.to_dict()
  974. if keys_to_pop is not None:
  975. for key in keys_to_pop:
  976. config_dict.pop(key, None)
  977. if ignore_metadata:
  978. for metadata_field in METADATA_FIELDS:
  979. config_dict.pop(metadata_field, None)
  980. def convert_keys_to_string(obj):
  981. if isinstance(obj, dict):
  982. return {str(key): convert_keys_to_string(value) for key, value in obj.items()}
  983. elif isinstance(obj, list):
  984. return [convert_keys_to_string(item) for item in obj]
  985. else:
  986. return obj
  987. def convert_dataclass_to_dict(obj):
  988. if isinstance(obj, dict):
  989. return {key: convert_dataclass_to_dict(value) for key, value in obj.items()}
  990. elif is_dataclass(obj):
  991. return obj.to_dict()
  992. else:
  993. return obj
  994. config_dict = convert_keys_to_string(config_dict)
  995. config_dict = convert_dataclass_to_dict(config_dict)
  996. return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
  997. def to_json_file(
  998. self, json_file_path: str | os.PathLike, use_diff: bool = True, keys_to_pop: list[str] | None = None
  999. ) -> None:
  1000. """
  1001. Save this instance to a JSON file.
  1002. Args:
  1003. json_file_path (`str` or `os.PathLike`):
  1004. Path to the JSON file in which this configuration instance's parameters will be saved.
  1005. use_diff (`bool`, *optional*, defaults to `True`):
  1006. If set to `True`, only the difference between the config instance and the default `GenerationConfig()`
  1007. is serialized to JSON file.
  1008. keys_to_pop (`list[str]`, *optional*):
  1009. Keys to pop from the config dictionary before serializing
  1010. """
  1011. with open(json_file_path, "w", encoding="utf-8") as writer:
  1012. writer.write(self.to_json_string(use_diff=use_diff, keys_to_pop=keys_to_pop))
  1013. @classmethod
  1014. def from_model_config(cls, model_config: Union["PreTrainedConfig", dict]) -> "GenerationConfig":
  1015. """
  1016. Instantiates a [`GenerationConfig`] from a [`PreTrainedConfig`]. This function is useful to convert legacy
  1017. [`PreTrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
  1018. Args:
  1019. model_config (`PreTrainedConfig | dict`):
  1020. The model config that will be used to instantiate the generation config.
  1021. Returns:
  1022. [`GenerationConfig`]: The configuration object instantiated from those parameters.
  1023. """
  1024. config_dict = model_config.to_dict() if not isinstance(model_config, dict) else model_config
  1025. config_dict.pop("_from_model_config", None)
  1026. # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
  1027. config_dict = {key: value for key, value in config_dict.items() if value is not None}
  1028. generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
  1029. # Special case: some models have generation attributes set in the decoder. Use them if still unset in the
  1030. # generation config (which in turn is defined from the outer attributes of model config).
  1031. if isinstance(model_config, dict):
  1032. decoder_possible_text_config_names = ("decoder", "generator", "text_config")
  1033. for text_config_name in decoder_possible_text_config_names:
  1034. if text_config := model_config.get(text_config_name):
  1035. model_config = text_config
  1036. break
  1037. else:
  1038. model_config = model_config.get_text_config(decoder=True)
  1039. model_config = model_config.to_dict()
  1040. default_generation_config = GenerationConfig()
  1041. for attr in generation_config.to_dict():
  1042. is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
  1043. if attr in model_config and is_unset:
  1044. setattr(generation_config, attr, model_config[attr])
  1045. # If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
  1046. if not generation_config.return_dict_in_generate:
  1047. if any(
  1048. getattr(generation_config, extra_output_flag, False)
  1049. for extra_output_flag in generation_config.extra_output_flags
  1050. ):
  1051. generation_config.return_dict_in_generate = True
  1052. # Hash to detect whether the instance was modified
  1053. generation_config._original_object_hash = hash(generation_config)
  1054. return generation_config
  1055. def update(self, defaults_only=False, allow_custom_entries=False, **kwargs):
  1056. """
  1057. Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
  1058. returning all the unused kwargs.
  1059. Args:
  1060. defaults_only (`bool`, *optional*, defaults to `False`):
  1061. Whether to update all keys in config with `kwargs` or only those that are set to `None` (i.e. default value).
  1062. allow_custom_entries (`bool`, *optional*, defaults to `False`):
  1063. Whether to allow updating custom entries into the config with `kwargs` if not present in the current config.
  1064. kwargs (`dict[str, Any]`):
  1065. Dictionary of attributes to tentatively update this class.
  1066. Returns:
  1067. `dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
  1068. """
  1069. to_remove = []
  1070. for key, value in kwargs.items():
  1071. if allow_custom_entries and not hasattr(self, key):
  1072. setattr(self, key, value)
  1073. to_remove.append(key)
  1074. elif hasattr(self, key):
  1075. if not defaults_only or getattr(self, key) is None:
  1076. setattr(self, key, value)
  1077. to_remove.append(key)
  1078. # Confirm that the updated instance is still valid
  1079. self.validate()
  1080. # Remove all the attributes that were updated, without modifying the input dict
  1081. unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
  1082. return unused_kwargs
  1083. @dataclass
  1084. class BaseWatermarkingConfig(ABC):
  1085. """Generic watermarking config"""
  1086. @classmethod
  1087. def from_dict(cls, config_dict, **kwargs):
  1088. """
  1089. Constructs a BaseWatermarkingConfig instance from a dictionary of parameters.
  1090. Args:
  1091. config_dict (dict[str, Any]): Dictionary containing configuration parameters.
  1092. **kwargs: Additional keyword arguments to override dictionary values.
  1093. Returns:
  1094. BaseWatermarkingConfig: Instance of BaseWatermarkingConfig constructed from the dictionary.
  1095. """
  1096. config = cls(**config_dict)
  1097. to_remove = []
  1098. for key, value in kwargs.items():
  1099. if hasattr(config, key):
  1100. setattr(config, key, value)
  1101. to_remove.append(key)
  1102. for key in to_remove:
  1103. kwargs.pop(key, None)
  1104. return config
  1105. def to_json_file(self, json_file_path: str | os.PathLike):
  1106. """
  1107. Save this instance to a JSON file.
  1108. Args:
  1109. json_file_path (Union[str, os.PathLike]): Path to the JSON file in which this configuration instance's parameters will be saved.
  1110. """
  1111. with open(json_file_path, "w", encoding="utf-8") as writer:
  1112. config_dict = self.to_dict()
  1113. json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
  1114. writer.write(json_string)
  1115. def to_dict(self) -> dict[str, Any]:
  1116. """
  1117. Serializes this instance to a Python dictionary.
  1118. Returns:
  1119. dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
  1120. """
  1121. output = copy.deepcopy(self.__dict__)
  1122. return output
  1123. def __iter__(self):
  1124. yield from copy.deepcopy(self.__dict__).items()
  1125. def __repr__(self):
  1126. return f"{self.__class__.__name__} {self.to_json_string()}"
  1127. def to_json_string(self):
  1128. """
  1129. Serializes this instance to a JSON formatted string.
  1130. Returns:
  1131. str: JSON formatted string representing the configuration instance.
  1132. """
  1133. return json.dumps(self.__dict__, indent=2) + "\n"
  1134. def update(self, **kwargs):
  1135. """
  1136. Update the configuration attributes with new values.
  1137. Args:
  1138. **kwargs: Keyword arguments representing configuration attributes and their new values.
  1139. """
  1140. for key, value in kwargs.items():
  1141. if hasattr(self, key):
  1142. setattr(self, key, value)
  1143. @abstractmethod
  1144. def validate(self): ...
  1145. @abstractmethod
  1146. def construct_processor(self, vocab_size): ...
  1147. @dataclass
  1148. class WatermarkingConfig(BaseWatermarkingConfig):
  1149. """
  1150. Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`.
  1151. See [this paper](https://huggingface.co/papers/2306.04634) for more details on the arguments.
  1152. Accepts the following keys:
  1153. - greenlist_ratio (`float`):
  1154. Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25.
  1155. - bias (`float`):
  1156. Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0.
  1157. - hashing_key (`int`):
  1158. Hashing key used for watermarking. Defaults to 15485863 (the millionth prime).
  1159. - seeding_scheme (`str`):
  1160. Algorithm to use for watermarking. Accepts values:
  1161. - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper)
  1162. - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper)
  1163. The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash".
  1164. - context_width(`int`):
  1165. The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust.
  1166. """
  1167. def __init__(
  1168. self,
  1169. greenlist_ratio: float = 0.25,
  1170. bias: float = 2.0,
  1171. hashing_key: int = 15485863,
  1172. seeding_scheme: str = "lefthash",
  1173. context_width: int = 1,
  1174. ):
  1175. self.greenlist_ratio = greenlist_ratio
  1176. self.bias = bias
  1177. self.hashing_key = hashing_key
  1178. self.seeding_scheme = seeding_scheme
  1179. self.context_width = context_width
  1180. def validate(self):
  1181. watermark_missing_arg_msg = (
  1182. "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` "
  1183. "but found {found_value}"
  1184. )
  1185. if self.seeding_scheme not in ["selfhash", "lefthash"]:
  1186. raise ValueError(
  1187. watermark_missing_arg_msg.format(
  1188. key="seeding_scheme",
  1189. correct_value="[`selfhash`, `lefthash`]",
  1190. found_value=self.seeding_scheme,
  1191. ),
  1192. )
  1193. if not 0.0 <= self.greenlist_ratio <= 1.0:
  1194. raise ValueError(
  1195. watermark_missing_arg_msg.format(
  1196. key="greenlist_ratio",
  1197. correct_value="in range between 0.0 and 1.0",
  1198. found_value=self.seeding_scheme,
  1199. ),
  1200. )
  1201. if not self.context_width >= 1:
  1202. raise ValueError(
  1203. watermark_missing_arg_msg.format(
  1204. key="context_width",
  1205. correct_value="a positive integer",
  1206. found_value=self.context_width,
  1207. ),
  1208. )
  1209. def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor":
  1210. return WatermarkLogitsProcessor(
  1211. vocab_size=vocab_size,
  1212. device=device,
  1213. greenlist_ratio=self.greenlist_ratio,
  1214. bias=self.bias,
  1215. hashing_key=self.hashing_key,
  1216. seeding_scheme=self.seeding_scheme,
  1217. context_width=self.context_width,
  1218. )
  1219. @dataclass
  1220. class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
  1221. """
  1222. Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`.
  1223. See [this paper](https://www.nature.com/articles/s41586-024-08025-4) for more details on the arguments.
  1224. Args:
  1225. ngram_len (`int`):
  1226. Ngram length.
  1227. keys (`list[int]`):
  1228. A sequence of watermarking keys, one for each depth.
  1229. context_history_size (`int`, *optional*, defaults to 1024):
  1230. Size of the tensor to keep track of seen contexts.
  1231. sampling_table_seed (`int`, *optional*, defaults to 0):
  1232. Random seed to generate the sampling table.
  1233. sampling_table_size (`int`, *optional*, defaults to 65536):
  1234. Size of the sampling table.
  1235. skip_first_ngram_calls (`bool`, *optional*, defaults to `False`):
  1236. Whether to skip first ngram calls.
  1237. debug_mode (`bool`, optional, *optional*, defaults to `False`):
  1238. Logits are modified to uniform one got before watermarking modification is applied. This is to test the
  1239. implementation.
  1240. Examples:
  1241. ```python
  1242. >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig
  1243. >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left")
  1244. >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
  1245. >>> # SynthID Text configuration
  1246. >>> watermarking_config = SynthIDTextWatermarkingConfig(
  1247. ... keys=[654, 400, 836, 123, 340, 443, 597, 160, 57],
  1248. ... ngram_len=5,
  1249. ... )
  1250. >>> # Generation with watermarking
  1251. >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True)
  1252. >>> output_sequences = model.generate(
  1253. ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10
  1254. ... )
  1255. >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
  1256. ```
  1257. """
  1258. def __init__(
  1259. self,
  1260. ngram_len: int,
  1261. keys: list[int],
  1262. context_history_size: int = 1024,
  1263. sampling_table_seed: int = 0,
  1264. sampling_table_size: int = 2**16,
  1265. skip_first_ngram_calls: bool = False,
  1266. debug_mode: bool = False,
  1267. ):
  1268. self.ngram_len = ngram_len
  1269. self.keys = keys
  1270. self.sampling_table_size = sampling_table_size
  1271. self.sampling_table_seed = sampling_table_seed
  1272. self.context_history_size = context_history_size
  1273. self.skip_first_ngram_calls = skip_first_ngram_calls
  1274. self.debug_mode = debug_mode
  1275. def validate(self):
  1276. watermark_missing_arg_msg = (
  1277. "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` "
  1278. "but found {found_value}"
  1279. )
  1280. if self.sampling_table_size > 2**24:
  1281. raise ValueError(
  1282. watermark_missing_arg_msg.format(
  1283. key="sampling_table_size",
  1284. correct_value="< 2**24",
  1285. found_value=self.sampling_table_size,
  1286. ),
  1287. )
  1288. def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor":
  1289. return SynthIDTextWatermarkLogitsProcessor(
  1290. ngram_len=self.ngram_len,
  1291. keys=self.keys,
  1292. sampling_table_size=self.sampling_table_size,
  1293. sampling_table_seed=self.sampling_table_seed,
  1294. context_history_size=self.context_history_size,
  1295. device=device,
  1296. skip_first_ngram_calls=self.skip_first_ngram_calls,
  1297. debug_mode=self.debug_mode,
  1298. )
  1299. @dataclass
  1300. class CompileConfig:
  1301. """
  1302. Class that holds arguments relative to `torch.compile` behavior, when using automatic compilation in `generate`.
  1303. See [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) for more details on the arguments.
  1304. Args:
  1305. fullgraph (`bool`, *optional*, defaults to `False`):
  1306. If False (default), attempts to discover compilable regions that will be optimized. If True, then require
  1307. that the entire function be capturable into a single graph. If this is not possible (that is, if there are
  1308. graph breaks), then an error will be raised.
  1309. dynamic (`bool` or `None`, *optional*):
  1310. Whether to try to use dynamic shape graphs.
  1311. backend (`str` or `Callable`, *optional*, defaults to `"inductor"`):
  1312. Backend to be used.
  1313. mode (`str`, *optional*, defaults to `"reduce-overhead"`):
  1314. Controls balance between performance and overhead.
  1315. options (`dict`, *optional*):
  1316. A dictionary of options to pass to the backend.
  1317. Examples:
  1318. ```python
  1319. >>> from transformers import AutoModelForCausalLM, AutoTokenizer, CompileConfig
  1320. >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
  1321. >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').cuda()
  1322. >>> # Automatic compile configuration, used with static cache
  1323. >>> compile_config = CompileConfig(dynamic=True)
  1324. >>> # Generation with static cache and compile config
  1325. >>> input = tokenizer.encode("Hello there, how", return_tensors="pt").cuda()
  1326. >>> output = model.generate(
  1327. ... input, do_sample=False, max_new_tokens=300, cache_implementation="static", compile_config=compile_config
  1328. ... )
  1329. >>> output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
  1330. ```
  1331. """
  1332. fullgraph: bool = False
  1333. dynamic: bool | None = None
  1334. backend: str | Callable = "inductor"
  1335. mode: str = "reduce-overhead"
  1336. options: dict | None = None
  1337. # Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
  1338. _compile_all_devices = None
  1339. def to_dict(self) -> dict[str, Any]:
  1340. """Serializes this instance to a Python dictionary."""
  1341. return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})
  1342. # TODO: add the @strict decorator to prevent attributes passed as args rather than kwargs
  1343. @dataclass
  1344. class ContinuousBatchingConfig:
  1345. """
  1346. Class that holds arguments relative to continuous batching, when using continuous batching through the
  1347. `generate_batch` method or the `continuous_batching_context_manager` context manager.
  1348. Args:
  1349. block_size (`int`, *optional*, defaults to 256):
  1350. Size of each KV cache block in tokens.
  1351. num_blocks (`int`, *optional*):
  1352. Number of blocks in the KV cache. Auto-inferred from GPU memory when `None`.
  1353. max_batch_tokens (`int`, *optional*):
  1354. Maximum number of tokens in a batch. Auto-inferred from GPU memory when `None`.
  1355. max_memory_percent (`float`, *optional*, defaults to 0.8):
  1356. Maximum percentage of free GPU memory (after the model is loaded) to use for the KV cache.
  1357. max_blocks_per_request (`int`, *optional*, defaults to 0):
  1358. Maximum blocks per request, used in the `flash_attn_with_kvcache` fast decode path to dimension
  1359. the block table. Setting this to 0 disables the fast decode path.
  1360. allow_block_sharing (`bool`, *optional*, defaults to `True`):
  1361. Whether to allow block sharing for prefix caching. Block sharing can only be allowed, never forced,
  1362. as some models do not support it. Disable if you have few short prompts but long generation lengths.
  1363. use_async_batching (`bool`, *optional*):
  1364. Whether to enable async double-buffering, which removes CPU overhead from the continuous batching
  1365. loop at the cost of doubled VRAM usage. Auto-detected when `None`.
  1366. use_cuda_graph (`bool`, *optional*):
  1367. Whether to enable CUDA graphs. Auto-inferred when `None`.
  1368. q_padding_interval_size (`int`, *optional*, defaults to 0):
  1369. Query padding granularity in tokens for CUDA graphs. Uses a preset from `continuous_api.py` when
  1370. set to 0.
  1371. kv_padding_interval_size (`int`, *optional*, defaults to 0):
  1372. KV padding granularity in tokens for CUDA graphs. Uses a preset from `continuous_api.py` when
  1373. set to 0.
  1374. max_cached_graphs (`int`, *optional*, defaults to 0):
  1375. Maximum number of cached CUDA graphs. Uses a preset from `continuous_api.py` when set to 0.
  1376. varlen_compile_config (`CompileConfig`, *optional*):
  1377. CompileConfig for varlen (prefill) path. Default is None (uses generation_config fallback)
  1378. The varlen path handles batches with varying query and KV lengths, often benefiting from dynamic=True.
  1379. decode_compile_config (`CompileConfig`, *optional*):
  1380. CompileConfig for decode (fast) path. Default is None (uses generation_config fallback)
  1381. The decode path handles batches has no dynamic KV length, so static shapes are a better fit.
  1382. use_default_compile_configs (`bool`, *optional*, defaults to `False`):
  1383. If True, a default compile config will be used for paths that are not explicitly set.
  1384. scheduler_type (`str`, *optional*, defaults to `"fifo"`):
  1385. Scheduler type to use.
  1386. return_logprobs (`bool`, *optional*, defaults to `False`):
  1387. Whether to return log probabilities along with the generated tokens.
  1388. max_queue_size (`int`, *optional*, defaults to 0):
  1389. Maximum request queue size for serving. 0 means unlimited.
  1390. """
  1391. # Size of each KV cache block
  1392. block_size: int = 256
  1393. # The number of blocks used in the KV cache and the maximum number of tokens in a batch. Once the block size is set,
  1394. # these can be auto inferred using GPU size.
  1395. num_blocks: int | None = None
  1396. max_batch_tokens: int | None = None
  1397. # The max percentage of free GPU memory (after the model is loaded) to use for the KV cache.
  1398. max_memory_percent: float = 0.8
  1399. # This is only used in the flash_attn_with_kvcache fast decode path to dimension the block table. If it is set to 0,
  1400. # the fast decode path will not be used. Currently turned off by default.
  1401. max_blocks_per_request: int | None = 0
  1402. # Block sharing can only be allowed, but never forced: some model just do not support it. If you only have a few
  1403. # short prompts, but long generation lengths, you might want to disable block sharing.
  1404. allow_block_sharing: bool = True
  1405. # Enables asynchronous batching. This removes the CPU overhead from the continuous batching loop, at the cost of
  1406. # doubling the VRAM usage. If None, will be automatically detected.
  1407. use_async_batching: bool | None = None
  1408. # If any of these parameters are set to a non-default, CUDA graphs will be used. Otherwise we automatically infer
  1409. # if they should be turned on. Padding interval sizes are in tokens and further explained in the docstring at the
  1410. # top of the continuous_batching/continuous_api.py file.
  1411. use_cuda_graph: bool | None = None
  1412. q_padding_interval_size: int = 0
  1413. kv_padding_interval_size: int = 0
  1414. max_cached_graphs: int = 0
  1415. # Compile configs for the two execution paths. If None, uses the compile_config from generation_config as fallback.
  1416. # The varlen path is used for prefill and when fast decode is unavailable. The decode path is used when
  1417. # max_blocks_per_request > 0 (fast decode with block table).
  1418. varlen_compile_config: CompileConfig | None = None
  1419. decode_compile_config: CompileConfig | None = None
  1420. # If this flag is set to True, a default compile config will be used for paths that are not explicitly set.
  1421. use_default_compile_configs: bool = False
  1422. # Scheduler type. FIFO by default. For all types available, checks SCHEDULER_MAPPING in scheduler.py
  1423. scheduler_type: str = "fifo"
  1424. # Whether to generate log probabilities, which is the log of the softmax of the processed logits. If True, the log
  1425. # probabilities will be returned along with the generated tokens in the generation output.
  1426. return_logprobs: bool = False
  1427. # The parameters below are mostly useful in the context of serving
  1428. max_queue_size: int = 0
  1429. def account_for_cb_deprecated_arguments(
  1430. self,
  1431. max_queue_size: int = 0,
  1432. q_padding_interval_size: int = 0,
  1433. kv_padding_interval_size: int = 0,
  1434. allow_block_sharing: bool = True,
  1435. use_async_batching: bool | None = None,
  1436. max_cached_graphs: int = 0,
  1437. ) -> None:
  1438. """Some arguments given to `generate_batch`, `init_continuous_batching` or `continuous_batching_context_manager`
  1439. are now deprecated and are expected inside the continuous batching config. This method checks if any were
  1440. passed and accounts for them in the continuous batching config. It raises a deprecation warning if any were
  1441. passed.
  1442. """
  1443. kwargs_to_warn = []
  1444. if max_queue_size > 0:
  1445. kwargs_to_warn.append("max_queue_size")
  1446. self.max_queue_size = max_queue_size
  1447. if q_padding_interval_size > 0:
  1448. kwargs_to_warn.append("q_padding_interval_size")
  1449. self.q_padding_interval_size = q_padding_interval_size
  1450. if kv_padding_interval_size > 0:
  1451. kwargs_to_warn.append("kv_padding_interval_size")
  1452. self.kv_padding_interval_size = kv_padding_interval_size
  1453. if not allow_block_sharing: # config default is True, so False means the user explicitly set it to False
  1454. kwargs_to_warn.append("allow_block_sharing")
  1455. self.allow_block_sharing = allow_block_sharing
  1456. if use_async_batching is not None:
  1457. kwargs_to_warn.append("use_async_batching")
  1458. self.use_async_batching = use_async_batching
  1459. if max_cached_graphs > 0:
  1460. kwargs_to_warn.append("max_cached_graphs")
  1461. self.max_cached_graphs = max_cached_graphs
  1462. if kwargs_to_warn:
  1463. logger.warning(
  1464. "The following arguments were provided to a continuous batching entry point instead of being passed "
  1465. "through the continuous_batching_config: " + ", ".join(kwargs_to_warn)
  1466. )
  1467. def decide_use_cuda_graphs(self, compile_config: CompileConfig | None, is_attn_mask_needed: bool) -> bool:
  1468. """Returns whether or not to use cuda graphs for continuous batching. If the user specified this in the config
  1469. or if they specified a parameter related to cuda graphs, they are turned on. Otherwise, we use a heuristic
  1470. based on the attention implementation: we turn on cuda graphs if and only if no attention mask is needed.
  1471. This function modifies the `use_cuda_graph` attribute of the config in place.
  1472. """
  1473. # If cuda is not available, we cannot use cuda graphs
  1474. import torch
  1475. if not torch.cuda.is_available():
  1476. if self.use_cuda_graph: # throw a warning only if the user intended to use cuda graphs
  1477. logger.warning(f"use_cuda_graph is True but {torch.cuda.is_available() = }: turning off cuda graphs.")
  1478. self.use_cuda_graph = False
  1479. # Else if use_cuda_graph is specified, we follow the user's choice
  1480. elif self.use_cuda_graph is not None:
  1481. pass # nothing to do but catch this case and wait for the function to return later
  1482. # Else if the user specified a parameter related to cuda graphs, we activate cuda graphs
  1483. elif self.q_padding_interval_size or self.kv_padding_interval_size or self.max_cached_graphs:
  1484. self.use_cuda_graph = True
  1485. # Else if a compile config was found, turn off cuda graphs if the compile config already uses them
  1486. elif compile_config is not None:
  1487. options = torch._inductor.list_mode_options().get(compile_config.mode, compile_config.options)
  1488. compile_uses_cudagraphs = options.get("triton.cudagraphs", False)
  1489. if compile_uses_cudagraphs:
  1490. logger.warning(
  1491. f"Compile config {compile_config.mode = } uses cudagraphs, which usually does not work well with "
  1492. "continuous batching. We recommend using mode 'default' or 'max-autotune-no-cudagraphs' instead."
  1493. )
  1494. self.use_cuda_graph = not compile_uses_cudagraphs # TODO: should this also match the dynamic shapes?
  1495. # Otherwise we have a default heuristic based on the attention implementation:
  1496. # attention implementations where an attention mask is needed suffer a lot more from the padding associated
  1497. # with cuda graphs, so default is to turn cuda graphs off for those implementations
  1498. else:
  1499. self.use_cuda_graph = not is_attn_mask_needed
  1500. logger.warning(
  1501. f"No behavior specified for use_cuda_graph, defaulting to {self.use_cuda_graph = } because "
  1502. f"{is_attn_mask_needed = }. If you want to save memory, turn off cuda graphs, but they tend to improve "
  1503. "performances by a lot."
  1504. )
  1505. # Return the decision
  1506. return self.use_cuda_graph
  1507. def decide_use_async_batching(self, is_attn_mask_needed: bool) -> bool:
  1508. """Returns whether or not to use asynchronous batching for continuous batching. If the user specified this in
  1509. the config, we follow their choice. Otherwise, we turn on asynchronous batching if and only if CUDA graphs are
  1510. turned on and no attention mask is needed.
  1511. This function modifies the `use_async_batching` attribute of the config in place.
  1512. """
  1513. # If the user specifies to use async or not, no need to decide ourselves
  1514. if self.use_async_batching is None:
  1515. self.use_async_batching = self.use_cuda_graph and not is_attn_mask_needed
  1516. logger.info(
  1517. f"No behavior specified for use_async_batching, choosing {self.use_async_batching = } because "
  1518. f"{self.use_cuda_graph = } and {not is_attn_mask_needed = }. If you want to save memory, you can "
  1519. "disable asynchronous batching but it will degrade performance."
  1520. )
  1521. return self.use_async_batching
  1522. def resolve_sentinel_values(self) -> None:
  1523. """For some parameters (padding intervals and max cached graphs), the default is a sentinel value of 0: that
  1524. way, if the user specifies a value for those parameters, we know they want it used, ie. we turn on cuda graphs.
  1525. But in the case the user does not specify those values, we still need them to resolve to a non-zero value.
  1526. This function takes care of that."""
  1527. # Interval sizes are in tokens for both Q and KV
  1528. if self.q_padding_interval_size == 0:
  1529. self.q_padding_interval_size = 64
  1530. if self.kv_padding_interval_size == 0:
  1531. self.kv_padding_interval_size = 64 * 256 # 64 blocks of 256 tokens ie. 16384 tokens
  1532. if self.max_cached_graphs == 0:
  1533. self.max_cached_graphs = 32
  1534. def resolve_compile_configs(
  1535. self, fallback_compile_config: CompileConfig | None, is_flash_attn: bool, decode_fast_path_available: bool
  1536. ) -> None:
  1537. """Resolve if the compile configs for varlen and decode paths, modifying these attributes in place if needed.
  1538. Default config use full compile over regional compile, because the throughput is significantly higher (~15%)"""
  1539. logger_ = logging.get_logger("ContinuousBatchingLogger")
  1540. # For each config, priority is: explicit config, default config, fallback config, None
  1541. if self.varlen_compile_config is None:
  1542. if self.use_default_compile_configs:
  1543. # Flash does not support fullgraph but other (sdpa and eager) do
  1544. fullgraph = not is_flash_attn
  1545. varlen_config = CompileConfig(mode="max-autotune-no-cudagraphs", fullgraph=fullgraph, dynamic=True)
  1546. elif fallback_compile_config is not None:
  1547. varlen_config = fallback_compile_config
  1548. else:
  1549. varlen_config = None
  1550. else:
  1551. varlen_config = self.varlen_compile_config
  1552. if self.decode_compile_config is None:
  1553. if self.use_default_compile_configs:
  1554. # Paged attention is wrapped in @torch.compiler.disable so we can't use fullgraph
  1555. decode_config = CompileConfig(mode="max-autotune-no-cudagraphs", fullgraph=False, dynamic=False)
  1556. elif fallback_compile_config is not None:
  1557. decode_config = fallback_compile_config
  1558. else:
  1559. decode_config = None
  1560. else:
  1561. decode_config = self.decode_compile_config
  1562. # For decode, we throw a warning if the fast decode path is not available and a compile config was found
  1563. if not decode_fast_path_available and self.decode_compile_config is not None:
  1564. decode_config = None
  1565. logger_.warning("A decode_compile_config was set but fast decode path is not available. Ignoring it.")
  1566. # Log what will be compiled
  1567. if varlen_config is not None:
  1568. logger_.info(f"Varlen path will be compiled with {varlen_config.to_dict()}")
  1569. if decode_config is not None:
  1570. logger_.info(f"Decode path will be compiled with {decode_config.to_dict()}")
  1571. # Modify in place
  1572. self.varlen_compile_config = varlen_config
  1573. self.decode_compile_config = decode_config