trainer.py 210 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
7427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412
  1. # Copyright 2020-present the HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
  16. """
  17. import contextlib
  18. import functools
  19. import glob
  20. import inspect
  21. import json
  22. import math
  23. import os
  24. import random
  25. import shutil
  26. import sys
  27. import tempfile
  28. import time
  29. import warnings
  30. from collections.abc import Callable, Iterator, Mapping
  31. from functools import partial
  32. from pathlib import Path
  33. from typing import TYPE_CHECKING, Any
  34. # Integrations must be imported before ML frameworks:
  35. # ruff: isort: off
  36. from .integrations import (
  37. get_reporting_integration_callbacks,
  38. )
  39. # ruff: isort: on
  40. import numpy as np
  41. import safetensors.torch
  42. import torch
  43. import torch.distributed as dist
  44. from huggingface_hub import CommitInfo, ModelCard, create_repo, upload_folder
  45. from packaging import version
  46. from torch import nn
  47. from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler
  48. from . import __version__
  49. from .configuration_utils import PreTrainedConfig
  50. from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
  51. from .debug_utils import DebugOption, DebugUnderflowOverflow
  52. from .feature_extraction_sequence_utils import SequenceFeatureExtractor
  53. from .feature_extraction_utils import FeatureExtractionMixin
  54. from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
  55. from .image_processing_utils import BaseImageProcessor
  56. from .integrations.deepspeed import (
  57. deepspeed_init,
  58. deepspeed_load_checkpoint,
  59. deepspeed_sp_compute_loss,
  60. is_deepspeed_available,
  61. propagate_args_to_deepspeed,
  62. )
  63. from .integrations.fsdp import get_fsdp_ckpt_kwargs, update_fsdp_plugin_peft
  64. from .integrations.liger import apply_liger_kernel
  65. from .integrations.neftune import activate_neftune, deactivate_neftune
  66. from .integrations.peft import MIN_PEFT_VERSION
  67. from .integrations.tpu import save_tpu_checkpoint, tpu_spmd_dataloader, wrap_model_xla_fsdp
  68. from .modelcard import TrainingSummary
  69. from .modeling_utils import PreTrainedModel, unwrap_model
  70. from .models.auto.modeling_auto import (
  71. MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
  72. MODEL_MAPPING_NAMES,
  73. )
  74. from .optimization import GreedyLR, get_scheduler
  75. from .processing_utils import ProcessorMixin
  76. from .tokenization_utils_base import PreTrainedTokenizerBase
  77. from .trainer_callback import (
  78. CallbackHandler,
  79. DefaultFlowCallback,
  80. ExportableState,
  81. PrinterCallback,
  82. ProgressCallback,
  83. TrainerCallback,
  84. TrainerControl,
  85. TrainerState,
  86. )
  87. from .trainer_optimizer import (
  88. _OPTIMIZER_HANDLERS,
  89. OptimizerContext,
  90. _parse_optim_args,
  91. is_optimizer_factory,
  92. )
  93. from .trainer_pt_utils import (
  94. EvalLoopContainer,
  95. IterableDatasetShard,
  96. LabelSmoother,
  97. LengthGroupedSampler,
  98. distributed_broadcast_scalars,
  99. find_batch_size,
  100. get_model_param_count,
  101. get_parameter_names,
  102. is_attention_mask_causal,
  103. nested_detach,
  104. nested_gather,
  105. reissue_pt_warnings,
  106. remove_dummy_checkpoint,
  107. safe_globals,
  108. set_rng_state_for_device,
  109. )
  110. from .trainer_utils import (
  111. PREFIX_CHECKPOINT_DIR,
  112. BestRun,
  113. EvalLoopOutput,
  114. EvalPrediction,
  115. HPSearchBackend,
  116. HubStrategy,
  117. PredictionOutput,
  118. RemoveColumnsCollator,
  119. SaveStrategy,
  120. TrainerMemoryTracker,
  121. TrainOutput,
  122. _is_peft_model,
  123. align_special_tokens,
  124. compare_trainer_and_checkpoint_args,
  125. default_compute_objective,
  126. denumpify_detensorize,
  127. enable_full_determinism,
  128. find_executable_batch_size,
  129. get_last_checkpoint,
  130. has_length,
  131. load_sharded_checkpoint,
  132. number_of_arguments,
  133. rotate_checkpoints,
  134. seed_worker,
  135. set_seed,
  136. sort_checkpoints,
  137. speed_metrics,
  138. suppress_progress_bars,
  139. unwrap_peft_model,
  140. validate_quantization_for_training,
  141. )
  142. from .training_args import OptimizerNames, ParallelMode, TrainingArguments
  143. from .utils import (
  144. ADAPTER_CONFIG_NAME,
  145. ADAPTER_SAFE_WEIGHTS_NAME,
  146. ADAPTER_WEIGHTS_NAME,
  147. CONFIG_NAME,
  148. GENERATION_CONFIG_NAME,
  149. SAFE_WEIGHTS_INDEX_NAME,
  150. SAFE_WEIGHTS_NAME,
  151. WEIGHTS_INDEX_NAME,
  152. WEIGHTS_NAME,
  153. XLA_FSDPV2_MIN_VERSION,
  154. PushInProgress,
  155. can_return_loss,
  156. check_torch_load_is_safe,
  157. find_labels,
  158. is_accelerate_available,
  159. is_datasets_available,
  160. is_in_notebook,
  161. is_peft_available,
  162. is_sagemaker_dp_enabled,
  163. is_sagemaker_mp_enabled,
  164. is_torch_hpu_available,
  165. is_torch_mlu_available,
  166. is_torch_musa_available,
  167. is_torch_npu_available,
  168. is_torch_xla_available,
  169. logging,
  170. )
  171. from .utils.import_utils import requires
  172. from .utils.quantization_config import QuantizationMethod
  # Callback classes attached by default. The Trainer docstring states that user-supplied
  # `callbacks` are added to this list of defaults rather than replacing it.
  173. DEFAULT_CALLBACKS = [DefaultFlowCallback]
  # Progress-reporting callback class: `ProgressCallback` unless `is_in_notebook()` is true
  # when this module is imported, in which case the notebook variant is used instead.
  174. DEFAULT_PROGRESS_CALLBACK = ProgressCallback
  175. if is_in_notebook():
  176. from .utils.notebook import NotebookProgressCallback
  177. DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback
  178. if is_datasets_available():
  179. import datasets
  # torch_xla is an optional dependency: its modules are imported only when
  # `is_torch_xla_available()` is true.
  180. if is_torch_xla_available():
  181. import torch_xla.core.xla_model as xm
  182. import torch_xla.debug.metrics as met
  183. import torch_xla.runtime as xr
  184. from torch_xla import __version__ as XLA_VERSION
  # True when the installed torch_xla version is at least `XLA_FSDPV2_MIN_VERSION`.
  185. IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION)
  # The SPMD module is imported only when that version requirement is met.
  186. if IS_XLA_FSDPV2_POST_2_2:
  187. import torch_xla.distributed.spmd as xs
  # NOTE(review): indentation is lost in this copy. This `else` presumably pairs with the outer
  # availability check, so the flag is always defined even without torch_xla — confirm against
  # the original file.
  188. else:
  189. IS_XLA_FSDPV2_POST_2_2 = False
  190. if is_sagemaker_mp_enabled():
  191. import smdistributed.modelparallel.torch as smp
  192. from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_nested_concat
  193. if is_peft_available():
  194. from peft import PeftModel
  195. if is_accelerate_available():
  196. from accelerate import Accelerator, skip_first_batches
  197. from accelerate.state import AcceleratorState
  198. from accelerate.utils import (
  199. DataLoaderConfiguration,
  200. DistributedDataParallelKwargs,
  201. DistributedType,
  202. GradientAccumulationPlugin,
  203. load_fsdp_model,
  204. load_fsdp_optimizer,
  205. release_memory,
  206. save_fsdp_model,
  207. save_fsdp_optimizer,
  208. )
  209. from accelerate.utils.memory import clear_device_cache
  210. if is_deepspeed_available():
  211. from accelerate.utils import DeepSpeedSchedulerWrapper
  212. if TYPE_CHECKING:
  213. import optuna
  # Module-level logger. `logging` here is the object imported from `.utils` above, not the
  # standard-library module.
  214. logger = logging.get_logger(__name__)
  215. # Names of the files used for checkpointing.
  # NOTE(review): what each file contains is suggested by its name only; confirm at the
  # save/load sites, which are not visible in this part of the file.
  216. TRAINING_ARGS_NAME = "training_args.bin"
  217. TRAINER_STATE_NAME = "trainer_state.json"
  218. OPTIMIZER_NAME = "optimizer.pt"
  219. SCALER_NAME = "scaler.pt"
  # Second optimizer file name with a `.bin` extension; which code path uses it is not visible
  # here — TODO confirm.
  220. OPTIMIZER_NAME_BIN = "optimizer.bin"
  221. SCHEDULER_NAME = "scheduler.pt"
  # Unlike the names above, this one has no file extension — presumably a base name or
  # directory name; verify against the FSDP save/load code.
  222. FSDP_MODEL_NAME = "pytorch_model_fsdp"
  223. @requires(
  224. backends=(
  225. "torch",
  226. "accelerate",
  227. )
  228. )
  229. class Trainer:
  230. """
  231. Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
  232. Args:
  233. model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
  234. The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
  235. <Tip>
  236. [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
  237. your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
  238. models.
  239. </Tip>
  240. args ([`TrainingArguments`], *optional*):
  241. The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
  242. `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
  243. data_collator (`DataCollator`, *optional*):
  244. The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
  245. default to [`default_data_collator`] if no `processing_class` is provided, an instance of
  246. [`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer.
  247. train_dataset (`torch.utils.data.Dataset` | `torch.utils.data.IterableDataset` | `datasets.Dataset`, *optional*):
  248. The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
  249. `model.forward()` method are automatically removed.
  250. Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
  251. distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
  252. `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
  253. manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
  254. sets the seed of the RNGs used.
  255. eval_dataset (`torch.utils.data.Dataset` | dict[str, `torch.utils.data.Dataset`] | `datasets.Dataset`, *optional*):
  256. The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
  257. `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
  258. dataset prepending the dictionary key to the metric name.
  259. processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
  260. Processing class used to process the data. If provided, will be used to automatically process the inputs
  261. for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
  262. reuse the fine-tuned model.
  263. model_init (`Callable[[], PreTrainedModel]`, *optional*):
  264. A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
  265. from a new instance of the model as given by this function.
  266. The function may have zero arguments, or a single one containing the optuna/Ray Tune trial object, to
  267. be able to choose different architectures according to hyperparameters (such as layer count, sizes of
  268. inner layers, dropout probabilities etc).
  269. compute_loss_func (`Callable`, *optional*):
  270. A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
  271. batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`].
  272. compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
  273. The function that will be used to compute metrics at evaluation. Must take an [`EvalPrediction`] and return
  274. a dictionary mapping metric names (strings) to metric values. *Note*: when passing `TrainingArguments` with `batch_eval_metrics` set to
  275. `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
  276. after the last eval batch to signal that the function needs to calculate and return the global summary
  277. statistics rather than accumulating the batch-level statistics.
  278. callbacks (List of [`TrainerCallback`], *optional*):
  279. A list of callbacks to customize the training loop. Will add those to the list of default callbacks
  280. detailed in [here](callback).
  281. If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
  282. optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
  283. A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
  284. model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
  285. optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], dict[str, Any]]`, *optional*):
  286. A tuple containing the optimizer class and keyword arguments to use.
  287. Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument.
  288. Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer.
  289. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
  290. A function that preprocesses the logits right before caching them at each evaluation step. Must take two
  291. tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
  292. by this function will be reflected in the predictions received by `compute_metrics`.
  293. Note that the labels (second parameter) will be `None` if the dataset does not have them.
  294. Important attributes:
  295. - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
  296. subclass.
  297. - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
  298. original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
  299. the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
  300. model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
  301. - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
  302. data parallelism, this means some of the model layers are split on different GPUs).
  303. - **place_model_on_device** -- Whether or not to automatically place the model on the device. Defaults to
  304. `True` unless model parallel, DeepSpeed, FSDP, full fp16/bf16 eval, or SageMaker MP is active. Can be
  305. overridden by subclassing `TrainingArguments` and overriding the `place_model_on_device` property.
  306. - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
  307. in `train`)
  308. """
  309. # Those methods are not used in Trainer itself but are available as methods for external use.
  310. from .trainer_pt_utils import (
  311. get_learning_rates,
  312. get_num_trainable_parameters,
  313. get_optimizer_group,
  314. log_metrics,
  315. metrics_format,
  316. save_metrics,
  317. save_state,
  318. )
  319. # ---- Initialization & Validation ----
  def __init__(
      self,
      model: PreTrainedModel | nn.Module | None = None,
      args: TrainingArguments | None = None,
      data_collator: DataCollator | None = None,
      train_dataset: "Dataset | IterableDataset | datasets.Dataset | None" = None,
      eval_dataset: "Dataset | dict[str, Dataset] | datasets.Dataset | None" = None,
      processing_class: PreTrainedTokenizerBase
      | BaseImageProcessor
      | FeatureExtractionMixin
      | ProcessorMixin
      | None = None,
      model_init: Callable[..., PreTrainedModel] | None = None,
      compute_loss_func: Callable | None = None,
      compute_metrics: Callable[[EvalPrediction], dict] | None = None,
      callbacks: list[TrainerCallback] | None = None,
      optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
      optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None,
      preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
  ):
      """
      Resolve the arguments and model, create the accelerator, register callbacks and set up training state.

      Raises:
          RuntimeError: If neither `model` nor `model_init` is given.
          ValueError: If both `model` and `model_init` are given, if the model class is listed in
              `MODEL_MAPPING_NAMES` (no task head), or if FSDP is combined with DeepSpeed or used outside
              distributed training. `_validate_args` and `create_accelerator_and_postprocess` are called from
              here and may raise as well.
      """
      # Init flow:
      #   1. Args & seed            – defaults, determinism
      #   2. Accelerator & logging  – accelerator, memory tracker, log level, device setup
      #   3. Model resolution       – model / model_init, Liger Kernel, quantization checks
      #   4. Distributed strategy   – model-parallel, FSDP, SageMaker MP flags
      #   5. Device placement       – move model to device, model wrapping
      #   6. Model introspection    – loss kwargs, label names, label smoother
      #   7. Store init arguments   – data, callables, optimizer, scheduler, validation
      #   8. Callbacks              – reporting integrations, JIT checkpoint, progress bar
      #   9. Hub & output           – repo init, output directory
      #  10. Training state         – TrainerState, TrainerControl, internal bookkeeping
      #  11. Finalize               – use_cache, XLA FSDPv2 mesh, memory tracker stop

      # ---- 1. Args & seed --------------------------------------------------------
      if args is None:
          output_dir = "tmp_trainer"
          logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
          args = TrainingArguments(output_dir=output_dir)
      self.args = args
      # Seed must be set before instantiating the model when using model_init
      if self.args.full_determinism:
          enable_full_determinism(self.args.seed)
      else:
          set_seed(self.args.seed)

      # ---- 2. Accelerator & logging ----------------------------------------------
      # `create_accelerator_and_postprocess` reads self.model and self.args,
      # and may set self.deepspeed — store temporary refs before calling it.
      self.deepspeed = None
      self.model = model
      self.create_accelerator_and_postprocess()

      self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
      self._memory_tracker.start()

      log_level = args.get_process_log_level()
      logging.set_verbosity(log_level)

      args._setup_devices  # force device and distributed setup init explicitly

      # ---- 3. Model resolution ----------------------------------------------------
      if model is None:
          if model_init is not None:
              self.model_init = model_init
              model = self.call_model_init()
          else:
              raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")
      else:
          if model_init is not None:
              raise ValueError("`Trainer` requires either a `model` or `model_init` argument, but not both.")
          self.model_init = model_init

      if model.__class__.__name__ in MODEL_MAPPING_NAMES:
          raise ValueError(
              f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
              "computes hidden states and does not accept any labels. You should choose a model with a head "
              "suitable for your task like any of the `AutoModelForXxx` listed at "
              "https://huggingface.co/docs/transformers/model_doc/auto"
          )

      validate_quantization_for_training(model)

      # ---- 4. Distributed strategy ------------------------------------------------
      self.is_model_parallel = False
      if getattr(model, "hf_device_map", None) is not None:
          devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
          if len(devices) > 1:
              self.is_model_parallel = True
          elif len(devices) == 1:
              self.is_model_parallel = self.args.device != torch.device(devices[0])

      self.is_fsdp_xla_enabled = args.fsdp_config["xla"]
      if len(args.fsdp) > 0:
          if self.is_deepspeed_enabled:
              raise ValueError(
                  "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
              )
          if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
              raise ValueError("Using fsdp only works in distributed training.")

      # Postpone switching model to cuda when MP, DeepSpeed, full bf16/fp16 eval, or FSDP
      if args.place_model_on_device is not None:
          self.place_model_on_device = args.place_model_on_device
      elif (
          self.is_model_parallel
          or self.is_deepspeed_enabled
          or (args.fp16_full_eval or args.bf16_full_eval)
          or self.is_fsdp_xla_enabled
          or self.is_fsdp_enabled
          or is_sagemaker_mp_enabled()
      ):
          self.place_model_on_device = False
      else:
          self.place_model_on_device = True

      # ---- 5. Device placement ----------------------------------------------------
      # Bnb Quantized models don't support `.to` operation.
      if (
          self.place_model_on_device
          and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
      ):
          self._move_model_to_device(model, args.device)

      # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
      if self.is_model_parallel:
          self.args._n_gpu = 1

      # `self.model is self.model_wrapped` is used later to check if it's wrapped
      self.model_wrapped = model
      self.model = model

      # ---- 6. Model introspection -------------------------------------------------
      unwrapped_model = unwrap_peft_model(self.accelerator.unwrap_model(model))
      if hasattr(unwrapped_model, "accepts_loss_kwargs"):
          self.model_accepts_loss_kwargs = unwrapped_model.accepts_loss_kwargs
      else:
          forward_params = inspect.signature(unwrapped_model.forward).parameters
          self.model_accepts_loss_kwargs = any(
              k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values()
          )

      # Sequence Parallelism computes its own good_tokens count
      pc = getattr(self.accelerator, "parallelism_config", None)
      if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled:
          self.model_accepts_loss_kwargs = False

      model_to_inspect = unwrap_peft_model(self.model)
      default_label_names = find_labels(model_to_inspect.__class__)
      self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
      self.can_return_loss = can_return_loss(model_to_inspect.__class__)

      if self.args.label_smoothing_factor != 0:
          # A plain `nn.Module` may not expose `config`; guard the lookup the same way step 11 does
          # so label smoothing does not fail with an AttributeError for such models.
          model_config = getattr(self.model, "config", None)
          if getattr(model_config, "problem_type", None) == "multi_label_classification":
              warnings.warn(
                  "Label smoothing is not compatible with multi-label classification. "
                  "Disabling label smoothing for this training run.",
                  UserWarning,
              )
              self.label_smoother = None
          else:
              self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
      else:
          self.label_smoother = None

      # ---- 7. Store init arguments ------------------------------------------------
      # Data
      default_collator = (
          DataCollatorWithPadding(processing_class)
          if processing_class is not None
          and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
          else default_data_collator
      )
      self.data_collator = data_collator if data_collator is not None else default_collator
      self.train_dataset = train_dataset
      self.eval_dataset = eval_dataset
      self.processing_class = processing_class
      self.neftune_noise_alpha = args.neftune_noise_alpha

      # Callables
      self.compute_loss_func = compute_loss_func
      self.compute_metrics = compute_metrics
      self.preprocess_logits_for_metrics = preprocess_logits_for_metrics

      # Optimizer & scheduler
      self.optimizer, self.lr_scheduler = optimizers
      self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs

      self._validate_args()

      # ---- 8. Callbacks -----------------------------------------------------------
      default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
      if self.args.enable_jit_checkpoint:
          from .trainer_jit_checkpoint import JITCheckpointCallback

          jit_callback = JITCheckpointCallback()
          default_callbacks = default_callbacks + [jit_callback]
          jit_callback.set_trainer(self)
      callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
      self.callback_handler = CallbackHandler(
          callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
      )
      self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)

      # ---- 9. Hub & output ---------------------------------------------------------
      self.hub_model_id = None  # Set by init_hf_repo() when push_to_hub is enabled
      if self.args.push_to_hub:
          self.init_hf_repo()
      if self.args.should_save:
          os.makedirs(self.args.output_dir, exist_ok=True)

      # ---- 10. Training state -----------------------------------------------------
      self.control = TrainerControl()
      self.state = TrainerState(
          is_local_process_zero=self.is_local_process_zero(),
          is_world_process_zero=self.is_world_process_zero(),
          stateful_callbacks=[
              cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
          ],
      )
      self.is_in_train = False  # True between train() entry and exit
      self.hp_name = None  # Set by hyperparameter_search() to label the trial
      self.hp_search_backend = None  # Set by hyperparameter_search() (optuna / ray / wandb)
      # Per-process FLOP counter; accumulated into self.state.total_flos then reset
      self.current_flos = 0
      # Set True by _setup_loggers() on first call to self.log()
      self._loggers_initialized = False
      # Lazily filled by _set_signature_columns_if_needed(); caches model.forward param names
      self._signature_columns = None
      # Effective batch size; may be reduced by find_executable_batch_size
      self._train_batch_size = args.train_batch_size
      # Guards one-time LR scheduler creation in create_optimizer_and_scheduler
      self._created_lr_scheduler = False

      self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)

      # ---- 11. Finalize -----------------------------------------------------------
      if getattr(self.model, "config", None) is not None:
          self.model.config.use_cache = self.args.use_cache

      self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
      if self.is_fsdp_xla_v2_enabled:
          if not IS_XLA_FSDPV2_POST_2_2:
              raise ValueError("FSDPv2 requires `torch_xla` 2.2 or higher.")
          num_devices = xr.global_runtime_device_count()
          xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor")))
      self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled

      self._memory_tracker.stop_and_update_metrics()
  def _validate_args(self) -> None:
      """
      Validate constructor arguments and fail fast on incompatible combinations.

      Reads `self.args`, `self.compute_metrics`, `self.eval_dataset`, `self.train_dataset`, `self.data_collator`,
      `self.optimizer`, `self.lr_scheduler`, `self.optimizer_cls_and_kwargs` and `self.model_init`, so it must be
      called after those attributes are set. May overwrite `args.fp16` with the SageMaker MP setting.

      Raises:
          ValueError: On an unsupported argument combination (BF16 with SageMaker MP, `batch_eval_metrics` without
              a `compute_result` parameter, evaluation enabled without `eval_dataset`, missing
              `metric_for_best_model`, model/optimizer device mismatch on XLA, unsized train dataset without
              `max_steps`).
          RuntimeError: When `optimizers` is combined with `optimizer_cls_and_kwargs`, `model_init` or FSDP.
          TypeError: When `data_collator` is not callable but exposes a callable `collate_batch`.
      """
      args = self.args

      # --- SageMaker Model Parallel mixed-precision validation ---
      if is_sagemaker_mp_enabled():
          if args.bf16:
              raise ValueError("SageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead ")
          if args.fp16 != smp.state.cfg.fp16:
              logger.warning(
                  f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, "
                  f"but FP16 provided in trainer argument is {args.fp16}, "
                  f"setting to {smp.state.cfg.fp16}"
              )
              args.fp16 = smp.state.cfg.fp16

      # --- Training-argument validations ---
      if args.batch_eval_metrics and self.compute_metrics is not None:
          if "compute_result" not in inspect.signature(self.compute_metrics).parameters:
              raise ValueError(
                  "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`"
                  " boolean argument which will be triggered after the last batch of the eval set to signal that the"
                  " summary statistics should be returned by the function."
              )
      if args.eval_strategy is not None and args.eval_strategy != "no" and self.eval_dataset is None:
          raise ValueError(
              f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. "
          )
      if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end:
          if args.metric_for_best_model is None:
              raise ValueError(
                  "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`."
              )

      # --- Optimizer validations ---
      if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None:
          raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.")
      if self.model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None):
          raise RuntimeError(
              "Passing a `model_init` is incompatible with providing the `optimizers` argument. "
              "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
          )
      if is_torch_xla_available() and self.optimizer is not None:
          # Take the device of the first model parameter and of the first optimizer-managed parameter.
          # `next(..., None)` keeps both names bound when the model has no parameters or every
          # param group is empty; the comparison is skipped in that case since there is nothing to compare.
          model_device = next((param.device for param in self.model.parameters()), None)
          optimizer_device = next(
              (
                  param_group["params"][0].device
                  for param_group in self.optimizer.param_groups
                  if len(param_group["params"]) > 0
              ),
              None,
          )
          if model_device is not None and optimizer_device is not None and model_device != optimizer_device:
              raise ValueError(
                  "The model and the optimizer parameters are not on the same device, which probably means you"
                  " created an optimizer around your model **before** putting on the device and passing it to the"
                  " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and"
                  " `model.to(xm.xla_device())` is performed before the optimizer creation in your script."
              )
      if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and (
          self.optimizer is not None or self.lr_scheduler is not None
      ):
          raise RuntimeError(
              "Passing `optimizers` is not allowed if PyTorch FSDP is enabled. "
              "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
          )

      # --- Dataset validations ---
      if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
          raise TypeError("The `data_collator` should be a simple callable (function, class with `__call__`).")
      if args.max_steps > 0 and args.num_train_epochs > 0:
          logger.info("max_steps is given, it will override any value given in num_train_epochs")
      if self.train_dataset is not None and not has_length(self.train_dataset) and args.max_steps <= 0:
          raise ValueError(
              "The train_dataset does not implement __len__, max_steps has to be specified. "
              "The number of steps needs to be known in advance for the learning rate scheduler."
          )
      if self.train_dataset is not None and isinstance(self.train_dataset, torch.utils.data.IterableDataset):
          logger.info(
              f"The `train_sampling_strategy='{args.train_sampling_strategy}'` option is ignored when using an `IterableDataset`. "
              "Samplers cannot be used with IterableDataset as they require indexed access to the dataset."
          )
  def _build_accelerator_args(self, **kwargs) -> dict[str, Any]:
      """
      Build the keyword arguments passed to `Accelerator`.

      Args:
          **kwargs: Extra entries merged into the result before the DDP, parallelism and dynamo entries are added.

      Returns:
          A dict containing `mixed_precision`, `deepspeed_plugin`, the caller's `kwargs`, a `kwargs_handlers` list
          holding the DDP settings, and, when applicable, `parallelism_config` and `dynamo_plugin`.

      Raises:
          ImportError: If `args.parallelism_config` is set but accelerate is older than the required version.
          ValueError: If the model has `tp_size > 1`, no `parallelism_config` was given and accelerate is too old.
      """
      min_accelerate_version = "1.12.0"

      accelerator_kwargs = {
          "mixed_precision": self.args.mixed_precision,
          "deepspeed_plugin": self.args.deepspeed_plugin,
      }
      accelerator_kwargs.update(kwargs)

      # --- DDP settings ---
      if self.args.ddp_find_unused_parameters is not None:
          find_unused = self.args.ddp_find_unused_parameters
      elif isinstance(self.model, PreTrainedModel):
          # find_unused_parameters breaks checkpointing as per
          # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
          find_unused = not (self.model.is_gradient_checkpointing or self.args.gradient_checkpointing)
      else:
          find_unused = True

      ddp_kwargs = {"find_unused_parameters": find_unused}
      if self.args.ddp_bucket_cap_mb is not None:
          ddp_kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb
      if self.args.ddp_broadcast_buffers is not None:
          ddp_kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers

      accelerator_kwargs["kwargs_handlers"] = [DistributedDataParallelKwargs(**ddp_kwargs)]

      # --- Parallelism config ---
      # We defer compatibility checks to accelerator
      if self.args.parallelism_config is not None:
          if not is_accelerate_available(min_accelerate_version):
              raise ImportError(
                  f"ParallelismConfig requires accelerate>={min_accelerate_version}. Please upgrade accelerate to use this feature."
              )
          accelerator_kwargs["parallelism_config"] = self.args.parallelism_config

      model_tp_size = getattr(self.model, "tp_size", None)
      if model_tp_size is not None and model_tp_size > 1:
          if self.args.parallelism_config is None:
              # No user-provided config: derive one from the model's tensor-parallel size.
              if not is_accelerate_available(min_accelerate_version):
                  raise ValueError("Requires accelerate>=1.12.0 to use Tensor Parallelism.")
              from accelerate import ParallelismConfig

              accelerator_kwargs["parallelism_config"] = ParallelismConfig(tp_size=model_tp_size)
          elif accelerator_kwargs["parallelism_config"].tp_size != model_tp_size:
              # The model's tp_size wins over the one in the user-provided config.
              accelerator_kwargs["parallelism_config"].tp_size = model_tp_size

      # --- torch.compile / dynamo ---
      if is_accelerate_available("1.2.0"):
          # if we don't have the correct version, we rely instead on the env vars that were set in TrainingArguments
          from accelerate.utils import TorchDynamoPlugin

          accelerator_kwargs["dynamo_plugin"] = TorchDynamoPlugin(
              backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode
          )

      return accelerator_kwargs
  def create_accelerator_and_postprocess(self) -> None:
      """
      Create the accelerator and perform post-creation setup (FSDP, DeepSpeed, etc.).

      Sets `self.accelerator`, `self.gather_function`, `self.is_deepspeed_enabled` and `self.is_fsdp_enabled`.
      May set `self.args.gradient_accumulation_steps` from the accelerator config's `num_steps`.

      Raises:
          ValueError: If `num_steps` is given in the accelerator config while `gradient_accumulation_steps > 1`,
              if FSDP activation checkpointing and `gradient_checkpointing` are both enabled, or if
              `save_only_model` / `auto_find_batch_size` are combined with an unsupported DeepSpeed/FSDP setup.
      """
      # We explicitly don't rely on the `Accelerator` to do gradient accumulation
      grad_acc_kwargs = {}
      configured_grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs
      if configured_grad_acc_kwargs is not None:
          # Work on a copy: `num_steps` is forced to 1 below, and writing that into the user's config would make
          # a later call see `num_steps` as user-provided and raise when gradient_accumulation_steps > 1.
          grad_acc_kwargs = dict(configured_grad_acc_kwargs)

      # check if num_steps is attempted to be passed in gradient_accumulation_kwargs
      if "num_steps" in grad_acc_kwargs:
          if self.args.gradient_accumulation_steps > 1:
              # raise because we do not know which setting is intended.
              raise ValueError(
                  "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`. "
                  "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`."
              )
          else:
              self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"]

      # The Trainer handles GAS itself, so GAS=1 in Accelerate to avoid any double-division
      grad_acc_kwargs["num_steps"] = 1

      # Just making sure that gradient_state have the correct values passed.
      # We don't rely on `accumulate` from accelerate to set sync_gradients in gradient_state.
      # Rather, we do it ourselves by setting self.accelerator.gradient_state._set_sync_gradients.
      gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)

      accelerator_config = self.args.accelerator_config.to_dict()

      # Extract dataloader config params from accelerator config
      dataloader_params = ["split_batches", "dispatch_batches", "even_batches", "use_seedable_sampler"]
      dataloader_config = DataLoaderConfiguration(
          **{param: accelerator_config.pop(param) for param in dataloader_params}
      )
      dataloader_config.data_seed = self.args.data_seed

      non_blocking = accelerator_config.pop("non_blocking")
      if non_blocking and not self.args.dataloader_pin_memory:
          logger.warning(
              "`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both."
          )
      dataloader_config.non_blocking = non_blocking

      # this would have been updated above, no need for it anymore
      accelerator_config.pop("gradient_accumulation_kwargs")

      fsdp_plugin = None
      if self.args.fsdp_plugin_args is not None:
          from accelerate.utils import FullyShardedDataParallelPlugin

          fsdp_plugin = FullyShardedDataParallelPlugin(**self.args.fsdp_plugin_args)

      args = self._build_accelerator_args(
          dataloader_config=dataloader_config,
          fsdp_plugin=fsdp_plugin,
          gradient_accumulation_plugin=gradient_accumulation_plugin,
      )

      # create accelerator object
      self.accelerator = Accelerator(**args)

      # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
      self.gather_function = self.accelerator.gather_for_metrics
      if "use_gather_object" in inspect.signature(self.gather_function).parameters:
          self.gather_function = functools.partial(
              self.gather_function, use_gather_object=self.args.eval_use_gather_object
          )

      # deepspeed and accelerate flags covering both trainer args and accelerate launcher
      self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
      self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None

      # post accelerator creation setup
      if self.is_fsdp_enabled:
          fsdp_plugin = self.accelerator.state.fsdp_plugin
          # Values from `args.fsdp_config` take precedence over what the plugin already holds.
          for param in ["limit_all_gathers", "activation_checkpointing"]:
              setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param)))
          if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing:
              raise ValueError(
                  "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg "
                  "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic "
                  "when using FSDP."
              )

      if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None:
          propagate_args_to_deepspeed(self.accelerator, self.args)

      # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
      if (
          self.args.save_only_model
          and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
          and self.args.load_best_model_at_end
      ):
          wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
          raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")

      # `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3
      if (
          self.is_deepspeed_enabled
          and self.accelerator.state.deepspeed_plugin.zero_stage == 3
          and self.args.auto_find_batch_size
      ):
          raise ValueError(
              "`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
          )

      if (
          self.args.save_only_model
          and self.is_fsdp_enabled
          and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
      ):
          raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")
  751. # ---- Data Loading ----
  def get_train_dataloader(self) -> DataLoader:
      """
      Build the [`~torch.utils.data.DataLoader`] used for training.

      The loader is created by `_get_dataloader` from `self.train_dataset`, with batch size
      `self._train_batch_size` and `_get_train_sampler` as the sampler factory.

      Subclass and override this method if you want to inject some custom behavior.

      Raises:
          ValueError: If no `train_dataset` was provided.
      """
      train_dataset = self.train_dataset
      if train_dataset is None:
          raise ValueError("Trainer: training requires a train_dataset.")

      loader_kwargs = {
          "dataset": train_dataset,
          "description": "Training",
          "batch_size": self._train_batch_size,
          "sampler_fn": self._get_train_sampler,
          "is_training": True,
      }
      return self._get_dataloader(**loader_kwargs)
  768. def get_eval_dataloader(self, eval_dataset: str | Dataset | None = None) -> DataLoader:
  769. """
  770. Returns the evaluation [`~torch.utils.data.DataLoader`].
  771. Subclass and override this method if you want to inject some custom behavior.
  772. Args:
  773. eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
  774. If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed.
  775. """
  776. if eval_dataset is None and self.eval_dataset is None:
  777. raise ValueError("Trainer: evaluation requires an eval_dataset.")
  778. # If we have persistent workers, don't do a fork bomb especially as eval datasets
  779. # don't change during training
  780. dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
  781. if (
  782. hasattr(self, "_eval_dataloaders")
  783. and dataloader_key in self._eval_dataloaders
  784. and self.args.dataloader_persistent_workers
  785. ):
  786. return self._eval_dataloaders[dataloader_key]
  787. eval_dataset = (
  788. self.eval_dataset[eval_dataset]
  789. if isinstance(eval_dataset, str)
  790. else eval_dataset
  791. if eval_dataset is not None
  792. else self.eval_dataset
  793. )
  794. return self._get_dataloader(
  795. dataset=eval_dataset,
  796. description="Evaluation",
  797. batch_size=self.args.eval_batch_size,
  798. sampler_fn=self._get_eval_sampler,
  799. dataloader_key=dataloader_key,
  800. )
  def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
      """
      Build the [`~torch.utils.data.DataLoader`] used for prediction on a test set.

      Subclass and override this method if you want to inject some custom behavior.

      Args:
          test_dataset (`torch.utils.data.Dataset`):
              The dataset to wrap. It is passed to `_get_dataloader` together with the evaluation batch size
              and the evaluation sampler factory.
      """
      loader_kwargs = {
          "dataset": test_dataset,
          "description": "test",
          "batch_size": self.args.eval_batch_size,
          "sampler_fn": self._get_eval_sampler,
      }
      return self._get_dataloader(**loader_kwargs)
  def num_examples(self, dataloader: DataLoader) -> int:
      """
      Return the number of samples behind a [`~torch.utils.data.DataLoader`].

      The length of `dataloader.dataset` is used when available (looking one level deeper for an
      `IterableDatasetShard`). If the dataset is missing or unsized, the count is estimated as
      `len(dataloader) * per_device_train_batch_size`.
      """
      try:
          dataset = dataloader.dataset
          # An IterableDatasetShard wraps the underlying dataset, so measure the wrapped one.
          sized = dataset.dataset if isinstance(dataset, IterableDatasetShard) else dataset
          return len(sized)
      except (NameError, AttributeError, TypeError):
          # no dataset or length, estimate by length of dataloader
          batches = len(dataloader)
          return batches * self.args.per_device_train_batch_size
  def _get_dataloader(
      self,
      dataset: Dataset,
      description: str,
      batch_size: int,
      sampler_fn: Callable[[Dataset], torch.utils.data.Sampler] | None = None,
      is_training: bool = False,
      dataloader_key: str | None = None,
  ) -> DataLoader:
      """
      Create a [`~torch.utils.data.DataLoader`] for `dataset` and prepare it with the accelerator.

      Args:
          dataset: The dataset to load from.
          description: Label forwarded to the unused-column removal helpers.
          batch_size: Batch size given to the `DataLoader`.
          sampler_fn: Optional factory called with the dataset to build a sampler (map-style datasets only).
          is_training: When `True`, a seeded `worker_init_fn` is attached (map-style datasets only).
          dataloader_key: When set and persistent workers are enabled, the prepared loader is stored in
              `self._eval_dataloaders` under this key.
      """
      collate_fn = self.data_collator
      if is_datasets_available() and isinstance(dataset, datasets.Dataset):
          dataset = self._remove_unused_columns(dataset, description=description)
      else:
          collate_fn = self._get_collator_with_removed_columns(self.data_collator, description=description)

      # MPS requires forking if multiple workers are specified
      use_fork = torch.backends.mps.is_available() and self.args.dataloader_num_workers > 1
      mp_context = "fork" if use_fork else None

      loader_kwargs = {
          "batch_size": batch_size,
          "collate_fn": collate_fn,
          "num_workers": self.args.dataloader_num_workers,
          "pin_memory": self.args.dataloader_pin_memory,
          "persistent_workers": self.args.dataloader_persistent_workers,
          "multiprocessing_context": mp_context,
      }

      # These options only apply to map-style datasets.
      is_iterable = isinstance(dataset, torch.utils.data.IterableDataset)
      if not is_iterable:
          if sampler_fn is not None:
              loader_kwargs["sampler"] = sampler_fn(dataset)
          loader_kwargs["drop_last"] = self.args.dataloader_drop_last
          loader_kwargs["prefetch_factor"] = self.args.dataloader_prefetch_factor
          if is_training:
              loader_kwargs["worker_init_fn"] = partial(
                  seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index
              )

      prepared = self.accelerator.prepare(DataLoader(dataset, **loader_kwargs))

      # Store the prepared dataloader for subsequent evaluations if using persistent workers.
      if dataloader_key is not None and self.args.dataloader_persistent_workers:
          if not hasattr(self, "_eval_dataloaders"):
              self._eval_dataloaders = {}
          self._eval_dataloaders[dataloader_key] = prepared

      return prepared
  871. def _get_train_sampler(self, train_dataset: Dataset | None = None) -> torch.utils.data.Sampler | None:
  872. """Return the training sampler based on `train_sampling_strategy`."""
  873. if train_dataset is None:
  874. train_dataset = self.train_dataset
  875. if train_dataset is None or not has_length(train_dataset):
  876. return None
  877. # Build the sampler.
  878. if self.args.train_sampling_strategy == "group_by_length":
  879. if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
  880. lengths = (
  881. train_dataset[self.args.length_column_name]
  882. if self.args.length_column_name in train_dataset.column_names
  883. else None
  884. )
  885. else:
  886. lengths = None
  887. model_input_name = (
  888. self.processing_class.model_input_names[0] if self.processing_class is not None else None
  889. )
  890. return LengthGroupedSampler(
  891. self.args.train_batch_size * self.args.gradient_accumulation_steps,
  892. dataset=train_dataset,
  893. lengths=lengths,
  894. model_input_name=model_input_name,
  895. )
  896. elif self.args.train_sampling_strategy == "sequential":
  897. return SequentialSampler(train_dataset)
  898. else:
  899. return RandomSampler(train_dataset)
  900. def _get_eval_sampler(self, eval_dataset: Dataset) -> torch.utils.data.Sampler | None:
  901. """Return the evaluation sampler, using sequential ordering when not distributed."""
  902. if eval_dataset is None or not has_length(eval_dataset):
  903. return None
  904. if self.args.train_sampling_strategy == "group_by_length":
  905. if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
  906. lengths = (
  907. eval_dataset[self.args.length_column_name]
  908. if self.args.length_column_name in eval_dataset.column_names
  909. else None
  910. )
  911. else:
  912. lengths = None
  913. model_input_name = (
  914. self.processing_class.model_input_names[0] if self.processing_class is not None else None
  915. )
  916. return LengthGroupedSampler(
  917. self.args.eval_batch_size,
  918. dataset=eval_dataset,
  919. lengths=lengths,
  920. model_input_name=model_input_name,
  921. )
  922. if self.args.world_size <= 1:
  923. return SequentialSampler(eval_dataset)
  924. else:
  925. return None
  926. def _set_signature_columns_if_needed(self) -> None:
  927. """Populate `_signature_columns` from the model's forward signature if not already set."""
  928. if self._signature_columns is None:
  929. # Inspect model forward signature to keep only the arguments it accepts.
  930. model_to_inspect = self.model
  931. if _is_peft_model(self.model):
  932. if hasattr(self.model, "get_base_model"):
  933. model_to_inspect = self.model.get_base_model()
  934. else:
  935. # PeftMixedModel do not provide a `get_base_model` method
  936. model_to_inspect = self.model.base_model.model
  937. signature = inspect.signature(model_to_inspect.forward)
  938. self._signature_columns = list(signature.parameters.keys())
  939. # Labels may be named label or label_ids, the default data collator handles that.
  940. self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
  941. def _remove_unused_columns(
  942. self, dataset: "datasets.Dataset", description: str | None = None
  943. ) -> "datasets.Dataset":
  944. """Remove dataset columns not accepted by the model's forward method."""
  945. if not self.args.remove_unused_columns:
  946. return dataset
  947. self._set_signature_columns_if_needed()
  948. signature_columns = self._signature_columns
  949. ignored_columns = list(set(dataset.column_names) - set(signature_columns))
  950. if len(ignored_columns) > 0:
  951. dset_description = "" if description is None else f"in the {description} set"
  952. logger.info(
  953. f"The following columns {dset_description} don't have a corresponding argument in "
  954. f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
  955. f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, "
  956. " you can safely ignore this message."
  957. )
  958. columns = [k for k in signature_columns if k in dataset.column_names]
  959. if len(columns) == 0:
  960. raise ValueError(
  961. f"No columns in the dataset match the model's forward method signature: ({', '.join(signature_columns)}). "
  962. f"The following columns have been ignored: [{', '.join(ignored_columns)}]. "
  963. "Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`."
  964. )
  965. if version.parse(datasets.__version__) < version.parse("1.4.0"):
  966. dataset.set_format(
  967. type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"]
  968. )
  969. return dataset
  970. else:
  971. return dataset.remove_columns(ignored_columns)
  972. def _get_collator_with_removed_columns(self, data_collator: Callable, description: str | None = None) -> Callable:
  973. """Wrap the data collator in a callable removing unused columns."""
  974. if not self.args.remove_unused_columns:
  975. return data_collator
  976. self._set_signature_columns_if_needed()
  977. signature_columns = self._signature_columns
  978. remove_columns_collator = RemoveColumnsCollator(
  979. data_collator=data_collator,
  980. signature_columns=signature_columns,
  981. logger=logger,
  982. description=description,
  983. model_name=self.model.__class__.__name__,
  984. )
  985. return remove_columns_collator
  986. # ---- Optimizer & Scheduler & Learning rate ----
  def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
      """
      Set up the optimizer, then the learning rate scheduler.

      The defaults are reasonable for most use cases. To customize them, pass a tuple to the Trainer's init through
      `optimizers`, or subclass and override this method (or `create_optimizer` and/or `create_scheduler`).

      Args:
          num_training_steps (`int`):
              Number of training steps, forwarded to `create_scheduler`.
      """
      # The optimizer must exist before the scheduler is created.
      self.create_optimizer()
      self.create_scheduler(num_training_steps=num_training_steps)
  def create_optimizer(self, model=None) -> torch.optim.Optimizer:
      """
      Setup the optimizer.

      We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
      Trainer's init through `optimizers`, or subclass and override this method.

      When `self.optimizer` is already set it is reused; only the SageMaker model-parallel wrapping at the end is
      applied.

      Args:
          model (*optional*):
              Model whose parameters are optimized. Defaults to `self.model`.

      Returns:
          `torch.optim.Optimizer`: The optimizer instance.
      """
      opt_model = self.model if model is None else model

      if self.optimizer is None:
          # A set keeps the per-parameter membership test O(1) instead of scanning a list for every parameter.
          decay_parameters = set(self.get_decay_parameter_names(opt_model))
          decay_params = []
          no_decay_params = []
          for name, param in opt_model.named_parameters():
              if not param.requires_grad:
                  continue
              if name in decay_parameters:
                  decay_params.append(param)
              else:
                  no_decay_params.append(param)
          optimizer_grouped_parameters = [
              {"params": decay_params, "weight_decay": self.args.weight_decay},
              {"params": no_decay_params, "weight_decay": 0.0},
          ]

          if self.optimizer_cls_and_kwargs is not None:
              optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
          else:
              optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model)
          # Work on a shallow copy: the `pop` calls below must not strip entries from the dict stored in
          # `self.optimizer_cls_and_kwargs`, otherwise a later call would no longer see them.
          optimizer_kwargs = dict(optimizer_kwargs)

          # Check if this is a factory (for complex optimizers like Muon, Dion)
          # Factories are instantiated first, then called with (opt_model, **kwargs)
          if is_optimizer_factory(optimizer_cls):
              self.optimizer = optimizer_cls()(opt_model, **optimizer_kwargs)
          else:
              # Standard optimizer class instantiation
              # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`
              # e.g. for GaLore optimizer.
              if "params" in optimizer_kwargs:
                  optimizer_grouped_parameters = optimizer_kwargs.pop("params")

              # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`
              # e.g. for LOMO optimizer.
              if "model" in optimizer_kwargs:
                  optimizer_grouped_parameters = optimizer_kwargs.pop("model")

              # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
              # to avoid arguments conflicts.
              if "optimizer_dict" in optimizer_kwargs:
                  optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict")

              self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

          if "bitsandbytes" in str(optimizer_cls) and optimizer_kwargs.get("optim_bits", None) == 8:
              import bitsandbytes

              manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

              # Embedding weights are registered to be optimized with 32-bit state.
              skipped = 0
              for module in opt_model.modules():
                  if isinstance(module, nn.Embedding):
                      # Keyed by data pointer so parameters sharing storage are counted once.
                      skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                      logger.info(f"skipped {module}: {skipped / 2**20}M params")
                      manager.register_module_override(module, "weight", {"optim_bits": 32})
                      logger.debug(f"bitsandbytes: will optimize {module} in fp32")
              logger.info(f"skipped: {skipped / 2**20}M params")

      if is_sagemaker_mp_enabled():
          self.optimizer = smp.DistributedOptimizer(self.optimizer)

      return self.optimizer
  1058. def create_scheduler(
  1059. self, num_training_steps: int, optimizer: torch.optim.Optimizer | None = None
  1060. ) -> torch.optim.lr_scheduler.LRScheduler:
  1061. """
  1062. Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
  1063. passed as an argument.
  1064. Args:
  1065. num_training_steps (int): The number of training steps to do.
  1066. Returns:
  1067. `torch.optim.lr_scheduler.LRScheduler`: The learning rate scheduler instance.
  1068. """
  1069. if self.lr_scheduler is None:
  1070. if optimizer is None:
  1071. if is_sagemaker_mp_enabled() and smp.state.cfg.fp16:
  1072. # If fp16 is enabled, we unwrap the optimizer
  1073. optimizer = self.optimizer.optimizer
  1074. else:
  1075. optimizer = self.optimizer
  1076. self.lr_scheduler = get_scheduler(
  1077. self.args.lr_scheduler_type,
  1078. optimizer=optimizer,
  1079. num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
  1080. num_training_steps=num_training_steps,
  1081. scheduler_specific_kwargs=self.args.lr_scheduler_kwargs,
  1082. )
  1083. self._created_lr_scheduler = True
  1084. return self.lr_scheduler
  @staticmethod
  def get_optimizer_cls_and_kwargs(args: TrainingArguments, model: PreTrainedModel | None = None) -> tuple[Any, Any]:
      """
      Returns the optimizer class and optimizer parameters based on the training arguments.

      The handler registered for `args.optim` in `_OPTIMIZER_HANDLERS` is called with an `OptimizerContext`
      carrying the learning rate, the Adam betas/epsilon and the parsed `args.optim_args`.

      Args:
          args (`transformers.training_args.TrainingArguments`):
              The training arguments for the training session.
          model (`PreTrainedModel`, *optional*):
              The model being trained. Required for some optimizers (GaLore, Apollo, LOMO).

      Returns:
          A tuple containing the optimizer class and a dictionary of optimizer keyword arguments.

      Raises:
          ValueError: If no handler is registered for `args.optim`.
      """
      base_kwargs = {"lr": args.learning_rate}
      adam_kwargs = {
          "betas": (args.adam_beta1, args.adam_beta2),
          "eps": args.adam_epsilon,
      }
      context = OptimizerContext(
          args=args,
          model=model,
          optimizer_kwargs=base_kwargs,
          adam_kwargs=adam_kwargs,
          optim_args=_parse_optim_args(args.optim_args),
      )

      handler = _OPTIMIZER_HANDLERS.get(args.optim)
      if handler is not None:
          return handler(context)
      raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}")
  def get_decay_parameter_names(self, model: nn.Module) -> list[str]:
      """
      Get all parameter names that weight decay will be applied to.

      The selection is delegated to `get_parameter_names`, which is given two filters:
      1. A layer type list (`nn.LayerNorm`)
      2. Parameter name patterns (containing 'bias', or variation of 'norm')
      """
      excluded_layer_types = [nn.LayerNorm]
      forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
      return get_parameter_names(model, excluded_layer_types, forbidden_name_patterns)
  1121. def _get_learning_rate(self) -> float:
  1122. """
  1123. Returns the current learning rate from the scheduler.
  1124. Handles DeepSpeed's dynamic loss scaling warmup period where `get_last_lr` may fail.
  1125. """
  1126. if self.is_deepspeed_enabled:
  1127. # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
  1128. # not run for the first few dozen steps while loss scale is too large, and thus during
  1129. # that time `get_last_lr` will fail if called during that warm up stage, so work around it:
  1130. try:
  1131. last_lr = self.lr_scheduler.get_last_lr()[0]
  1132. except AssertionError as e:
  1133. if "need to call step" in str(e):
  1134. logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
  1135. last_lr = 0
  1136. else:
  1137. raise
  1138. else:
  1139. if isinstance(self.lr_scheduler, (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)):
  1140. last_lr = self.optimizer.param_groups[0]["lr"]
  1141. else:
  1142. last_lr = self.lr_scheduler.get_last_lr()[0]
  1143. if torch.is_tensor(last_lr):
  1144. last_lr = last_lr.item()
  1145. return last_lr
  1146. # ---- Training ----
  def train(
      self,
      resume_from_checkpoint: str | bool | None = None,
      trial: "optuna.Trial | dict[str, Any] | None" = None,
      ignore_keys_for_eval: list[str] | None = None,
  ) -> TrainOutput:
      """
      Main training entry point.

      Prepares the model (optional re-init through `model_init`, device placement, Liger kernels, gradient
      checkpointing, special-token alignment, NEFTune hooks), resolves the checkpoint to resume from and then
      delegates to `_inner_training_loop`, wrapped by `find_executable_batch_size`.

      Args:
          resume_from_checkpoint (`str` or `bool`, *optional*):
              If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
              `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
              of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
          trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
              The trial run or the hyperparameter dictionary for hyperparameter search.
          ignore_keys_for_eval (`list[str]`, *optional*)
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions for evaluation during the training.

      Returns:
          [`~trainer_utils.TrainOutput`]: Object containing the global step count, training loss, and metrics.

      Raises:
          ValueError: If `--debug underflow_overflow` is requested with `args.n_gpu > 1`, or if
              `resume_from_checkpoint=True` and no checkpoint is found in `args.output_dir`.
      """
      if resume_from_checkpoint is False:
          resume_from_checkpoint = None

      # memory metrics - must set up as early as possible
      self._memory_tracker.start()

      args = self.args

      self.is_in_train = True

      # Model re-init
      if self.model_init is not None:
          # Seed must be set before instantiating the model when using model_init.
          if args.full_determinism:
              enable_full_determinism(args.seed)
          else:
              set_seed(args.seed)
          self.model = self.call_model_init(trial)
          # Reinitializes optimizer and scheduler
          self.optimizer, self.lr_scheduler = None, None
          if self.place_model_on_device:
              self._move_model_to_device(self.model, args.device)
          self.model_wrapped = self.model

      if self.args.use_liger_kernel:
          apply_liger_kernel(self.model, self.args.liger_kernel_config)

      # When fp16/bf16 full eval is enabled, __init__ skips device placement so that
      # evaluation_loop can cast dtype and move in one step. Move the model now for training.
      if (args.fp16_full_eval or args.bf16_full_eval) and not self.is_model_parallel and self.model_init is None:
          self._move_model_to_device(self.model, args.device)

      # Activate gradient checkpointing if needed
      if args.gradient_checkpointing:
          self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs)

      # If the model uses a tokenizer, it may have new tokens for fine-tuning purposes.
      if isinstance(self.processing_class, (PreTrainedTokenizerBase, ProcessorMixin)) and hasattr(
          self.model, "config"
      ):
          align_special_tokens(self.model, self.processing_class)

      # Attach NEFTune hooks if necessary
      if self.neftune_noise_alpha is not None:
          self.neftune_hook_handle = activate_neftune(self.model, self.neftune_noise_alpha, self.accelerator)

      # This might change the seed so needs to run first.
      self._hp_search_setup(trial)

      if DebugOption.UNDERFLOW_OVERFLOW in args.debug:
          if args.n_gpu > 1:
              # nn.DataParallel(model) replicates the model, creating new variables and module
              # references registered here no longer work on other gpus, breaking the module
              raise ValueError(
                  "Currently --debug underflow_overflow is not supported under DP. Please use DDP with torchrun"
              )
          else:
              DebugUnderflowOverflow(self.model)

      # Load potential model checkpoint
      if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
          resume_from_checkpoint = get_last_checkpoint(args.output_dir)
          if resume_from_checkpoint is None:
              raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")

      if resume_from_checkpoint is not None:
          # Load model checkpoint before accelerator.prepare() for regular models,
          # so that buffers and parameters are on the right device after prepare.
          # Deepspeed/FSDP models are loaded after prepare in _prepare_for_training.
          if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
              self._load_from_checkpoint(resume_from_checkpoint)

          # Only restore the checkpoint's train_batch_size when using auto_find_batch_size. The saved state is
          # read only in that case and only when the file exists (the same `os.path.isfile` guard is used in
          # `_init_training_state`), so a checkpoint without a trainer state does not abort the run here.
          state_path = os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
          if args.auto_find_batch_size and os.path.isfile(state_path):
              state = TrainerState.load_from_json(state_path)
              if state.train_batch_size is not None:
                  self._train_batch_size = state.train_batch_size

      inner_training_loop = find_executable_batch_size(
          self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
      )

      # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
      ctx = suppress_progress_bars() if args.push_to_hub else contextlib.nullcontext()
      with ctx:
          return inner_training_loop(
              args=args,
              resume_from_checkpoint=resume_from_checkpoint,
              trial=trial,
              ignore_keys_for_eval=ignore_keys_for_eval,
          )
  def _inner_training_loop(
      self,
      batch_size: int | None = None,
      args: TrainingArguments | None = None,
      resume_from_checkpoint: str | None = None,
      trial: "optuna.Trial | dict[str, Any] | None" = None,
      ignore_keys_for_eval: list[str] | None = None,
  ) -> TrainOutput:
      """
      Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.

      Args:
          batch_size (`int`, *optional*):
              Batch size to try; only forwarded to `_update_auto_batch_size` when `args.auto_find_batch_size` is
              set.
          args (`TrainingArguments`, *optional*):
              Training arguments for this run. Defaults to `self.args`.
          resume_from_checkpoint (`str`, *optional*):
              Checkpoint folder to resume from.
          trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
              The trial run or the hyperparameter dictionary for hyperparameter search.
          ignore_keys_for_eval (`list[str]`, *optional*):
              Keys of the model output to ignore when gathering predictions for evaluation.

      Returns:
          The value returned by `_finalize_training`.
      """
      # The signature allows `args` to be omitted; fall back to the trainer's own arguments instead of failing
      # on the first attribute access.
      if args is None:
          args = self.args

      # reset everything
      self.accelerator.free_memory()
      if args.auto_find_batch_size:
          self._update_auto_batch_size(batch_size)

      # Data loader and number of training steps
      train_dataloader = self.get_train_dataloader()
      if self.is_fsdp_xla_v2_enabled:
          train_dataloader = tpu_spmd_dataloader(train_dataloader)

      # Setting up training control variables:
      (
          num_train_epochs,
          num_update_steps_per_epoch,
          num_examples,
          num_train_samples,
          total_train_batch_size,
          steps_in_epoch,
          max_steps,
      ) = self.set_initial_training_values(args, train_dataloader)

      epochs_trained, steps_trained_in_current_epoch = self._init_training_state(
          max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial
      )
      model, train_dataloader = self._prepare_for_training(max_steps, train_dataloader, resume_from_checkpoint)

      # Train!
      logger.info("***** Running training *****")
      logger.info(f" Num examples = {num_examples:,}")
      logger.info(f" Num Epochs = {num_train_epochs:,}")
      logger.info(f" Num update steps per epoch = {num_update_steps_per_epoch:,}")
      logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
      if self.args.per_device_train_batch_size != self._train_batch_size:
          logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
      logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
      logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
      logger.info(f" Total optimization steps = {max_steps:,}")
      logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")

      if resume_from_checkpoint is not None:
          logger.info(
              f" Resuming training from checkpoint with epoch {epochs_trained} and global step {self.state.global_step}"
          )
          if not self.args.ignore_data_skip:
              logger.info(
                  f" Fast-forwarding the dataloader past {epochs_trained} epochs and"
                  f" {steps_trained_in_current_epoch} batches to resume from the exact training state."
              )

      start_time = time.time()
      # needed to calculate tokens/s
      self._initial_num_input_tokens_seen = self.state.num_input_tokens_seen

      # Logging state: _tr_loss accumulates on-device between logging steps (avoiding costly .item() syncs
      # on TPUs), then gets drained into _total_loss_scalar at each logging step.
      self._tr_loss = torch.tensor(0.0, device=args.device)
      self._total_loss_scalar = 0.0
      self._globalstep_last_logged = self.state.global_step

      model.zero_grad()

      self.control = self.callback_handler.on_train_begin(args, self.state, self.control)

      if args.eval_on_start:
          self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True)

      # One `_run_epoch` call per remaining epoch; a callback can end training early through the control flags.
      for epoch in range(epochs_trained, num_train_epochs):
          self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
          self._run_epoch(
              model=model,
              epoch=epoch,
              train_dataloader=train_dataloader,
              steps_in_epoch=steps_in_epoch,
              num_update_steps_per_epoch=num_update_steps_per_epoch,
              trial=trial,
              ignore_keys_for_eval=ignore_keys_for_eval,
              start_time=start_time,
              resume_from_checkpoint=resume_from_checkpoint,
              epochs_trained=epochs_trained,
              steps_trained_in_current_epoch=steps_trained_in_current_epoch,
          )
          if self.control.should_training_stop:
              break

      return self._finalize_training(trial, num_train_samples, start_time)
  def _init_training_state(
      self, max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial
  ) -> tuple[int, int]:
      """
      Create `self.state` for this run, restoring it from the checkpoint's saved trainer state when that file exists.

      Returns:
          `(epochs_trained, steps_trained_in_current_epoch)`. Both are 0 unless a saved state is restored. The second
          value is the number of update steps already done in the current epoch multiplied by
          `args.gradient_accumulation_steps`, and stays 0 when `args.ignore_data_skip` is set.
      """
      exportable_callbacks = [
          cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
      ]
      self.state = TrainerState(stateful_callbacks=exportable_callbacks)
      self.state.is_hyper_param_search = trial is not None
      self.state.train_batch_size = self._train_batch_size
      self.state.compute_steps(self.args, max_steps)

      epochs_trained, steps_trained_in_current_epoch = 0, 0

      state_file = (
          None if resume_from_checkpoint is None else os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
      )
      if state_file is not None and os.path.isfile(state_file):
          # The saved state replaces the fresh one created above.
          self.state = TrainerState.load_from_json(state_file)
          compare_trainer_and_checkpoint_args(self.args, self.state)
          self._load_callback_state()

          global_step = self.state.global_step
          epochs_trained = int(global_step // num_update_steps_per_epoch)
          if not self.args.ignore_data_skip:
              update_steps_done = global_step % num_update_steps_per_epoch
              steps_trained_in_current_epoch = update_steps_done * self.args.gradient_accumulation_steps

      self.state.init_training_references(self, max_steps, num_train_epochs, trial)
      return epochs_trained, steps_trained_in_current_epoch
  def _prepare_for_training(self, max_steps, train_dataloader, resume_from_checkpoint):
      """
      Wrap model, create optimizer and scheduler, and run accelerator.prepare.

      Also loads the checkpoint for the DeepSpeed / SageMaker MP / FSDP cases, restores optimizer, scheduler and
      scaler state through `_load_optimizer_and_scheduler` / `_load_scaler`, and refreshes the references held by
      `self.callback_handler`.

      Args:
          max_steps: Total number of training steps, used to build the scheduler (and for `deepspeed_init`).
          train_dataloader: The training dataloader; replaced by the Ulysses adapter output when DeepSpeed sequence
              parallelism is enabled in the accelerator's `parallelism_config`.
          resume_from_checkpoint: Checkpoint folder to resume from, or `None`.

      Returns:
          `(model, train_dataloader)`: the wrapped/prepared model and the (possibly adapted) dataloader.
      """
      delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled

      # Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404
      is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2)
      if is_fsdp2:
          delay_optimizer_creation = False

      # We need to reset the scheduler, as its parameters may be different on subsequent calls
      if self._created_lr_scheduler:
          self.lr_scheduler = None
          self._created_lr_scheduler = False

      if self.is_deepspeed_enabled:
          self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)

      if not delay_optimizer_creation:
          self.create_optimizer()

      # Pass `self.model_wrapped` so that `_wrap_model` can detect if the model is already
      # wrapped (e.g. in DataParallel) on subsequent `train()` calls and avoid double wrapping.
      model = self._wrap_model(self.model_wrapped)

      # If the model is wrapped, don't use `accelerator.prepare`
      # this is for unhandled cases in accelerate such as FSDP-XLA, SageMaker MP/DP, DataParallel
      use_accelerator_prepare = model is self.model

      # prepare using `accelerator` prepare
      if use_accelerator_prepare:
          if delay_optimizer_creation:
              # TODO: check if we can move this somewhere else
              if self.is_fsdp_enabled and _is_peft_model(self.model):
                  update_fsdp_plugin_peft(self.model, self.accelerator)
              # we only prepare the model as we don't have an optimizer
              model = self.accelerator.prepare(self.model)
              # using the model we prepared to create the optimizer
              self.create_optimizer(model)
              self.optimizer = self.accelerator.prepare(self.optimizer)
          elif self.is_deepspeed_enabled and type(self.lr_scheduler).__name__ == "DummyScheduler":
              # A DeepSpeed `DummyScheduler` is prepared together with the model and optimizer.
              model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                  self.model, self.optimizer, self.lr_scheduler
              )
          else:
              model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
      else:
          # The model was wrapped by `_wrap_model`, so only the optimizer goes through `accelerator.prepare`.
          self.optimizer = self.accelerator.prepare(self.optimizer)

      # Create scheduler now that the optimizer won't change anymore
      # (no-op when `self.lr_scheduler` is already set, see `create_scheduler`).
      self.create_scheduler(num_training_steps=max_steps)

      # updating self.model_wrapped
      self.model_wrapped = model

      if self.is_fsdp_enabled or self.is_fsdp_xla_enabled:
          # breaking convention for FSDP model
          # TODO: check if this is really needed
          self.model = self.model_wrapped = model

      # backward compatibility
      # TODO: check if we really need this
      if self.is_deepspeed_enabled:
          self.deepspeed = self.model_wrapped

      # Important: at this point:
      # self.model is the Transformers Model except when we are using FSDP
      # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model),
      # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc.

      if self.is_fsdp_enabled:
          # Fix `got mixed torch.Tensor and DTensor` error in model.generate() for FSDP2 with LoRA
          if hasattr(self.model, "generate"):
              dist.fsdp.register_fsdp_forward_method(self.model, "generate")

      # since DataLoader was Accelerate prepared w/o a model arg in the same call, we now have to complete the DL wrapping for ALST/UlyssesSP, after model has been prepared
      pc = getattr(self.accelerator, "parallelism_config", None)
      if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled:
          train_dataloader = self.accelerator.deepspeed_ulysses_dl_adapter(train_dataloader, model)

      # load checkpoint
      # (regular models were already loaded in `train()` before `accelerator.prepare`)
      if resume_from_checkpoint is not None:
          if self.is_deepspeed_enabled:
              deepspeed_load_checkpoint(
                  self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model)
              )
          elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
              self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)

      # Called unconditionally: `resume_from_checkpoint` may be `None` here.
      self._load_optimizer_and_scheduler(resume_from_checkpoint)
      self._load_scaler(resume_from_checkpoint)

      # Update the references for the callback_handler
      for attr in ("model", "optimizer", "lr_scheduler"):
          setattr(self.callback_handler, attr, getattr(self, attr))
      self.callback_handler.train_dataloader = train_dataloader

      return model, train_dataloader
  1426. def _run_epoch(
  1427. self,
  1428. model,
  1429. epoch,
  1430. train_dataloader,
  1431. steps_in_epoch,
  1432. num_update_steps_per_epoch,
  1433. trial,
  1434. ignore_keys_for_eval,
  1435. start_time,
  1436. resume_from_checkpoint,
  1437. epochs_trained,
  1438. steps_trained_in_current_epoch,
  1439. ):
  1440. """Run one full pass over the dataloader."""
  1441. step = -1
  1442. grad_norm = None
  1443. learning_rate = None
  1444. rng_to_sync = False
  1445. # Handle resumption from checkpoint: skip already-trained batches in the resumed epoch
  1446. num_update_steps_trained = 0
  1447. if epoch == epochs_trained and resume_from_checkpoint is not None:
  1448. if steps_trained_in_current_epoch > 0 and not self.args.ignore_data_skip:
  1449. train_dataloader = skip_first_batches(train_dataloader, steps_trained_in_current_epoch)
  1450. step = steps_trained_in_current_epoch - 1
  1451. num_update_steps_trained = steps_trained_in_current_epoch // self.args.gradient_accumulation_steps
  1452. rng_to_sync = True
  1453. elif steps_trained_in_current_epoch == 0:
  1454. self._load_rng_state(resume_from_checkpoint)
  1455. if hasattr(train_dataloader, "set_epoch"):
  1456. train_dataloader.set_epoch(epoch)
  1457. epoch_iterator = iter(train_dataloader)
  1458. # We chunkify the epoch iterator into gradient accumulation steps `n` batches
  1459. remainder = steps_in_epoch % self.args.gradient_accumulation_steps
  1460. if remainder == 0:
  1461. remainder = self.args.gradient_accumulation_steps
  1462. # Outer loop: one iteration per optimizer step. Each iteration prefetches
  1463. # `gradient_accumulation_steps` batches (fewer for the last step if the epoch
  1464. # doesn't divide evenly).
  1465. for update_step in range(num_update_steps_trained, num_update_steps_per_epoch):
  1466. num_batches = (
  1467. self.args.gradient_accumulation_steps if update_step != (num_update_steps_per_epoch - 1) else remainder
  1468. )
  1469. batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, self.args.device)
  1470. # This is used to correctly scale the loss when the last accumulation step has fewer batches.
  1471. # Not used if `num_items_in_batch` is not None.
  1472. self.current_gradient_accumulation_steps = len(batch_samples)
  1473. # need to sync after if we skipped the batches in `get_batch_samples` for shuffle order reason
  1474. if rng_to_sync:
  1475. self._load_rng_state(resume_from_checkpoint)
  1476. rng_to_sync = False
  1477. # Inner loop: forward + backward for each micro-batch. Gradients are
  1478. # accumulated without syncing until the last micro-batch, then we clip,
  1479. # step the optimizer, and log/save/evaluate.
  1480. for i, inputs in enumerate(batch_samples):
  1481. step += 1
  1482. do_sync_step = (step + 1) % self.args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
  1483. # Since we perform prefetching, we need to manually set sync_gradients
  1484. self.accelerator.gradient_state._set_sync_gradients(do_sync_step)
  1485. if step % self.args.gradient_accumulation_steps == 0:
  1486. self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control)
  1487. # We sync the gradients in the following cases: 1. sync_each_batch set to True 2. Using deepspeed 3. when we are at the last batch sample
  1488. if (
  1489. self.accelerator.gradient_state.plugin_kwargs.get("sync_each_batch", False)
  1490. or self.accelerator.distributed_type == DistributedType.DEEPSPEED
  1491. or i == len(batch_samples) - 1
  1492. ):
  1493. sync_context = contextlib.nullcontext
  1494. else:
  1495. sync_context = functools.partial(self.accelerator.no_sync, model=model)
  1496. with sync_context():
  1497. tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  1498. if (
  1499. self.args.logging_nan_inf_filter
  1500. and not is_torch_xla_available()
  1501. and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
  1502. ):
  1503. # if loss is nan or inf simply add the average of previous logged losses
  1504. self._tr_loss += self._tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
  1505. else:
  1506. if self._tr_loss.device != tr_loss_step.device:
  1507. raise ValueError(
  1508. f"Calculated loss must be on the original device: {self._tr_loss.device} but device in use is {tr_loss_step.device}"
  1509. )
  1510. self._tr_loss += tr_loss_step
  1511. self.current_flos += float(self.floating_point_ops(inputs))
  1512. self._track_num_input_tokens(inputs)
  1513. if do_sync_step:
  1514. grad_norm = None
  1515. if self.args.max_grad_norm > 0:
  1516. grad_norm = self._clip_grad_norm(model)
  1517. grad_norm = self._get_grad_norm(model, grad_norm=grad_norm)
  1518. self.control = self.callback_handler.on_pre_optimizer_step(self.args, self.state, self.control)
  1519. self.optimizer.step()
  1520. self.control = self.callback_handler.on_optimizer_step(self.args, self.state, self.control)
  1521. # get leaning rate before update
  1522. learning_rate = self._get_learning_rate()
  1523. if not self.accelerator.optimizer_step_was_skipped:
  1524. # Delay optimizer scheduling until metrics are generated
  1525. if not isinstance(self.lr_scheduler, (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)):
  1526. self.lr_scheduler.step()
  1527. model.zero_grad()
  1528. self.state.global_step += 1
  1529. self.state.epoch = epoch + (step + 1) / steps_in_epoch
  1530. self.control = self.callback_handler.on_step_end(self.args, self.state, self.control)
  1531. self._maybe_log_save_evaluate(
  1532. self._tr_loss,
  1533. grad_norm,
  1534. model,
  1535. trial,
  1536. epoch,
  1537. ignore_keys_for_eval,
  1538. start_time,
  1539. learning_rate=learning_rate,
  1540. )
  1541. else:
  1542. self.control = self.callback_handler.on_substep_end(self.args, self.state, self.control)
  1543. if self.control.should_epoch_stop or self.control.should_training_stop:
  1544. break
  1545. if self.control.should_epoch_stop or self.control.should_training_stop:
  1546. break
  1547. # PyTorch/XLA relies on the dataloader to insert mark_step each iteration.
  1548. # When we break out of the loop early, we flush the pending graph manually.
  1549. if is_torch_xla_available():
  1550. xm.mark_step()
  1551. if step < 0:
  1552. logger.warning(
  1553. "There seems not to be a single sample in your epoch_iterator, stopping training at step"
  1554. f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
  1555. f" num_steps ({self.state.max_steps}) higher than the number of available samples."
  1556. )
  1557. self.control.should_training_stop = True
  1558. self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control)
  1559. self._maybe_log_save_evaluate(
  1560. self._tr_loss,
  1561. grad_norm,
  1562. model,
  1563. trial,
  1564. epoch,
  1565. ignore_keys_for_eval,
  1566. start_time,
  1567. learning_rate=learning_rate,
  1568. )
  def _finalize_training(self, trial, num_train_samples, start_time):
      """
      Wrap up a training run and build its `TrainOutput`.

      Adds the not-yet-logged loss to the running total, logs the final speed / FLOs / loss metrics, optionally
      reloads the best checkpoint, removes non-best checkpoints when `save_total_limit == 1`, fires `on_train_end`,
      waits for the current push to finish and deactivates NEFTune if it was enabled.

      Args:
          trial: Forwarded to `self._get_output_dir` to locate the checkpoint directory.
          num_train_samples: Forwarded to `speed_metrics` as `num_samples`.
          start_time: Forwarded to `speed_metrics`.

      Returns:
          `TrainOutput` built from the final global step, the average training loss and the metrics dict.
      """
      logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")

      # Add the loss accumulated since the last logging call.
      self._total_loss_scalar += self._tr_loss.item()
      # The divisor is clamped so a run with zero completed steps does not raise ZeroDivisionError.
      train_loss = self._total_loss_scalar / max(self.state.global_step, 0.001)

      final_metrics = speed_metrics(
          "train",
          start_time,
          num_samples=num_train_samples,
          num_steps=self.state.max_steps,
      )
      # `store_flos` must run before `total_flos` is read from the state.
      self.store_flos()
      final_metrics["total_flos"] = self.state.total_flos
      final_metrics["train_loss"] = train_loss
      self._memory_tracker.stop_and_update_metrics(final_metrics)
      self.log(final_metrics)

      if self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
          self._load_best_model()

      best_checkpoint = self.state.best_model_checkpoint
      all_checkpoints = sort_checkpoints(
          output_dir=self._get_output_dir(trial), best_model_checkpoint=best_checkpoint
      )

      # With save_total_limit=1, keep only the best checkpoint (and only on a process that is allowed to save).
      prune_to_best = self.args.should_save and best_checkpoint is not None and self.args.save_total_limit == 1
      if prune_to_best:
          for candidate in all_checkpoints:
              if os.path.samefile(candidate, best_checkpoint):
                  continue
              logger.info(f"Deleting older checkpoint [{candidate}] due to args.save_total_limit")
              shutil.rmtree(candidate, ignore_errors=True)

      self.control = self.callback_handler.on_train_end(self.args, self.state, self.control)

      # Wait for the checkpoint to be uploaded.
      self._finish_current_push()

      # Restore the embedding layer's original forward pass by removing the NEFTune forward post hook.
      if self.neftune_noise_alpha is not None:
          deactivate_neftune(self.model, self.neftune_hook_handle, self.accelerator)

      self.is_in_train = False
      return TrainOutput(self.state.global_step, train_loss, final_metrics)
  def training_step(
      self,
      model: nn.Module,
      inputs: dict[str, torch.Tensor | Any],
      num_items_in_batch: torch.Tensor | int | None = None,
  ) -> torch.Tensor:
      """
      Run the forward and backward pass for one batch and return its detached loss.

      Subclass and override to inject custom behavior.

      Args:
          model (`nn.Module`):
              The model to train.
          inputs (`dict[str, torch.Tensor | Any]`):
              The inputs and targets of the model.

              The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
              argument `labels`. Check your model's documentation for all accepted arguments.
          num_items_in_batch (`torch.Tensor` or `int`, *optional*):
              Forwarded to `compute_loss`. If it is `None` or the model does not accept loss kwargs, and no
              `compute_loss_func` is set, the loss is divided by `self.current_gradient_accumulation_steps`.

      Return:
          `torch.Tensor`: The tensor with training loss on this batch.
      """
      # Prepare buffers for context parallelism; the returned context is a no-op when CP isn't enabled.
      enter_cp, inputs = self._prepare_context_parallel_inputs(model, inputs)
      with enter_cp():
          model.train()
          # Some optimizers expose their own `train()` switch; call it when present.
          if callable(getattr(self.optimizer, "train", None)):
              self.optimizer.train()

          inputs = self._prepare_inputs(inputs)

          if is_sagemaker_mp_enabled():
              smp_loss = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
              return smp_loss.reduce_mean().detach().to(self.args.device)

          with self.compute_loss_context_manager():
              loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)

          # Drop the local reference to the batch before the backward pass.
          del inputs

          empty_cache_every = self.args.torch_empty_cache_steps
          if empty_cache_every is not None and self.state.global_step % empty_cache_every == 0:
              clear_device_cache()

          backward_kwargs = {}

          # For LOMO optimizers you need to explicitly use the learning rate
          if self.args.optim in (OptimizerNames.LOMO, OptimizerNames.ADALOMO):
              backward_kwargs["learning_rate"] = self._get_learning_rate()

          if self.args.n_gpu > 1:
              # mean() to average on multi-gpu parallel training
              loss = loss.mean()

          # The loss still needs dividing by the accumulation steps unless the model consumed
          # `num_items_in_batch` itself or a custom loss function is in charge.
          handled_by_model = self.model_accepts_loss_kwargs and num_items_in_batch is not None
          if not handled_by_model and self.compute_loss_func is None:
              loss = loss / self.current_gradient_accumulation_steps

          # Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
          # https://github.com/huggingface/transformers/pull/35808
          if self.accelerator.distributed_type == DistributedType.DEEPSPEED:
              backward_kwargs["scale_wrt_gas"] = False

          self.accelerator.backward(loss, **backward_kwargs)

          return loss.detach()
  1661. def compute_loss(
  1662. self,
  1663. model: nn.Module,
  1664. inputs: dict[str, torch.Tensor | Any],
  1665. return_outputs: bool = False,
  1666. num_items_in_batch: torch.Tensor | int | None = None,
  1667. ) -> torch.Tensor | tuple[torch.Tensor, Any]:
  1668. """
  1669. How the loss is computed by Trainer. By default, all models return the loss in the first element.
  1670. Args:
  1671. model (`nn.Module`):
  1672. The model to compute the loss for.
  1673. inputs (`dict[str, torch.Tensor | Any]`):
  1674. The input data for the model.
  1675. return_outputs (`bool`, *optional*, defaults to `False`):
  1676. Whether to return the model outputs along with the loss.
  1677. num_items_in_batch (Optional[torch.Tensor], *optional*):
  1678. The number of items in the batch. If not passed, the loss is computed
  1679. using the default batch size reduction logic.
  1680. Returns:
  1681. The loss of the model along with its output if return_outputs was set to True
  1682. Subclass and override for custom behavior. If you are not using `num_items_in_batch` when computing your loss,
  1683. make sure to overwrite `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculation might be slightly inaccurate when performing gradient accumulation.
  1684. """
  1685. pc = getattr(self.accelerator, "parallelism_config", None)
  1686. if pc is not None and pc.sp_backend == "deepspeed" and pc.sp_enabled and self.model.training:
  1687. return deepspeed_sp_compute_loss(self.accelerator, model, inputs, return_outputs, pc)
  1688. if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
  1689. labels = inputs.pop("labels")
  1690. else:
  1691. labels = None
  1692. if self.model_accepts_loss_kwargs:
  1693. kwargs = {}
  1694. if num_items_in_batch is not None:
  1695. kwargs["num_items_in_batch"] = num_items_in_batch
  1696. inputs = {**inputs, **kwargs}
  1697. outputs = model(**inputs)
  1698. # User-defined compute_loss function
  1699. if self.compute_loss_func is not None:
  1700. if labels is None:
  1701. logger.warning(
  1702. "Trainer: `compute_loss_func` is defined but `labels=None`. "
  1703. "Your custom loss function will still be called with labels=None. "
  1704. )
  1705. loss = self.compute_loss_func(
  1706. outputs,
  1707. labels,
  1708. num_items_in_batch=num_items_in_batch,
  1709. )
  1710. # Default HF loss handling (label smoothing) if no custom loss function
  1711. elif labels is not None:
  1712. unwrapped_model = self.accelerator.unwrap_model(model)
  1713. model_name = (
  1714. unwrapped_model.base_model.model._get_name()
  1715. if _is_peft_model(unwrapped_model)
  1716. else unwrapped_model._get_name()
  1717. )
  1718. if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
  1719. loss = self.label_smoother(outputs, labels, shift_labels=True)
  1720. else:
  1721. loss = self.label_smoother(outputs, labels)
  1722. else:
  1723. if isinstance(outputs, dict) and "loss" not in outputs:
  1724. raise ValueError(
  1725. "The model did not return a loss from the inputs, only the following keys: "
  1726. f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
  1727. )
  1728. # We don't use .loss here since the model may return tuples instead of ModelOutput.
  1729. loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
  1730. if (
  1731. self.args.average_tokens_across_devices
  1732. and (self.model_accepts_loss_kwargs or self.compute_loss_func)
  1733. and num_items_in_batch is not None
  1734. ):
  1735. loss *= self.accelerator.num_processes if self.args.n_gpu <= 1 else self.args.n_gpu
  1736. return (loss, outputs) if return_outputs else loss
  def compute_loss_context_manager(self) -> contextlib.ExitStack:
      """
      Bundle the context managers that should surround loss computation into one `ExitStack`.

      The autocast context from `autocast_smart_context_manager` is entered on the stack unless it is a
      `contextlib.nullcontext`, in which case the returned stack is empty.
      """
      autocast_ctx = self.autocast_smart_context_manager()
      stack = contextlib.ExitStack()
      if isinstance(autocast_ctx, contextlib.nullcontext):
          return stack
      stack.enter_context(autocast_ctx)
      return stack
  1746. def autocast_smart_context_manager(self, cache_enabled: bool | None = True) -> contextlib.AbstractContextManager:
  1747. """
  1748. A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
  1749. arguments, depending on the situation. We rely on accelerate for autocast, hence we do nothing here.
  1750. """
  1751. return contextlib.nullcontext()
  def _maybe_log_save_evaluate(
      self,
      tr_loss: torch.Tensor,
      grad_norm: torch.Tensor | float | None,
      model: nn.Module,
      trial: "optuna.Trial | dict[str, Any] | None",
      epoch: float,
      ignore_keys_for_eval: list[str] | None,
      start_time: float,
      learning_rate: float | None = None,
  ) -> None:
      """
      Log metrics, run evaluation, and save a checkpoint, each only if `self.control` asks for it.

      Args:
          tr_loss (`torch.Tensor`):
              Loss accumulated since the last log. It is reset in place after being gathered.
          grad_norm (`torch.Tensor` or `float`, *optional*):
              Gradient norm to report; skipped when `None`.
          model (`nn.Module`):
              Forwarded to `self._save_checkpoint`.
          trial:
              Forwarded to evaluation, best-metric tracking and checkpoint saving.
          epoch (`float`):
              Not read by this method.
          ignore_keys_for_eval (`list[str]`, *optional*):
              Forwarded to `self._evaluate`.
          start_time (`float`):
              Forwarded to `self.log`.
          learning_rate (`float`, *optional*):
              Learning rate to report; `self._get_learning_rate()` is used when `None`.
      """
      # Only log when asked to and when at least one optimizer step happened since the last log.
      if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
          if is_torch_xla_available():
              xm.mark_step()

          steps_since_last_log = self.state.global_step - self._globalstep_last_logged

          # all_gather + mean() to get average loss over all processes
          gathered_loss = nested_gather(tr_loss, self.args.parallel_mode).mean().item()

          # Reset the accumulator in place so the caller's tensor is cleared too.
          tr_loss -= tr_loss

          logs: dict[str, float] = {"loss": gathered_loss / steps_since_last_log}
          if grad_norm is not None:
              logs["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm
          logs["learning_rate"] = learning_rate if learning_rate is not None else self._get_learning_rate()

          self._total_loss_scalar += gathered_loss
          self._globalstep_last_logged = self.state.global_step
          self.store_flos()

          self.log(logs, start_time)

      if self.control.should_evaluate:
          eval_metrics = self._evaluate(trial, ignore_keys_for_eval)
          improved = self._determine_best_metric(metrics=eval_metrics, trial=trial)

          # With the "best" strategy, a checkpoint is written only when the tracked metric improved.
          if self.args.save_strategy == SaveStrategy.BEST:
              self.control.should_save = improved

      if self.control.should_save:
          self._save_checkpoint(model, trial)
          self.control = self.callback_handler.on_save(self.args, self.state, self.control)
  1792. # ---- Training Utilities ----
  1793. def get_batch_samples(
  1794. self, epoch_iterator: Iterator, num_batches: int, device: torch.device
  1795. ) -> tuple[list, torch.Tensor | int | None]:
  1796. """
  1797. Collects a specified number of batches from the epoch iterator and optionally counts the number of items in the batches to properly scale the loss.
  1798. """
  1799. batch_samples = []
  1800. for _ in range(num_batches):
  1801. try:
  1802. batch_samples.append(next(epoch_iterator))
  1803. except StopIteration:
  1804. break
  1805. num_items_in_batch = self._get_num_items_in_batch(batch_samples, device)
  1806. return batch_samples, num_items_in_batch
  1807. def _get_num_items_in_batch(self, batch_samples: list, device: torch.device) -> torch.Tensor | int | None:
  1808. """
  1809. Counts the number of items in the batches to properly scale the loss.
  1810. Args:
  1811. batch_samples (`list`): List of batches
  1812. device (`torch.device`): The device on which the number of items in the batch should be.
  1813. Returns:
  1814. None if the number of items in the batch doesn't need to be computed else the number of items in the batch
  1815. """
  1816. num_items_in_batch = None
  1817. count_num_items_in_batch = (
  1818. len(batch_samples) > 0
  1819. and "labels" in batch_samples[0]
  1820. and (
  1821. # num_items_in_batch is passed to model forward
  1822. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3757
  1823. self.model_accepts_loss_kwargs
  1824. # num_items_in_batch is passed to compute_loss_func
  1825. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3773
  1826. or self.compute_loss_func is not None
  1827. # num_items_in_batch is also verified if (self.model_accepts_loss_kwargs or self.compute_loss_func)
  1828. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3790
  1829. )
  1830. )
  1831. if count_num_items_in_batch:
  1832. # For now we don't support object detection
  1833. try:
  1834. num_items_in_batch = sum((batch["labels"].ne(-100)).sum() for batch in batch_samples)
  1835. except (TypeError, AttributeError):
  1836. pass
  1837. if num_items_in_batch is not None:
  1838. if self.args.average_tokens_across_devices:
  1839. if self.args.world_size > 1:
  1840. num_items_in_batch = self.accelerator.gather(num_items_in_batch.to(device)).sum()
  1841. elif self.args.n_gpu > 1:
  1842. # In DP case, if we don't average, we need to divide by the number of gpu. This is the simplest approximation.
  1843. # Otherwise, we would have to scatter labels and calculate num_items_in_batch for each gpu.
  1844. num_items_in_batch = num_items_in_batch // self.args.n_gpu
  1845. if torch.is_tensor(num_items_in_batch):
  1846. num_items_in_batch = num_items_in_batch.to(device)
  1847. if self.args.n_gpu > 1 and num_items_in_batch.dim() == 0:
  1848. # In the DataParallel case, convert the scalar tensor into a 2-dim tensor with the same value repeated
  1849. num_items_in_batch = num_items_in_batch.unsqueeze(0).expand(self.args.n_gpu, -1)
  1850. # Divide by number of devices with the same batch
  1851. if pc := getattr(self.accelerator, "parallelism_config", None):
  1852. num_items_in_batch = num_items_in_batch // pc.non_data_parallel_size
  1853. return num_items_in_batch
  1854. def _prepare_input(self, data: torch.Tensor | Any) -> torch.Tensor | Any:
  1855. """
  1856. Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
  1857. """
  1858. if isinstance(data, Mapping):
  1859. return type(data)({k: self._prepare_input(v) for k, v in data.items()})
  1860. elif isinstance(data, (tuple, list)):
  1861. return type(data)(self._prepare_input(v) for v in data)
  1862. elif isinstance(data, torch.Tensor):
  1863. kwargs = {"device": self.args.device}
  1864. if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)):
  1865. # NLP models inputs are int/uint and those get adjusted to the right dtype of the
  1866. # embedding. Other models such as wav2vec2's inputs are already float and thus
  1867. # may need special handling to match the dtypes of the model
  1868. kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
  1869. return data.to(**kwargs)
  1870. return data
  1871. def _prepare_inputs(self, inputs: dict[str, torch.Tensor | Any]) -> dict[str, torch.Tensor | Any]:
  1872. """
  1873. Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
  1874. handling potential state.
  1875. """
  1876. inputs = self._prepare_input(inputs)
  1877. if len(inputs) == 0:
  1878. raise ValueError(
  1879. "The batch received was empty, your model won't be able to train on it. Double-check that your "
  1880. f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
  1881. )
  1882. return inputs
  1883. def _prepare_context_parallel_inputs(
  1884. self, model: nn.Module, inputs: dict[str, torch.Tensor | Any]
  1885. ) -> tuple[Callable, dict[str, torch.Tensor | Any]]:
  1886. """
  1887. Prepare inputs for context parallelism by setting up buffers and validation.
  1888. Args:
  1889. model: The model being trained
  1890. inputs: Input tensors to prepare
  1891. Returns:
  1892. tuple: (context_manager, prepared_inputs) where context_manager is either
  1893. the context parallelism wrapper or a no-op context
  1894. """
  1895. if (
  1896. getattr(self.accelerator, "parallelism_config", None) is not None
  1897. and self.accelerator.parallelism_config.cp_enabled
  1898. ):
  1899. if self.accelerator.parallelism_config.cp_backend == "torch":
  1900. if hasattr(model, "config"):
  1901. if model.config._attn_implementation != "sdpa":
  1902. raise ValueError(
  1903. f"Context parallelism is supported only with SDPA attention, you are using {model.config._attn_implementation}."
  1904. )
  1905. if "shift_labels" not in inputs:
  1906. logger.warning_once("Shift labels not found in the inputs, shifting manually")
  1907. if "labels" in inputs:
  1908. _ignore_index = -100
  1909. labels = nn.functional.pad(inputs["labels"], (0, 1), value=_ignore_index)
  1910. inputs["shift_labels"] = labels[:, 1:].contiguous()
  1911. # note: we don't do anything for accelerator.parallelism_config.sp_backend == "deepspeed" since:
  1912. # - accelerator.parallelism_config performs the `model.config._attn_implementation` checks already and it supports more than `dspa`
  1913. # - UlyssesSPDataLoaderAdapter called from Accelerate performs the `shift_label` creation - must not interfere
  1914. # - position_ids generation should be done by HF Trainer if it wasn't done by the user
  1915. if "position_ids" not in inputs:
  1916. logger.warning_once("Position IDs not found in the inputs, generating manually")
  1917. inputs["position_ids"] = torch.arange(
  1918. inputs["input_ids"].size(1), device=inputs["input_ids"].device
  1919. ).expand(inputs["input_ids"].size(0), -1)
  1920. buffers = []
  1921. buffer_seq_dims = []
  1922. if "input_ids" in inputs:
  1923. buffers.append(inputs["input_ids"])
  1924. buffer_seq_dims.append(1) # Sequence dimension
  1925. if "labels" in inputs:
  1926. buffers.append(inputs["labels"])
  1927. buffer_seq_dims.append(1)
  1928. if "shift_labels" in inputs:
  1929. buffers.append(inputs["shift_labels"])
  1930. buffer_seq_dims.append(1)
  1931. # Add attention_mask to buffers for context parallel splitting (only if causal)
  1932. if "attention_mask" in inputs:
  1933. # Only validate causal mask once for performance
  1934. if not getattr(self, "_attn_mask_causal_checked", False):
  1935. # Context parallel currently doesn't support other masks than causal
  1936. # Accelerate applies hooks to replace mask with is_causal arg in SDPA
  1937. # Check if the mask is really causal and if not throw an error
  1938. attention_mask = inputs["attention_mask"]
  1939. if not is_attention_mask_causal(attention_mask):
  1940. raise ValueError(
  1941. "Context parallelism only supports causal attention masks. "
  1942. "The provided attention_mask is not causal. "
  1943. "Please ensure your data uses causal masking (lower triangular) "
  1944. "or remove the attention_mask to use the model's default causal masking."
  1945. )
  1946. self._attn_mask_causal_checked = True
  1947. if self._attn_mask_causal_checked:
  1948. # Add to buffers only after validation (or if validation already passed)
  1949. attention_mask = inputs["attention_mask"]
  1950. if attention_mask.dim() == 2:
  1951. buffers.append(attention_mask)
  1952. buffer_seq_dims.append(1)
  1953. else:
  1954. # Other dimensionality; keep as-is without sharding to avoid incorrect splits
  1955. pass
  1956. # Include position_ids in context parallelism splitting
  1957. if "position_ids" in inputs and inputs["position_ids"] is not None:
  1958. buffers.append(inputs["position_ids"])
  1959. buffer_seq_dims.append(1)
  1960. return partial(
  1961. self.accelerator.maybe_context_parallel,
  1962. buffers=buffers,
  1963. buffer_seq_dims=buffer_seq_dims,
  1964. no_restore_buffers=set(buffers),
  1965. ), inputs
  1966. return contextlib.nullcontext, inputs
  def set_initial_training_values(
      self, args: TrainingArguments, dataloader: DataLoader
  ) -> tuple[int, int, int, int, int, int | None, int]:
      """
      Calculates and returns the following values:
      - `num_train_epochs`
      - `num_update_steps_per_epoch`
      - `num_examples`
      - `num_train_samples`
      - `total_train_batch_size`
      - `steps_in_epoch` (total batches per epoch)
      - `max_steps`

      Args:
          args (`TrainingArguments`):
              Supplies `max_steps`, `num_train_epochs` and `gradient_accumulation_steps`.
          dataloader (`DataLoader`):
              The training dataloader; its length is used when `has_length` reports one.

      Raises:
          ValueError: If the dataloader has no usable length and `args.max_steps` is not positive.
      """
      # Case 1: we rely on `args.max_steps` first
      max_steps = args.max_steps
      # If max_steps is negative, we use the number of epochs to determine the number of total steps later
      epoch_based = max_steps < 0
      len_dataloader = len(dataloader) if has_length(dataloader) else None
      total_train_batch_size = self.get_total_train_batch_size(args)

      # Account for Sequence Parallelism (SP) dataloader adapter's effect
      sp_size = self.get_sp_size()
      if sp_size > 1 and len_dataloader is not None:
          len_dataloader = len_dataloader * sp_size

      # Case 2: We have a dataloader length and can extrapolate
      if len_dataloader is not None:
          # Ceiling division, with a floor of one update step per epoch.
          num_update_steps_per_epoch = max(
              len_dataloader // args.gradient_accumulation_steps
              + int(len_dataloader % args.gradient_accumulation_steps > 0),
              1,
          )
          # Case 3: We have a length but are using epochs, we can extrapolate the number of steps
          if epoch_based:
              max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)

      # Now we figure out `num_examples`, `num_train_epochs`, and `train_samples`
      if len_dataloader:
          num_examples = self.num_examples(dataloader)
          if args.max_steps > 0:
              num_train_epochs = max_steps // num_update_steps_per_epoch + int(
                  max_steps % num_update_steps_per_epoch > 0
              )
              # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
              # the best we can do.
              num_train_samples = max_steps * total_train_batch_size
          else:
              num_train_epochs = math.ceil(args.num_train_epochs)
              # Reuse the count computed above instead of querying the dataloader a second time.
              num_train_samples = num_examples * args.num_train_epochs
      elif args.max_steps > 0:  # Rely on max_steps when dataloader does not have a working size
          # Setting a very large number of epochs so we go as many times as necessary over the iterator.
          num_train_epochs = sys.maxsize
          num_update_steps_per_epoch = max_steps
          num_examples = total_train_batch_size * args.max_steps
          num_train_samples = args.max_steps * total_train_batch_size
      else:
          raise ValueError(
              "args.max_steps must be set to a positive value if dataloader does not have a length, was"
              f" {args.max_steps}"
          )

      steps_in_epoch = len_dataloader if len_dataloader is not None else max_steps * args.gradient_accumulation_steps
      return (
          num_train_epochs,
          num_update_steps_per_epoch,
          num_examples,
          num_train_samples,
          total_train_batch_size,
          steps_in_epoch,
          max_steps,
      )
  def get_total_train_batch_size(self, args: TrainingArguments) -> int:
      """Calculates total batch size (micro_batch * grad_accum * dp_world_size).

      The data-parallel world size is what remains of `args.world_size` after integer-dividing by the
      tensor-, context- and sequence-parallel sizes in turn:

          dp_world_size = world_size // tp_size // cp_size // sp_size

      Where:
      - TP (Tensor Parallelism): Model layers split across GPUs
      - CP (Context Parallelism): Sequences split using Ring Attention (FSDP2)
      - SP (Sequence Parallelism): Sequences split using ALST/Ulysses (DeepSpeed)

      All dimensions are separate and multiplicative: world_size = dp_size * tp_size * cp_size * sp_size
      """
      dp_world_size = args.world_size // self.get_tp_size()
      dp_world_size //= self.get_cp_size()
      dp_world_size //= self.get_sp_size()
      per_step_batch = self._train_batch_size * args.gradient_accumulation_steps
      return per_step_batch * dp_world_size
  def get_sp_size(self) -> int:
      """Return the accelerator's sequence parallel size, or 1 when it has no parallelism config."""
      parallelism_config = getattr(self.accelerator, "parallelism_config", None)
      return 1 if parallelism_config is None else parallelism_config.sp_size
  def get_cp_size(self) -> int:
      """Return the accelerator's context parallel size, or 1 when it has no parallelism config."""
      parallelism_config = getattr(self.accelerator, "parallelism_config", None)
      return 1 if parallelism_config is None else parallelism_config.cp_size
  def get_tp_size(self) -> int:
      """
      Get the tensor parallel size from either the model or DeepSpeed config.

      Lookup order: `self.model._tp_size` when it is set; otherwise `tensor_parallel.autotp_size` from the
      DeepSpeed config when DeepSpeed is enabled and a config is attached to the args; otherwise 1.
      """
      # 1. The model's own setting wins.
      tp_from_model = getattr(self.model, "_tp_size", None)
      if tp_from_model is not None:
          return tp_from_model

      # 2. Fall back to the DeepSpeed config if enabled
      if self.is_deepspeed_enabled:
          ds_config = getattr(self.args, "hf_deepspeed_config", None)
          if ds_config:
              tensor_parallel_section = ds_config.config.get("tensor_parallel", {})
              return tensor_parallel_section.get("autotp_size", 1)

      # 3. Default fallback
      return 1
  2070. def _wrap_model(self, model: nn.Module, training: bool = True, dataloader: DataLoader | None = None) -> nn.Module:
  2071. """Wrap `model` for distributed training if needed (DDP, FSDP, SageMaker, etc.)."""
  2072. # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
  2073. if self.accelerator.unwrap_model(model, keep_torch_compile=False) is not model:
  2074. return model
  2075. if is_sagemaker_mp_enabled():
  2076. # Wrapping the base model twice in a DistributedModel will raise an error.
  2077. if isinstance(model, smp.model.DistributedModel):
  2078. return model
  2079. return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)
  2080. # Multi-gpu training, 8bit models does not support DP
  2081. if self.args.n_gpu > 1 and not getattr(model, "is_loaded_in_8bit", False):
  2082. model = nn.DataParallel(model)
  2083. # Note: in torch.distributed mode, there's no point in wrapping the model
  2084. # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
  2085. if not training:
  2086. return model
  2087. # Distributed training using PyTorch FSDP
  2088. if self.is_fsdp_xla_enabled:
  2089. model = wrap_model_xla_fsdp(model, self.args, self.is_fsdp_xla_v2_enabled)
  2090. elif is_sagemaker_dp_enabled():
  2091. model = nn.parallel.DistributedDataParallel(
  2092. model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]
  2093. )
  2094. return model
  def _update_auto_batch_size(self, batch_size):
      """
      Free memory, reset model wrapping, and update DeepSpeed config for the new batch size when using
      `auto_find_batch_size`.

      Args:
          batch_size: The new value for `self._train_batch_size`.
      """
      # `_train_batch_size` value might have changed to `auto_find_batch_size`
      self._train_batch_size = batch_size

      # frees the wrapped model and resets it back to the unwrapped base model
      release_memory(self.model_wrapped)
      if self.is_fsdp_enabled:
          # Remove FSDP wrapping from sub-models because self.model points to the wrapped model in FSDP case
          self.model = unwrap_model(self.model, recursive=True)
      self.model_wrapped = self.model

      # Check for DeepSpeed *after* the initial pass and modify the config
      if self.is_deepspeed_enabled:
          # Temporarily override `per_device_train_batch_size` so the propagated config sees the new size.
          original_bs = self.args.per_device_train_batch_size
          self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu)
          try:
              propagate_args_to_deepspeed(self.accelerator, self.args, auto_find_batch_size=True)
          finally:
              # Restore the user's setting even if propagation raises, so `self.args` is never left altered.
              self.args.per_device_train_batch_size = original_bs
  def _track_num_input_tokens(self, inputs):
      """Add the number of input tokens in `inputs` to `self.state.num_input_tokens_seen`.

      Nothing is tracked when `args.include_num_input_tokens_seen` is `"no"` or when the model's main
      input is missing from `inputs`. With `"non_padding"`, tokens are counted from the attention mask
      or, failing that, by comparing against the processing class's `pad_token_id`; otherwise every
      element of the main input is counted.
      """
      mode = self.args.include_num_input_tokens_seen
      if mode == "no":
          return

      input_key = getattr(self.model, "main_input_name", "input_ids")
      if input_key not in inputs:
          logger.warning(
              "Tried to track the number of tokens seen, however the current model is "
              "not configured properly to know what item is the input. To fix this, add "
              "a `main_input_name` attribute to the model class you are using."
          )
          return

      # `None` means "no non-padding count available": fall back to counting every element.
      token_count = None
      if mode == "non_padding":
          if "attention_mask" in inputs:
              token_count = inputs["attention_mask"].sum()
          else:
              pad_id = getattr(self.processing_class, "pad_token_id", None)
              if pad_id is not None:
                  token_count = (inputs[input_key] != pad_id).sum()
              else:
                  logger.warning(
                      "Could not determine method to count non-padding tokens, falling back to counting all tokens."
                  )
      if token_count is None:
          token_count = inputs[input_key].numel()

      # Gather the per-process counts and accumulate their sum into the trainer state.
      token_count = torch.as_tensor(token_count, device=self.args.device, dtype=torch.int64)
      gathered = self.accelerator.gather(token_count)
      self.state.num_input_tokens_seen += gathered.sum().item()
  def _clip_grad_norm(self, model):
      """Clip the gradients of `model` to `args.max_grad_norm` and return the clipping routine's result.

      With SageMaker model parallelism and fp16, clipping is delegated to the optimizer's
      `clip_master_grads`; in every other case the accelerator clips `model.parameters()`.
      """
      smp_fp16 = is_sagemaker_mp_enabled() and self.args.fp16
      if not smp_fp16:
          return self.accelerator.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
      return self.optimizer.clip_master_grads(self.args.max_grad_norm)
  def _get_grad_norm(self, model, grad_norm=None):
      """Return the gradient norm, computing it first when `grad_norm` is not supplied.

      Under DeepSpeed, a value exposing `.item()` is converted to a Python scalar; otherwise the
      value is returned as-is.
      """
      norm = grad_norm
      if norm is None:
          # An infinite max norm measures the gradients without actually clipping them.
          norm = self.accelerator.clip_grad_norm_(model.parameters(), float("inf"))

      under_deepspeed = self.accelerator.distributed_type == DistributedType.DEEPSPEED
      if under_deepspeed and hasattr(norm, "item"):
          norm = norm.item()
      return norm
  2156. # ---- Evaluation & Prediction ----
  def evaluate(
      self,
      eval_dataset: Dataset | dict[str, Dataset] | None = None,
      ignore_keys: list[str] | None = None,
      metric_key_prefix: str = "eval",
  ) -> dict[str, float]:
      """
      Run evaluation and returns metrics.

      The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
      (pass it to the init `compute_metrics` argument).

      You can also subclass and override this method to inject custom behavior.

      Args:
          eval_dataset (`Dataset` | dict[str, `Dataset`], *optional*):
              Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
              not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will
              evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the
              `__len__` method.

              <Tip>

              If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run
              separate evaluations on each dataset. This can be useful to monitor how training affects other
              datasets or simply to get a more fine-grained evaluation.
              When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one
              of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets
              `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the
              loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`.

              </Tip>

          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.
          metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
              An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
              "eval_bleu" if the prefix is "eval" (default)

      Returns:
          A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
          dictionary also contains the epoch number which comes from the training state.
      """
      # Fall back to the trainer's own evaluation dataset(s) when the caller gave none.
      caller_supplied = eval_dataset is not None
      if not caller_supplied:
          eval_dataset = self.eval_dataset

      # Several named datasets: evaluate each one separately and merge the prefixed metrics.
      if isinstance(eval_dataset, dict):
          combined = {}
          for name, dataset in eval_dataset.items():
              combined.update(
                  self.evaluate(
                      # Datasets stored on the trainer are referenced by name in the recursive call.
                      eval_dataset=dataset if caller_supplied else name,
                      ignore_keys=ignore_keys,
                      metric_key_prefix=f"{metric_key_prefix}_{name}",
                  )
              )
          return combined

      # memory metrics - must set up as early as possible
      self._memory_tracker.start()

      loader = self.get_eval_dataloader(eval_dataset)
      if self.is_fsdp_xla_v2_enabled:
          loader = tpu_spmd_dataloader(loader)

      started_at = time.time()
      # No point gathering the predictions if there are no metrics, otherwise we defer to
      # self.args.prediction_loss_only
      loss_only = True if self.compute_metrics is None else None
      output = self.evaluation_loop(
          loader,
          description="Evaluation",
          prediction_loss_only=loss_only,
          ignore_keys=ignore_keys,
          metric_key_prefix=metric_key_prefix,
      )

      # Shift the start time by the model preparation time so it is excluded from the speed metrics.
      prep_key = f"{metric_key_prefix}_model_preparation_time"
      global_batch_size = self.args.eval_batch_size * self.args.world_size
      if prep_key in output.metrics:
          started_at += output.metrics[prep_key]
      throughput = speed_metrics(
          metric_key_prefix,
          started_at,
          num_samples=output.num_samples,
          num_steps=math.ceil(output.num_samples / global_batch_size),
      )
      output.metrics.update(throughput)

      self.log(output.metrics)

      if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
          xm.master_print(met.metrics_report())

      self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
      self._memory_tracker.stop_and_update_metrics(output.metrics)
      return output.metrics
  def evaluation_loop(
      self,
      dataloader: DataLoader,
      description: str,
      prediction_loss_only: bool | None = None,
      ignore_keys: list[str] | None = None,
      metric_key_prefix: str = "eval",
  ) -> EvalLoopOutput:
      """
      Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

      Works both with or without labels.

      Args:
          dataloader (`DataLoader`):
              The dataloader to iterate over. Its `dataset` attribute (if any) is used to determine the number
              of samples.
          description (`str`):
              Name of the run, used in the log header. When it equals `"Prediction"`, predictions, labels and
              inputs are accumulated even if `args.batch_eval_metrics` is set.
          prediction_loss_only (`bool`, *optional*):
              Forwarded to `prediction_step`. Falls back to `args.prediction_loss_only` when `None`.
          ignore_keys (`list[str]`, *optional*):
              Forwarded to `prediction_step`.
          metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
              Prefix applied to every metric key that does not already start with it.

      Returns:
          `EvalLoopOutput`: the accumulated predictions and label ids, the metrics dictionary and the number of
          samples.
      """
      args = self.args

      prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only

      # if eval is called w/o train, handle model prep here
      if self.is_deepspeed_enabled and self.deepspeed is None:
          _, _ = deepspeed_init(self, num_training_steps=0, inference=True)

      model = self._wrap_model(self.model, training=False)

      # Only prepare the model when the accelerator holds no model yet and `_wrap_model` left it unwrapped.
      if len(self.accelerator._models) == 0 and model is self.model:
          start_time = time.time()
          model = (
              self.accelerator.prepare(model)
              if self.is_deepspeed_enabled or (self.is_fsdp_enabled and not self.args.torch_compile)
              else self.accelerator.prepare_model(model, evaluation_mode=True)
          )
          # Recorded so it can be reported as a metric below (callers subtract it from the timing).
          self.model_preparation_time = round(time.time() - start_time, 4)

          if self.is_fsdp_enabled:
              self.model = model

          # for the rest of this function `model` is the outside model, whether it was wrapped or not
          if model is not self.model:
              self.model_wrapped = model

          # backward compatibility
          if self.is_deepspeed_enabled:
              self.deepspeed = self.model_wrapped

      # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
      # while ``train`` is running, cast it to the right dtype first and then put on device
      if not self.is_in_train:
          if args.fp16_full_eval:
              model = model.to(dtype=torch.float16, device=args.device)
          elif args.bf16_full_eval:
              model = model.to(dtype=torch.bfloat16, device=args.device)

      batch_size = self.args.eval_batch_size

      logger.info(f"\n***** Running {description} *****")
      if has_length(dataloader):
          logger.info(f" Num examples = {self.num_examples(dataloader)}")
      else:
          logger.info(" Num examples: Unknown")
      logger.info(f" Batch size = {batch_size}")

      # Switch both the model and the optimizer to eval mode when they expose a callable `eval`.
      if hasattr(model, "eval") and callable(model.eval):
          model.eval()
      if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
          self.optimizer.eval()

      self.callback_handler.eval_dataloader = dataloader
      # Do this before wrapping.
      eval_dataset = getattr(dataloader, "dataset", None)

      # Initialize containers
      all_losses = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_preds = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)

      metrics = None
      eval_set_kwargs = {}

      # Will be useful when we have an iterable dataset so don't know its length.
      observed_num_examples = 0

      # Main evaluation loop
      for step, inputs in enumerate(dataloader):
          # Update the observed num examples
          observed_batch_size = find_batch_size(inputs)
          if observed_batch_size is not None:
              observed_num_examples += observed_batch_size
              # For batch samplers, batch_size is not known by the dataloader in advance.
              if batch_size is None:
                  batch_size = observed_batch_size

          # Prediction step
          losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
          main_input_name = getattr(self.model, "main_input_name", "input_ids")
          # The main input is only kept when the user asked for inputs to be passed to the metrics.
          inputs_decode = (
              self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None
          )

          if is_torch_xla_available():
              xm.mark_step()

          # Update containers
          if losses is not None:
              # The batch loss is repeated `batch_size` times before being gathered.
              losses = self.gather_function(losses.repeat(batch_size))
              all_losses.add(losses)
          if inputs_decode is not None:
              inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100)
              inputs_decode = self.gather_function(inputs_decode)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_inputs.add(inputs_decode)
          if labels is not None:
              # Pad labels here, preparing for preprocess_logits_for_metrics in next logits block.
              labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
          if logits is not None:
              logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100)
              if self.preprocess_logits_for_metrics is not None:
                  logits = self.preprocess_logits_for_metrics(logits, labels)
              logits = self.gather_function(logits)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_preds.add(logits)
          if labels is not None:
              # Labels are gathered only after the logits block, which needs them padded but not yet gathered.
              labels = self.gather_function(labels)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_labels.add(labels)

          self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)

          if self.args.batch_eval_metrics:
              if self.compute_metrics is not None and logits is not None and labels is not None:
                  is_last_step = self.accelerator.gradient_state.end_of_dataloader
                  batch_kwargs = {}
                  batch_kwargs["losses"] = losses if "loss" in args.include_for_metrics else None
                  # NOTE(review): this passes the raw batch `inputs`, whereas the non-batched path below passes
                  # `all_inputs` built from the gathered `inputs_decode` - confirm this is intended.
                  batch_kwargs["inputs"] = inputs if "inputs" in args.include_for_metrics else None
                  metrics = self.compute_metrics(
                      EvalPrediction(predictions=logits, label_ids=labels, **batch_kwargs),
                      compute_result=is_last_step,
                  )

              # Drop the per-batch references before the next iteration.
              del losses, logits, labels, inputs
              torch.cuda.empty_cache()

          # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
          elif args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
              all_losses.to_cpu_and_numpy()
              all_preds.to_cpu_and_numpy()
              all_labels.to_cpu_and_numpy()
              all_inputs.to_cpu_and_numpy()

              del losses, logits, labels, inputs
              torch.cuda.empty_cache()

      # After all calls to `.gather_function`, reset to `gather_for_metrics`:
      self.gather_function = self.accelerator.gather_for_metrics

      # Gather all remaining tensors and put them back on the CPU
      all_losses = all_losses.get_arrays()
      all_preds = all_preds.get_arrays()
      all_labels = all_labels.get_arrays()
      all_inputs = all_inputs.get_arrays()

      # Number of samples
      if has_length(eval_dataset):
          num_samples = len(eval_dataset)
      # The instance check is weird and does not actually check for the type, but whether the dataset has the right
      # methods. Therefore we need to make sure it also has the attribute.
      elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
          num_samples = eval_dataset.num_examples
      else:
          if has_length(dataloader):
              num_samples = self.num_examples(dataloader)
          else:  # both len(dataloader.dataset) and len(dataloader) fail
              num_samples = observed_num_examples
      # A reported size of zero is replaced by the number of examples actually seen in the loop.
      if num_samples == 0 and observed_num_examples > 0:
          num_samples = observed_num_examples

      # Metrics!
      if (
          self.compute_metrics is not None
          and all_preds is not None
          and all_labels is not None
          and not self.args.batch_eval_metrics
      ):
          eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None
          eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None
          metrics = self.compute_metrics(
              EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs)
          )
      elif metrics is None:
          metrics = {}

      # To be JSON-serializable, we need to remove numpy types or zero-d tensors
      metrics = denumpify_detensorize(metrics)

      # `all_losses` is either a non-empty list of arrays or a single array; both are averaged to one float.
      if isinstance(all_losses, list) and all_losses:
          metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item()
      elif isinstance(all_losses, np.ndarray):
          metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
      if hasattr(self, "model_preparation_time"):
          metrics[f"{metric_key_prefix}_model_preparation_time"] = self.model_preparation_time

      # Prefix all keys with metric_key_prefix + '_'
      for key in list(metrics.keys()):
          if not key.startswith(f"{metric_key_prefix}_"):
              metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

      return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
  def predict(
      self, test_dataset: Dataset, ignore_keys: list[str] | None = None, metric_key_prefix: str = "test"
  ) -> PredictionOutput:
      """
      Run prediction and returns predictions and potential metrics.

      Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
      will also return metrics, like in `evaluate()`.

      Args:
          test_dataset (`Dataset`):
              Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the
              `model.forward()` method are automatically removed. Has to implement the method `__len__`
          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.
          metric_key_prefix (`str`, *optional*, defaults to `"test"`):
              An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
              "test_bleu" if the prefix is "test" (default)

      <Tip>

      If your predictions or labels have different sequence length (for instance because you're doing dynamic padding
      in a token classification task) the predictions will be padded (on the right) to allow for concatenation into
      one array. The padding index is -100.

      </Tip>

      Returns: *NamedTuple* A namedtuple with the following keys:

          - predictions (`np.ndarray`): The predictions on `test_dataset`.
          - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
          - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
            labels).
      """
      # memory metrics - must set up as early as possible
      self._memory_tracker.start()

      loader = self.get_test_dataloader(test_dataset)
      began_at = time.time()
      output = self.evaluation_loop(
          loader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
      )
      metrics = output.metrics

      # Shift the start time by the model preparation time so it is excluded from the speed metrics.
      global_batch_size = self.args.eval_batch_size * self.args.world_size
      prep_key = f"{metric_key_prefix}_model_preparation_time"
      if prep_key in metrics:
          began_at += metrics[prep_key]
      throughput = speed_metrics(
          metric_key_prefix,
          began_at,
          num_samples=output.num_samples,
          num_steps=math.ceil(output.num_samples / global_batch_size),
      )
      metrics.update(throughput)

      self.control = self.callback_handler.on_predict(self.args, self.state, self.control, metrics)
      self._memory_tracker.stop_and_update_metrics(metrics)

      return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
  def prediction_step(
      self,
      model: nn.Module,
      inputs: dict[str, torch.Tensor | Any],
      prediction_loss_only: bool,
      ignore_keys: list[str] | None = None,
  ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
      """
      Perform an evaluation step on `model` using `inputs`.

      Subclass and override to inject custom behavior.

      Args:
          model (`nn.Module`):
              The model to evaluate.
          inputs (`dict[str, torch.Tensor | Any]`):
              The inputs and targets of the model.

              The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
              argument `labels`. Check your model's documentation for all accepted arguments.
          prediction_loss_only (`bool`):
              Whether or not to return the loss only.
          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.

      Return:
          tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
          logits and labels (each being optional).
      """
      # Labels count as present only when every name in `self.label_names` maps to a non-None input.
      has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names)
      # For CLIP-like models capable of returning loss values.
      # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
      # is `True` in `model.forward`.
      return_loss = inputs.get("return_loss")
      if return_loss is None:
          return_loss = self.can_return_loss
      loss_without_labels = len(self.label_names) == 0 and return_loss

      inputs = self._prepare_inputs(inputs)
      if ignore_keys is None:
          # Default to the model config's `keys_to_ignore_at_inference` when the model has a config.
          if hasattr(self.model, "config"):
              ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", ["past_key_values"])
          else:
              ignore_keys = []

      # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
      if has_labels or loss_without_labels:
          labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
          # A single label entry is unwrapped from its one-element tuple.
          if len(labels) == 1:
              labels = labels[0]
      else:
          labels = None

      with torch.no_grad():
          if is_sagemaker_mp_enabled():
              # SageMaker model-parallel path: outputs come back per micro-batch (`*_mb`) and are
              # reduced / concatenated below.
              raw_outputs = smp_forward_only(model, inputs)
              if has_labels or loss_without_labels:
                  if isinstance(raw_outputs, dict):
                      loss_mb = raw_outputs["loss"]
                      logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
                  else:
                      # Non-dict outputs: the first element is taken as the loss, the rest as logits.
                      loss_mb = raw_outputs[0]
                      logits_mb = raw_outputs[1:]

                  loss = loss_mb.reduce_mean().detach().cpu()
                  logits = smp_nested_concat(logits_mb)
              else:
                  loss = None
                  if isinstance(raw_outputs, dict):
                      logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
                  else:
                      logits_mb = raw_outputs
                  logits = smp_nested_concat(logits_mb)
          else:
              if has_labels or loss_without_labels:
                  with self.compute_loss_context_manager():
                      num_items_in_batch = self._get_num_items_in_batch([inputs], self.args.device)
                      loss, outputs = self.compute_loss(
                          model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
                      )
                  loss = loss.detach().mean()

                  if isinstance(outputs, dict):
                      logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                  else:
                      # Non-dict outputs: the first element is skipped and the rest kept as logits.
                      logits = outputs[1:]
              else:
                  # No loss can be computed: run a plain forward pass and keep the non-ignored outputs.
                  loss = None
                  with self.compute_loss_context_manager():
                      outputs = model(**inputs)
                  if isinstance(outputs, dict):
                      logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                  else:
                      logits = outputs

      if prediction_loss_only:
          return (loss, None, None)

      logits = nested_detach(logits)
      # A single output is unwrapped from its one-element container.
      if len(logits) == 1:
          logits = logits[0]

      return (loss, logits, labels)
  def _evaluate(
      self,
      trial: "optuna.Trial | dict[str, Any] | None",
      ignore_keys_for_eval: list[str] | None,
      skip_scheduler: bool = False,
  ) -> dict[str, float]:
      """Evaluate, report the metrics to the HP search, and step a metric-driven LR scheduler.

      The scheduler step only happens when `self.lr_scheduler` is a `ReduceLROnPlateau` or `GreedyLR`
      instance and `skip_scheduler` is falsy. It raises `KeyError` when the metric configured through
      `metric_for_best_model` is absent from the evaluation metrics.
      """
      metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
      self._report_to_hp_search(trial, self.state.global_step, metrics)

      metric_driven_schedulers = (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)
      if skip_scheduler or not isinstance(self.lr_scheduler, metric_driven_schedulers):
          return metrics

      # Run delayed LR scheduler now that metrics are populated
      configured = self.args.metric_for_best_model
      metric_to_check = configured if configured.startswith("eval_") else f"eval_{configured}"
      try:
          self.lr_scheduler.step(metrics[metric_to_check])
      except KeyError as exc:
          raise KeyError(
              f"The `metric_for_best_model` training argument is set to '{metric_to_check}', "
              f"which is not found in the evaluation metrics. "
              f"The available evaluation metrics are: {list(metrics.keys())}. "
              f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or "
              f"consider changing the `metric_for_best_model` via the TrainingArguments."
          ) from exc
      return metrics
  2580. # ---- Checkpoint Saving ----
  def _get_output_dir(self, trial: "optuna.Trial | dict[str, Any] | None") -> str:
      """Return the directory to write into: `args.output_dir`, or a per-run sub-directory during HP search.

      A sub-directory is used only when both a hyperparameter-search backend and a `trial` are set. Its
      name comes from `self.hp_name(trial)` when `hp_name` is defined, else `run-<backend run id>`.
      """
      base_dir = self.args.output_dir
      if self.hp_search_backend is None or trial is None:
          return base_dir

      # Look up the backend-specific identifier of the current run.
      backend = self.hp_search_backend
      if backend == HPSearchBackend.OPTUNA:
          run_id = trial.number
      elif backend == HPSearchBackend.RAY:
          import ray.tune

          run_id = ray.tune.get_context().get_trial_id()
      elif backend == HPSearchBackend.WANDB:
          import wandb

          run_id = wandb.run.id

      if self.hp_name is not None:
          run_name = self.hp_name(trial)
      else:
          run_name = f"run-{run_id}"
      return os.path.join(base_dir, run_name)
  def _save_checkpoint(self, model: nn.Module, trial: "optuna.Trial | dict[str, Any] | None") -> None:
      """Save model checkpoint, optimizer, scheduler, scaler, RNG states, and trainer state.

      The checkpoint is written to `<run_dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>`, where `run_dir` comes
      from `_get_output_dir(trial)`.

      Args:
          model (`nn.Module`):
              Not read by this method; the model is saved through `self.save_model`.
          trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
              The current hyperparameter-search trial, if any. Used to resolve the run directory.
      """
      # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
      # want to save except FullyShardedDDP.
      # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"

      # Save model checkpoint
      checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

      # FLOs are only stored when this is not a hyperparameter-search run.
      if self.hp_search_backend is None and trial is None:
          self.store_flos()

      run_dir = self._get_output_dir(trial=trial)
      output_dir = os.path.join(run_dir, checkpoint_folder)
      self.save_model(output_dir, _internal_call=True)

      if (
          self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH, SaveStrategy.BEST]
          and self.state.best_global_step
      ):
          # Wait for everyone to get here so we are sure the model has been saved by process 0
          # before we check if the best_checkpoint_dir exists
          if is_torch_xla_available():
              xm.rendezvous("load_best_model_at_end")
          elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
              dist.barrier()
          elif is_sagemaker_mp_enabled():
              smp.barrier()
          best_checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}"
          best_checkpoint_dir = os.path.join(run_dir, best_checkpoint_folder)

          # Only record the best checkpoint once its directory actually exists on disk.
          if os.path.exists(best_checkpoint_dir):
              self.state.best_model_checkpoint = best_checkpoint_dir

      if not self.args.save_only_model:
          # Save optimizer and scheduler
          self._save_optimizer_and_scheduler(output_dir)
          self._save_scaler(output_dir)
          # Save RNG state
          self._save_rng_state(output_dir)

      # Save the Trainer state
      if self.args.should_save:
          # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently
          for cb in [
              cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
          ]:
              cb_name = cb.__class__.__name__
              cb_state = cb.state()
              # An entry that is already a list collects one state per callback of that class;
              # otherwise the single stored state is overwritten.
              if isinstance(self.state.stateful_callbacks[cb_name], list):
                  self.state.stateful_callbacks[cb_name].append(cb_state)
              else:
                  self.state.stateful_callbacks[cb_name] = cb_state
          self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))

      if self.args.push_to_hub:
          self._push_from_checkpoint(output_dir)

      # Maybe delete some older checkpoints.
      if self.args.should_save:
          # we use mtime as default, filesystems without mtime support will be detected in `sort_checkpoints`
          rotate_checkpoints(
              output_dir=run_dir,
              save_total_limit=self.args.save_total_limit,
              best_model_checkpoint=self.state.best_model_checkpoint,
              use_mtime=True,
          )
  def _determine_best_metric(self, metrics: dict[str, float], trial: "optuna.Trial | dict[str, Any] | None") -> bool:
      """
      Compare the tracked evaluation metric against the best value seen so far and update the state.

      Nothing is compared when `args.metric_for_best_model` is unset. On improvement, `state.best_metric`
      is updated and, for the step/epoch/best save strategies, `state.best_global_step` as well.

      Returns:
          bool: True if a new best metric was found, else False

      Raises:
          KeyError: If the configured metric is missing from `metrics`.
      """
      configured = self.args.metric_for_best_model
      if configured is None:
          return False

      metric_to_check = configured if configured.startswith("eval_") else f"eval_{configured}"
      try:
          metric_value = metrics[metric_to_check]
      except KeyError as exc:
          raise KeyError(
              f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. "
              f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments."
          ) from exc

      is_improvement = np.greater if self.args.greater_is_better else np.less

      # Seed the best value with the worst possible one so the first evaluation always wins.
      if self.state.best_metric is None:
          self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf")

      if not is_improvement(metric_value, self.state.best_metric):
          return False

      self.state.best_metric = metric_value
      if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH, SaveStrategy.BEST]:
          self.state.best_global_step = self.state.global_step
      return True
  def _save_rng_state(self, output_dir: str) -> None:
      """Collect the RNG states of Python, NumPy, torch and any available accelerator, and save them.

      The states go to `rng_state.pth` inside `output_dir` for a single process, or to
      `rng_state_<process_index>.pth` when `args.world_size` is greater than one.
      """
      rng_states = {
          "python": random.getstate(),
          "numpy": np.random.get_state(),
          "cpu": torch.random.get_rng_state(),
      }

      if torch.cuda.is_available():
          # Distributed runs record the states of all CUDA devices, other runs a single state.
          if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
              rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
          else:
              rng_states["cuda"] = torch.cuda.random.get_rng_state()

      if is_torch_xla_available():
          rng_states["xla"] = xm.get_rng_state()

      # The remaining accelerators share the same pattern. The module lookup is wrapped in a lambda so
      # that `torch.<backend>` is only touched when that backend is reported as available.
      accelerator_backends = (
          ("npu", is_torch_npu_available, lambda: torch.npu.random),
          ("hpu", is_torch_hpu_available, lambda: torch.hpu.random),
          ("mlu", is_torch_mlu_available, lambda: torch.mlu.random),
          ("musa", is_torch_musa_available, lambda: torch.musa),
      )
      for key, backend_available, resolve_rng_module in accelerator_backends:
          if not backend_available():
              continue
          if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
              rng_states[key] = resolve_rng_module().get_rng_state_all()
          else:
              rng_states[key] = resolve_rng_module().get_rng_state()

      # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
      # not yet exist.
      os.makedirs(output_dir, exist_ok=True)

      if self.args.world_size <= 1:
          file_name = "rng_state.pth"
      else:
          file_name = f"rng_state_{self.args.process_index}.pth"
      torch.save(rng_states, os.path.join(output_dir, file_name))
  def _save_optimizer_and_scheduler(self, output_dir: str) -> None:
      """
      Save optimizer and learning rate scheduler states to `output_dir`.

      The optimizer is written by the first matching backend: torch XLA, SageMaker model parallel, DeepSpeed
      (through `save_checkpoint` of the wrapped model), FSDP, or a plain `torch.save` when `args.should_save` is
      set. The scheduler is written by the XLA branch itself, or afterwards with `torch.save` when this process
      should save and DeepSpeed is either disabled or used with a scheduler that is not a
      `DeepSpeedSchedulerWrapper`.

      Args:
          output_dir (`str`):
              Checkpoint directory receiving the state files.
      """
      optimizer_path = os.path.join(output_dir, OPTIMIZER_NAME)
      scheduler_path = os.path.join(output_dir, SCHEDULER_NAME)

      if is_torch_xla_available():
          xm.rendezvous("saving_optimizer_states")
          if self.is_fsdp_xla_v1_enabled:
              # One file per rank, holding the optimizer state next to the shard metadata of the model.
              sharded_state = {
                  "optimizer": self.optimizer.state_dict(),
                  "shard_metadata": self.model.get_shard_metadata(),
              }
              rank_file = f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
              xm.save(sharded_state, os.path.join(output_dir, rank_file), master_only=False)
          else:
              xm.save(self.optimizer.state_dict(), optimizer_path)
          with warnings.catch_warnings(record=True) as caught_warnings:
              xm.save(self.lr_scheduler.state_dict(), scheduler_path)
              reissue_pt_warnings(caught_warnings)
      elif is_sagemaker_mp_enabled():
          opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
          smp.barrier()
          shard_optimizer_state = smp.state.cfg.shard_optimizer_state
          if smp.rdp_rank() == 0 or shard_optimizer_state:
              smp.save(opt_state_dict, optimizer_path, partial=True, v3=smp.state.cfg.shard_optimizer_state)
      elif self.is_deepspeed_enabled:
          # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
          # config `stage3_gather_16bit_weights_on_model_save` is True
          save_checkpoint_params = inspect.signature(self.model_wrapped.save_checkpoint).parameters
          if "exclude_frozen_parameters" in save_checkpoint_params and _is_peft_model(self.model):
              self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True)
          else:
              self.model_wrapped.save_checkpoint(output_dir)
      elif self.is_fsdp_enabled:
          # save fsdp specific ckpt for resuming from ckpt
          fsdp_plugin = self.accelerator.state.fsdp_plugin
          save_fsdp_model(fsdp_plugin, self.accelerator, self.model, output_dir, **get_fsdp_ckpt_kwargs())
          save_fsdp_optimizer(
              self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
          )
      elif self.args.should_save:
          torch.save(self.optimizer.state_dict(), optimizer_path)

      # The scheduler is skipped here only when DeepSpeed runs with its own `DeepSpeedSchedulerWrapper`; a custom
      # scheduler used together with DeepSpeed still has to be written by hand.
      deepspeed_wraps_scheduler = self.is_deepspeed_enabled and isinstance(
          self.lr_scheduler, DeepSpeedSchedulerWrapper
      )
      if self.args.should_save and not deepspeed_wraps_scheduler and not is_torch_xla_available():
          with warnings.catch_warnings(record=True) as caught_warnings:
              torch.save(self.lr_scheduler.state_dict(), scheduler_path)
          reissue_pt_warnings(caught_warnings)
  def _save_scaler(self, output_dir: str) -> None:
      """
      Persist the state of the accelerator's gradient scaler, when there is one.

      Nothing is written when `self.accelerator` exposes no `scaler` attribute or when that attribute is `None`.

      Args:
          output_dir (`str`):
              Directory receiving the `SCALER_NAME` file.
      """
      try:
          scaler = self.accelerator.scaler
      except AttributeError:
          scaler = None
      if scaler is None:
          return

      scaler_path = os.path.join(output_dir, SCALER_NAME)
      if is_torch_xla_available():
          xm.rendezvous("saving_scaler_state")
          with warnings.catch_warnings(record=True) as caught_warnings:
              xm.save(scaler.state_dict(), scaler_path)
              reissue_pt_warnings(caught_warnings)
      elif self.args.should_save:
          # Outside of XLA only the process flagged by `args.should_save` writes the file.
          with warnings.catch_warnings(record=True) as caught_warnings:
              torch.save(scaler.state_dict(), scaler_path)
          reissue_pt_warnings(caught_warnings)
  2808. # ---- Checkpoint Resuming ----
  def _load_from_checkpoint(self, resume_from_checkpoint: str, model: nn.Module | None = None) -> None:
      """
      Load model weights from the checkpoint directory `resume_from_checkpoint`.

      Args:
          resume_from_checkpoint (`str`):
              Path of the checkpoint to read the weights from.
          model (`nn.Module`, *optional*):
              Module receiving the weights. Defaults to `self.model`.

      Raises:
          ValueError: If the checkpoint has an FSDP layout while FSDP is not enabled, or if no weights file, FSDP
              checkpoint or adapter sub-directory can be found at `resume_from_checkpoint`.
      """
      if model is None:
          model = self.model

      def in_checkpoint(*parts):
          # Path of an entry located inside the checkpoint directory.
          return os.path.join(resume_from_checkpoint, *parts)

      config_file = in_checkpoint(CONFIG_NAME)
      weights_file = in_checkpoint(WEIGHTS_NAME)
      safe_weights_file = in_checkpoint(SAFE_WEIGHTS_NAME)
      known_weight_files = [
          weights_file,
          safe_weights_file,
          in_checkpoint(WEIGHTS_INDEX_NAME),
          in_checkpoint(SAFE_WEIGHTS_INDEX_NAME),
          in_checkpoint(ADAPTER_WEIGHTS_NAME),
          in_checkpoint(ADAPTER_SAFE_WEIGHTS_NAME),
      ]

      checkpoint_is_dir = os.path.isdir(resume_from_checkpoint)
      subfolders = []
      if checkpoint_is_dir:
          subfolders = [
              entry for entry in os.listdir(resume_from_checkpoint) if os.path.isdir(in_checkpoint(entry))
          ]

      # A sub-directory named after FSDP_MODEL_NAME is what `SHARDED_STATE_DICT` produces, a single
      # `{FSDP_MODEL_NAME}.bin` file is what `FULL_STATE_DICT` produces.
      is_fsdp_ckpt = checkpoint_is_dir and (
          any(FSDP_MODEL_NAME in folder_name for folder_name in subfolders)
          or os.path.isfile(in_checkpoint(f"{FSDP_MODEL_NAME}.bin"))
      )
      # if multiple adapters exist, they get saved in sub directories
      adapter_subdirs = [
          folder_name
          for folder_name in subfolders
          if os.path.isfile(in_checkpoint(folder_name, ADAPTER_WEIGHTS_NAME))
          or os.path.isfile(in_checkpoint(folder_name, ADAPTER_SAFE_WEIGHTS_NAME))
      ]

      if is_fsdp_ckpt and not self.is_fsdp_enabled:
          raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP")

      has_weight_file = any(os.path.isfile(candidate) for candidate in known_weight_files)
      if not (has_weight_file or is_fsdp_ckpt or adapter_subdirs):
          raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")

      logger.info(f"Loading model from {resume_from_checkpoint}.")

      if os.path.isfile(config_file):
          config = PreTrainedConfig.from_json_file(config_file)
          checkpoint_version = config.transformers_version
          if checkpoint_version is not None and checkpoint_version != __version__:
              logger.warning(
                  f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
                  f"Transformers but your current version is {__version__}. This is not recommended and could "
                  "yield to errors or unwanted behaviors."
              )

      if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt:
          # If the model is on the GPU, it still works!
          if is_sagemaker_mp_enabled():
              smp.resume_from_checkpoint(
                  path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
              )
          elif self.is_fsdp_enabled:
              load_fsdp_model(
                  self.accelerator.state.fsdp_plugin,
                  self.accelerator,
                  model,
                  resume_from_checkpoint,
                  **get_fsdp_ckpt_kwargs(),
              )
          else:
              # We load the model state dict on the CPU to avoid an OOM error.
              if os.path.isfile(safe_weights_file):
                  state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
              else:
                  check_torch_load_is_safe()
                  state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)
              # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
              # which takes *args instead of **kwargs
              load_result = model.load_state_dict(state_dict, False)
              # release memory
              del state_dict
              self._issue_warnings_after_load(load_result)
      elif _is_peft_model(model):
          # Load adapters following PR # 24096
          # If training a model using PEFT, assume that adapter have been saved properly.
          can_load_adapters = hasattr(model, "active_adapters") and hasattr(model, "load_adapter")
          if not can_load_adapters:
              logger.warning(f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed")
          elif not os.path.exists(resume_from_checkpoint):
              logger.warning(
                  "The intermediate checkpoints of PEFT may not be saved correctly, "
                  f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                  "Check some examples here: https://github.com/huggingface/peft/issues/96"
              )
          else:
              active_adapters = model.active_adapters
              if len(active_adapters) > 1:
                  logger.warning("Multiple active adapters detected will only consider the first adapter")
              active_adapter = active_adapters[0]
              if adapter_subdirs:
                  # Every adapter sub-directory is loaded under its own name; only the active one is trainable.
                  for subdir_name in adapter_subdirs:
                      model.load_adapter(
                          in_checkpoint(subdir_name), subdir_name, is_trainable=(subdir_name == active_adapter)
                      )
                  model.set_adapter(active_adapter)
              else:
                  model.load_adapter(resume_from_checkpoint, active_adapter, is_trainable=True)
      else:
          # We load the sharded checkpoint
          load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
          if not is_sagemaker_mp_enabled():
              self._issue_warnings_after_load(load_result)
  def _load_best_model(self) -> None:
      """
      Load the best model found during training based on the tracked metric.

      The weights are read from `self.state.best_model_checkpoint`. The first matching strategy is used:
      DeepSpeed, FSDP, a single weights or adapter file, a sharded checkpoint index. When none matches, a warning
      is logged and the model is left untouched.
      """
      logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
      best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
      best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME)
      best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME)
      best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)
      # SageMaker model parallel operates on the wrapped model, every other path on `self.model`.
      model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
      if self.is_deepspeed_enabled:
          # Strict module loading is turned off for PEFT models.
          deepspeed_load_checkpoint(
              self.model_wrapped,
              self.state.best_model_checkpoint,
              load_module_strict=not _is_peft_model(self.model),
          )
      elif self.is_fsdp_enabled:
          # NOTE(review): `load_result` is assigned here but not read again in this branch.
          load_result = load_fsdp_model(
              self.accelerator.state.fsdp_plugin,
              self.accelerator,
              model,
              self.state.best_model_checkpoint,
              **get_fsdp_ckpt_kwargs(),
          )
      elif (
          os.path.exists(best_model_path)
          or os.path.exists(best_safe_model_path)
          or os.path.exists(best_adapter_model_path)
          or os.path.exists(best_safe_adapter_model_path)
      ):
          # `has_been_loaded` is cleared by the PEFT branches that only log a warning, so that the key check at
          # the end does not read a `load_result` that was never assigned.
          has_been_loaded = True
          if is_sagemaker_mp_enabled():
              smp.resume_from_checkpoint(
                  path=self.state.best_model_checkpoint,
                  tag=WEIGHTS_NAME,
                  partial=False,
                  load_optimizer=False,
              )
          else:
              if _is_peft_model(model):
                  # If training a model using PEFT, assume that adapter have been saved properly.
                  if hasattr(model, "active_adapters") and hasattr(model, "load_adapter"):
                      active_adapter = model.active_adapters[0]
                      if len(model.active_adapters) > 1:
                          logger.warning("Detected multiple active adapters, will only consider the first one")
                      if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path):
                          try:
                              model.load_adapter(self.state.best_model_checkpoint, active_adapter)
                          except RuntimeError as exc:
                              # Prompt learning adapters get a more helpful message, chained to the original
                              # error; any other `RuntimeError` is re-raised untouched.
                              if model.peft_config[active_adapter].is_prompt_learning:
                                  # for context: https://github.com/huggingface/peft/issues/2256
                                  msg = (
                                      "When using prompt learning PEFT methods such as "
                                      f"{model.peft_config[active_adapter].peft_type.value}, setting "
                                      "load_best_model_at_end=True can lead to errors, it is recommended "
                                      "to set this to False and to load the model manually from the checkpoint "
                                      "directory using PeftModel.from_pretrained(base_model, <path>) after training "
                                      "has finished."
                                  )
                                  raise RuntimeError(msg) from exc
                              else:
                                  raise
                          # Load_adapter has no return value present, modify it when appropriate.
                          from torch.nn.modules.module import _IncompatibleKeys
                          # Empty placeholder so that the key check below has nothing to report.
                          load_result = _IncompatibleKeys([], [])
                      else:
                          logger.warning(
                              "The intermediate checkpoints of PEFT may not be saved correctly, "
                              f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                              "Check some examples here: https://github.com/huggingface/peft/issues/96"
                          )
                          has_been_loaded = False
                  else:
                      logger.warning(
                          f"Could not load adapter model, make sure to have PEFT >= {MIN_PEFT_VERSION} installed"
                      )
                      has_been_loaded = False
              else:
                  # We load the model state dict on the CPU to avoid an OOM error.
                  if os.path.isfile(best_safe_model_path):
                      state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                  else:
                      check_torch_load_is_safe()
                      state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)
                  # If the model is on the GPU, it still works!
                  # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
                  # which takes *args instead of **kwargs
                  load_result = model.load_state_dict(state_dict, False)
              if not is_sagemaker_mp_enabled() and has_been_loaded:
                  self._issue_warnings_after_load(load_result)
      elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists(
          os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)
      ):
          # Sharded checkpoint: an index file is present instead of a single weights file.
          load_result = load_sharded_checkpoint(
              model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled()
          )
          if not is_sagemaker_mp_enabled():
              self._issue_warnings_after_load(load_result)
      else:
          logger.warning(
              f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
              "on multiple nodes, you should activate `--save_on_each_node`."
          )
  def _load_rng_state(self, checkpoint: str | None) -> None:
      """
      Restore random number generator states from a checkpoint.

      The method is best-effort: when `checkpoint` is `None` or holds no RNG file for this process, it returns
      without touching any generator (after logging why, in the second case). The Python, NumPy and torch CPU
      generators are always restored from the file; accelerator generators are restored for every backend
      reported as available.

      Args:
          checkpoint (`str`, *optional*):
              Checkpoint directory holding `rng_state.pth` (`world_size <= 1`) or
              `rng_state_{process_index}.pth` (`world_size > 1`).
      """
      if checkpoint is None:
          return

      if self.args.world_size > 1:
          process_index = self.args.process_index
          rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
          if not os.path.isfile(rng_file):
              logger.info(
                  f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
                  "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
              )
              return
      else:
          rng_file = os.path.join(checkpoint, "rng_state.pth")
          if not os.path.isfile(rng_file):
              logger.info(
                  "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
                  "fashion, reproducibility is not guaranteed."
              )
              return

      with safe_globals():
          check_torch_load_is_safe()
          checkpoint_rng_state = torch.load(rng_file, weights_only=True)

      random.setstate(checkpoint_rng_state["python"])
      np.random.set_state(checkpoint_rng_state["numpy"])
      torch.random.set_rng_state(checkpoint_rng_state["cpu"])

      if is_torch_xla_available():
          # The "xla" entry only exists when the checkpoint was written by a run that had XLA available. Resuming
          # such a checkpoint on XLA used to fail with a `KeyError`; it is now reported like a missing RNG file.
          if "xla" in checkpoint_rng_state:
              xm.set_rng_state(checkpoint_rng_state["xla"])
          else:
              logger.info(
                  "Didn't find an XLA RNG state in the checkpoint, if you are resuming a training that wasn't "
                  "run on XLA, reproducibility is not guaranteed."
              )

      is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED
      # NOTE(review): `set_rng_state_for_device` is defined outside of this block; it presumably reads the
      # lower-cased backend name as key of `checkpoint_rng_state` - confirm against its definition.
      if torch.cuda.is_available():
          set_rng_state_for_device("CUDA", torch.cuda, checkpoint_rng_state, is_distributed)
      if is_torch_npu_available():
          set_rng_state_for_device("NPU", torch.npu, checkpoint_rng_state, is_distributed)
      if is_torch_hpu_available():
          set_rng_state_for_device("HPU", torch.hpu, checkpoint_rng_state, is_distributed)
      if is_torch_mlu_available():
          set_rng_state_for_device("MLU", torch.mlu, checkpoint_rng_state, is_distributed)
      if is_torch_musa_available():
          set_rng_state_for_device("MUSA", torch.musa, checkpoint_rng_state, is_distributed)
  def _load_optimizer_and_scheduler(self, checkpoint: str | None) -> None:
      """
      If optimizer and scheduler states exist, load them.

      With DeepSpeed only a scheduler that is not a `DeepSpeedSchedulerWrapper` is restored here. Otherwise both
      states are loaded, provided an optimizer state in one of the supported layouts and the `SCHEDULER_NAME`
      file are both present; when either is missing nothing is loaded.

      Args:
          checkpoint (`str`, *optional*):
              Checkpoint directory to read from. `None` makes the call a no-op.
      """
      if checkpoint is None:
          return

      optimizer_path = os.path.join(checkpoint, OPTIMIZER_NAME)
      scheduler_path = os.path.join(checkpoint, SCHEDULER_NAME)

      if self.is_deepspeed_enabled:
          # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
          if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper):
              with warnings.catch_warnings(record=True) as caught_warnings:
                  check_torch_load_is_safe()
                  self.lr_scheduler.load_state_dict(torch.load(scheduler_path, weights_only=True))
              reissue_pt_warnings(caught_warnings)
          return

      # Locate the optimizer state; its layout on disk depends on the backend that wrote it.
      if self.is_fsdp_xla_v1_enabled:
          # One `rank{i}-of-{world_size}-...` file per process.
          optimizer_found = glob.glob(os.path.join(checkpoint, f"rank*-of-{self.args.world_size}-{OPTIMIZER_NAME}"))
      elif is_sagemaker_mp_enabled():
          optimizer_found = glob.glob(optimizer_path + "_*")
      else:
          # A single file under either name, or a sub-directory whose name contains the stem of
          # OPTIMIZER_NAME_BIN.
          optimizer_found = (
              os.path.isfile(optimizer_path)
              or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN))
              or (
                  os.path.isdir(checkpoint)
                  and any(
                      OPTIMIZER_NAME_BIN.split(".")[0] in folder_name
                      for folder_name in os.listdir(checkpoint)
                      if os.path.isdir(os.path.join(checkpoint, folder_name))
                  )
              )
          )

      if not (optimizer_found and os.path.isfile(scheduler_path)):
          return

      if is_torch_xla_available():
          # On TPU we have to take some extra precautions to properly load the states on the right device.
          check_torch_load_is_safe()
          if self.is_fsdp_xla_v1_enabled:
              rank_file = f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
              rank_state = torch.load(os.path.join(checkpoint, rank_file), map_location="cpu", weights_only=True)
              # We only need `optimizer` when resuming from checkpoint
              optimizer_state = rank_state["optimizer"]
          else:
              optimizer_state = torch.load(optimizer_path, map_location="cpu", weights_only=True)

          with warnings.catch_warnings(record=True) as caught_warnings:
              check_torch_load_is_safe()
              lr_scheduler_state = torch.load(scheduler_path, map_location="cpu", weights_only=True)
          reissue_pt_warnings(caught_warnings)

          xm.send_cpu_data_to_device(optimizer_state, self.args.device)
          xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device)

          self.optimizer.load_state_dict(optimizer_state)
          self.lr_scheduler.load_state_dict(lr_scheduler_state)
          return

      if is_sagemaker_mp_enabled():
          # The optimizer state is loaded from a hook registered on the wrapped model rather than right away.
          def opt_load_hook(mod, opt):
              opt.load_state_dict(smp.load(optimizer_path, partial=True))

          self.model_wrapped.register_post_step_hook(opt_load_hook)
      elif self.is_fsdp_enabled:
          load_fsdp_optimizer(
              self.accelerator.state.fsdp_plugin,
              self.accelerator,
              self.optimizer,
              self.model,
              checkpoint,
              **get_fsdp_ckpt_kwargs(),
          )
      else:
          # We use the CPU when training on one GPU to avoid OOM for GPU RAM when training big models.
          # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more
          # likely to get OOM on CPU (since we load num_gpu times the optimizer state
          map_location = self.args.device if self.args.world_size > 1 else "cpu"
          check_torch_load_is_safe()
          self.optimizer.load_state_dict(torch.load(optimizer_path, map_location=map_location, weights_only=True))

      with warnings.catch_warnings(record=True) as caught_warnings:
          check_torch_load_is_safe()
          self.lr_scheduler.load_state_dict(torch.load(scheduler_path, weights_only=True))
      reissue_pt_warnings(caught_warnings)
  def _load_scaler(self, checkpoint: str | None) -> None:
      """
      If scaler state exists, load it.

      Nothing happens when `checkpoint` is `None` or when it holds no `SCALER_NAME` file. When the file exists
      but the current accelerator has no gradient scaler (no `scaler` attribute, or `None`), the stored state is
      skipped with a warning. This mirrors the guard in `_save_scaler`; the state was previously loaded
      unconditionally, which failed with an `AttributeError` on `None` in that situation.

      Args:
          checkpoint (`str`, *optional*):
              Checkpoint directory to read the scaler state from.
      """
      if checkpoint is None:
          return

      scaler_path = os.path.join(checkpoint, SCALER_NAME)
      if not os.path.isfile(scaler_path):
          return

      try:
          scaler = self.accelerator.scaler
      except AttributeError:
          scaler = None
      if scaler is None:
          logger.warning(
              f"Found a gradient scaler state at {scaler_path} but the current run does not use a gradient "
              "scaler, the saved state is ignored."
          )
          return

      if is_torch_xla_available():
          # On TPU we have to take some extra precautions to properly load the states on the right device.
          with warnings.catch_warnings(record=True) as caught_warnings:
              check_torch_load_is_safe()
              scaler_state = torch.load(scaler_path, map_location="cpu", weights_only=True)
          reissue_pt_warnings(caught_warnings)
          xm.send_cpu_data_to_device(scaler_state, self.args.device)
          scaler.load_state_dict(scaler_state)
      else:
          with warnings.catch_warnings(record=True) as caught_warnings:
              check_torch_load_is_safe()
              scaler.load_state_dict(torch.load(scaler_path, weights_only=True))
          reissue_pt_warnings(caught_warnings)
  def _load_callback_state(self) -> None:
      """
      If callback states exist and were passed in, restore their states if enabled.

      Does nothing unless `args.restore_callback_states_from_checkpoint` is set. Each entry of
      `self.state.stateful_callbacks` is matched by class name against the registered callbacks (plus
      `self.control`). Every match is rebuilt from its stored `args`, gets its stored `attributes` assigned and
      replaces the existing instance. Stored names without a match are reported in a single warning.
      """
      if not self.args.restore_callback_states_from_checkpoint:
          return

      # Callback states are stored in stateful_callbacks
      missing_names = []
      restored_callbacks = []
      candidates = self.callback_handler.callbacks + [self.control]

      for stored_name, stored_states in self.state.stateful_callbacks.items():
          if not isinstance(stored_states, list):
              stored_states = [stored_states]

          # We can load/restore from multiple callbacks of the same type.
          same_type = [candidate for candidate in candidates if candidate.__class__.__name__ == stored_name]
          if not same_type:
              missing_names.append(stored_name)
              continue

          for existing, stored_state in zip(same_type, stored_states):
              init_kwargs = stored_state.get("args", {})
              attribute_values = stored_state.get("attributes", {})
              rebuilt = type(existing)(**init_kwargs)
              for attribute_name, attribute_value in attribute_values.items():
                  setattr(rebuilt, attribute_name, attribute_value)

              if isinstance(existing, TrainerControl):
                  # Specifically for restoring the `control` state
                  self.control = rebuilt
              else:
                  restored_callbacks.append(rebuilt)
                  # We remove the existing callback and add it to the list of new callbacks
                  self.callback_handler.remove_callback(type(rebuilt))
          logger.info("Continuing training from checkpoint, restoring any callbacks that were passed in")

      if len(missing_names) > 0:
          logger.warning(
              f"Checkpoint included callbacks not included in current configuration. Ignoring. ({', '.join(missing_names)})"
          )

      for rebuilt in restored_callbacks:
          self.callback_handler.add_callback(rebuilt)
  def _issue_warnings_after_load(self, load_result: Any) -> None:
      """
      Log warnings for missing or unexpected keys after loading a checkpoint.

      Missing keys that are exactly the model's `_keys_to_ignore_on_save` are not reported; `tie_weights()` is
      called on the model instead.

      Args:
          load_result (`Any`):
              Object exposing `missing_keys` and `unexpected_keys`.
      """
      missing_keys = load_result.missing_keys
      if len(missing_keys) > 0:
          ignorable_keys = self.model._keys_to_ignore_on_save
          only_ignorable = ignorable_keys is not None and set(missing_keys) == set(ignorable_keys)
          if only_ignorable:
              self.model.tie_weights()
          else:
              logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")

      if len(load_result.unexpected_keys) > 0:
          logger.warning(
              f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
          )
  3242. # ---- Saving & Serialization ----
  def save_model(self, output_dir: str | None = None, _internal_call: bool = False) -> None:
      """
      Will save the model, so you can reload it using `from_pretrained()`.

      Will only save from the main process.

      Args:
          output_dir (`str`, *optional*):
              Target directory. Defaults to `self.args.output_dir`.
          _internal_call (`bool`, *optional*, defaults to `False`):
              When `True`, the push to the Hub that otherwise follows the save (if `args.push_to_hub` is set) is
              skipped.
      """
      if output_dir is None:
          output_dir = self.args.output_dir

      if is_torch_xla_available():
          save_tpu_checkpoint(
              self.model, self.args, self.accelerator, self.processing_class, self.is_fsdp_xla_v1_enabled, output_dir
          )
      elif is_sagemaker_mp_enabled():
          # Calling the state_dict needs to be done on the wrapped model and on all processes.
          os.makedirs(output_dir, exist_ok=True)
          state_dict = self.model_wrapped.state_dict()
          if self.args.should_save:
              self._save(output_dir, state_dict=state_dict)
          Path(os.path.join(output_dir, "user_content.pt")).touch()
      elif self.is_fsdp_enabled:
          # Only the `FULL_STATE_DICT` flavour is saved from here.
          fsdp_state_dict_type = str(self.accelerator.state.fsdp_plugin.state_dict_type)
          if "FULL_STATE_DICT" in fsdp_state_dict_type:
              state_dict = self.accelerator.get_state_dict(self.model)
              if self.args.should_save:
                  self._save(output_dir, state_dict=state_dict)
      elif self.is_deepspeed_enabled:
          try:
              save_checkpoint_params = inspect.signature(self.model_wrapped.save_checkpoint).parameters
              accept_exclude_frozen_parameters = "exclude_frozen_parameters" in save_checkpoint_params
              zero_stage = self.deepspeed.config.get("zero_optimization", {}).get("stage", None)
              zero3_sharding = zero_stage == 3
              if accept_exclude_frozen_parameters and _is_peft_model(self.model) and zero3_sharding:
                  # When using PEFT with DeepSpeed ZeRO Stage 3,
                  # we do not need to load the frozen parameters
                  state_dict = self.deepspeed._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters=True)
              else:
                  state_dict = self.accelerator.get_state_dict(self.deepspeed)
              if self.args.should_save:
                  self._save(output_dir, state_dict=state_dict)
          except ValueError:
              # Fallback: write the full DeepSpeed checkpoint instead of a consolidated state dict.
              logger.warning(
                  " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use"
                  " zero_to_fp32.py to recover weights"
              )
              if self.args.should_save:
                  self._save(output_dir, state_dict={})
              # remove the dummy state_dict
              remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
              self.model_wrapped.save_checkpoint(output_dir)
      elif self.args.should_save:
          self._save(output_dir)

      # Push to the Hub when `save_model` is called by the user.
      if self.args.push_to_hub and not _internal_call:
          self.push_to_hub(commit_message="Model save", revision=self.args.hub_revision)
  def _save(self, output_dir: str | None = None, state_dict: dict | None = None) -> None:
      """
      Save model weights, processing class (or collator tokenizer) and training arguments to `output_dir`.

      Args:
          output_dir (`str`, *optional*):
              Target directory, created if needed. Defaults to `self.args.output_dir`.
          state_dict (`dict`, *optional*):
              Weights to write. When `None` and the model is not one of the supported classes, the model's own
              `state_dict()` is used.
      """
      # If we are executing this function, we are the process zero, so we don't check for that.
      if output_dir is None:
          output_dir = self.args.output_dir
      os.makedirs(output_dir, exist_ok=True)
      logger.info(f"Saving model checkpoint to {output_dir}")

      if is_peft_available():
          supported_classes = (PreTrainedModel, PeftModel)
      else:
          supported_classes = (PreTrainedModel,)

      # Save a trained model and configuration using `save_pretrained()`.
      # They can then be reloaded using `from_pretrained()`
      if isinstance(self.model, supported_classes):
          self.model.save_pretrained(output_dir, state_dict=state_dict)
      else:
          if state_dict is None:
              state_dict = self.model.state_dict()

          # The model may merely be wrapped: look at what is underneath before falling back to raw weights.
          unwrapped_model = self.accelerator.unwrap_model(self.model, keep_torch_compile=False)
          if isinstance(unwrapped_model, supported_classes):
              unwrapped_model.save_pretrained(output_dir, state_dict=state_dict)
          else:
              logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
              weights_path = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
              safetensors.torch.save_file(state_dict, weights_path, metadata={"format": "pt"})

      if self.processing_class is not None:
          self.processing_class.save_pretrained(output_dir)
      else:
          collator_tokenizer = getattr(self.data_collator, "tokenizer", None)
          if collator_tokenizer is not None:
              logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`")
              collator_tokenizer.save_pretrained(output_dir)

      # Good practice: save your training arguments together with the trained model
      torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
  3329. # ---- Logging & Metrics ----
  3330. def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
  3331. """
  3332. Log `logs` on the various objects watching training.
  3333. Subclass and override this method to inject custom behavior.
  3334. Args:
  3335. logs (`dict[str, float]`):
  3336. The values to log.
  3337. start_time (`Optional[float]`):
  3338. The start of training.
  3339. """
  3340. if self.state.epoch is not None:
  3341. logs["epoch"] = self.state.epoch
  3342. if self.args.include_num_input_tokens_seen != "no":
  3343. logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
  3344. if start_time is not None:
  3345. current_session_num_tokens = self.state.num_input_tokens_seen - self._initial_num_input_tokens_seen
  3346. logs.update(speed_metrics("train", start_time, num_tokens=current_session_num_tokens))
  3347. output = {**logs, "step": self.state.global_step}
  3348. self.state.log_history.append(output)
  3349. self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
  def store_flos(self) -> None:
      """
      Add the floating-point operations counted since the last call to `self.state.total_flos`.

      In `ParallelMode.DISTRIBUTED` the amount added is the sum of what `distributed_broadcast_scalars` returns
      for this process' counter; otherwise the local counter is added as is. `self.current_flos` is reset to 0
      in both cases.
      """
      if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
          gathered_flos = distributed_broadcast_scalars([self.current_flos], device=self.args.device)
          flos_to_add = gathered_flos.sum().item()
      else:
          flos_to_add = self.current_flos

      self.state.total_flos += flos_to_add
      self.current_flos = 0
  def floating_point_ops(self, inputs: dict[str, torch.Tensor | Any]) -> int:
      """
      Estimate the number of floating point operations of one forward + backward pass over `inputs`.

      The estimate is `6 * (number of elements in the model's main input) * (number of non-embedding parameters)`.
      It is only available when the model exposes `num_parameters` and its main input (named by
      `model.main_input_name`, `"input_ids"` when the model does not define it) is present in `inputs`. For other
      models, either implement such a method in the model or subclass and override this method.

      Args:
          inputs (`dict[str, torch.Tensor | Any]`):
              The inputs and targets of the model.

      Returns:
          `int`: The number of floating-point operations, or `0` when it cannot be estimated.
      """
      model = self.model
      main_input = getattr(model, "main_input_name", "input_ids")
      if main_input not in inputs or not hasattr(model, "num_parameters"):
          return 0
      return 6 * inputs[main_input].numel() * model.num_parameters(exclude_embeddings=True)
  3376. # ---- Hub Integration ----
  def init_hf_repo(self, token: str | None = None) -> None:
      """
      Create the Hub repository for this run (reusing it if it already exists) and store its id.

      The repository is named `self.args.hub_model_id`, or after the output directory when that is unset. The
      resulting repo id is stored in `self.hub_model_id` and `self.push_in_progress` is reset to `None`. Processes
      other than world process zero return without doing anything.

      Args:
          token (`str`, *optional*):
              Token used to create the repository. Defaults to `self.args.hub_token`.
      """
      # Only on process zero
      if not self.is_world_process_zero():
          return

      args = self.args
      repo_name = Path(args.output_dir).absolute().name if args.hub_model_id is None else args.hub_model_id
      if token is None:
          token = args.hub_token

      repo_url = create_repo(repo_name, token=token, private=args.hub_private_repo, exist_ok=True)
      self.hub_model_id = repo_url.repo_id
      self.push_in_progress = None
  def create_model_card(
      self,
      language: str | None = None,
      license: str | None = None,
      tags: str | list[str] | None = None,
      model_name: str | None = None,
      finetuned_from: str | None = None,
      tasks: str | list[str] | None = None,
      dataset_tags: str | list[str] | None = None,
      dataset: str | list[str] | None = None,
      dataset_args: str | list[str] | None = None,
  ) -> None:
      """
      Creates a draft of a model card using the information available to the `Trainer`.

      The card is written to `README.md` in `self.args.output_dir`. If a card already exists there, its tags are
      merged into `tags` (when `tags` is given), and a card whose `library_name` is `"peft"` is refreshed through
      the model's `create_or_update_model_card` afterwards. Does nothing on processes other than world process
      zero.

      Args:
          language (`str`, *optional*):
              The language of the model (if applicable)
          license (`str`, *optional*):
              The license of the model. Will default to the license of the pretrained model used, if the original
              model given to the `Trainer` comes from a repo on the Hub.
          tags (`str` or `list[str]`, *optional*):
              Some tags to be included in the metadata of the model card. The passed list is not modified.
          model_name (`str`, *optional*):
              The name of the model.
          finetuned_from (`str`, *optional*):
              The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
              of the original model given to the `Trainer` (if it comes from the Hub).
          tasks (`str` or `list[str]`, *optional*):
              One or several task identifiers, to be included in the metadata of the model card.
          dataset_tags (`str` or `list[str]`, *optional*):
              One or several dataset tags, to be included in the metadata of the model card.
          dataset (`str` or `list[str]`, *optional*):
              One or several dataset identifiers, to be included in the metadata of the model card.
          dataset_args (`str` or `list[str]`, *optional*):
              One or several dataset arguments, to be included in the metadata of the model card.
      """
      if not self.is_world_process_zero():
          return

      model_card_filepath = os.path.join(self.args.output_dir, "README.md")
      is_peft_library = False
      if os.path.exists(model_card_filepath):
          # Parse the existing card once and reuse its metadata for both lookups.
          existing_card_data = ModelCard.load(model_card_filepath).data
          is_peft_library = existing_card_data.get("library_name") == "peft"

          # Append existing tags in `tags`
          existing_tags = existing_card_data.tags
          if tags is not None and existing_tags is not None:
              # Merge into a fresh list so the caller's `tags` object is never mutated.
              tags = [tags] if isinstance(tags, str) else list(tags)
              for tag in existing_tags:
                  if tag not in tags:
                      tags.append(tag)

      training_summary = TrainingSummary.from_trainer(
          self,
          language=language,
          license=license,
          tags=tags,
          model_name=model_name,
          finetuned_from=finetuned_from,
          tasks=tasks,
          dataset_tags=dataset_tags,
          dataset=dataset,
          dataset_args=dataset_args,
      )
      model_card = training_summary.to_model_card()
      # Write as UTF-8 explicitly: the card may contain non-ASCII text and the platform default encoding
      # (e.g. on Windows) is not guaranteed to be able to represent it.
      with open(model_card_filepath, "w", encoding="utf-8") as f:
          f.write(model_card)

      if is_peft_library:
          self.accelerator.unwrap_model(self.model).create_or_update_model_card(self.args.output_dir)
  def push_to_hub(
      self,
      commit_message: str | None = "End of training",
      blocking: bool = True,
      token: str | None = None,
      revision: str | None = None,
      **kwargs,
  ) -> CommitInfo:
      """
      Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`.

      Parameters:
          commit_message (`str`, *optional*, defaults to `"End of training"`):
              Message to commit while pushing.
          blocking (`bool`, *optional*, defaults to `True`):
              Whether the function should return only when the `git push` has finished.
          token (`str`, *optional*, defaults to `None`):
              Token with write permission to overwrite Trainer's original args.
          revision (`str`, *optional*):
              The git revision to commit from. Defaults to `self.args.hub_revision`.
          kwargs (`dict[str, Any]`, *optional*):
              Additional keyword arguments passed along to [`~Trainer.create_model_card`]. A `tags` entry is
              merged with the model's own `model_tags`; the passed list is not modified.

      Returns:
          The result of the upload (commit information) if `blocking=True`, or a `Future` object tracking the
          progress of the commit if `blocking=False`. Processes other than world process zero return `None`.
      """
      self.callback_handler.on_push_begin(self.args, self.state, self.control)

      model_name = kwargs.pop("model_name", None)
      if model_name is None and self.args.should_save:
          if self.args.hub_model_id is None:
              model_name = Path(self.args.output_dir).name
          else:
              model_name = self.args.hub_model_id.split("/")[-1]
      token = token if token is not None else self.args.hub_token

      # In case the user calls this method with args.push_to_hub = False
      if self.hub_model_id is None:
          self.init_hf_repo(token=token)

      # Needs to be executed on all processes for TPU training, but will only save on the processed determined by
      # self.args.should_save.
      self.save_model(_internal_call=True)

      # Only push from one node.
      if not self.is_world_process_zero():
          return

      # Add additional tags in the case the model has already some tags and users pass
      # "tags" argument to `push_to_hub` so that trainer automatically handles internal tags
      # from all models since Trainer does not call `model.push_to_hub`.
      model_tags = getattr(self.model, "model_tags", None)
      if model_tags is not None:
          user_tags = kwargs.get("tags")
          if user_tags is None:
              # Covers both a missing `tags` and an explicit `tags=None`.
              merged_tags = []
          elif isinstance(user_tags, str):
              merged_tags = [user_tags]
          else:
              # Copy so the list object owned by the caller is left untouched.
              merged_tags = list(user_tags)
          for model_tag in model_tags:
              if model_tag not in merged_tags:
                  merged_tags.append(model_tag)
          kwargs["tags"] = merged_tags

      self.create_model_card(model_name=model_name, **kwargs)

      if revision is None:
          revision = self.args.hub_revision

      # Wait for the current upload to be finished.
      self._finish_current_push()
      return upload_folder(
          repo_id=self.hub_model_id,
          folder_path=self.args.output_dir,
          commit_message=commit_message,
          token=token,
          run_as_future=not blocking,
          ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
          revision=revision,
      )
  def _push_from_checkpoint(self, checkpoint_folder: str) -> None:
      """
      Push model and checkpoint files to the Hub from a checkpoint folder.

      The model files found in `checkpoint_folder` are copied into `self.args.output_dir`, which is then uploaded
      as a background job. Depending on `self.args.hub_strategy`, the checkpoint folder itself is uploaded as
      well. Nothing happens on processes other than world process zero, when the hub strategy is
      `HubStrategy.END`, or when an earlier push is still running and `self.args.hub_always_push` is not set.
      """
      if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
          return
      # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True.
      if not self.args.hub_always_push:
          running_push = self.push_in_progress
          if running_push is not None and not running_push.is_done():
              return

      self.callback_handler.on_push_begin(self.args, self.state, self.control)

      output_dir = self.args.output_dir
      # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
      modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
      # Add sharded checkpoints if we have an index
      for index_file in [WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]:
          index_path = os.path.join(checkpoint_folder, index_file)
          if not os.path.isfile(index_path):
              continue
          modeling_files.append(index_file)
          with open(index_path) as f:
              index = json.load(f)
          # Several weights usually live in the same shard, so deduplicate the shard names.
          modeling_files.extend(set(index["weight_map"].values()))
      if is_peft_available():
          modeling_files += [ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME]

      for modeling_file in modeling_files:
          source_path = os.path.join(checkpoint_folder, modeling_file)
          if os.path.isfile(source_path):
              shutil.copy(source_path, os.path.join(output_dir, modeling_file))

      # Saving the processing class is fast and we don't know how many files it may have spawned, so we resave it
      # to be sure.
      if self.processing_class is not None:
          self.processing_class.save_pretrained(output_dir)
      # Same for the training arguments
      torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

      if self.args.save_strategy == SaveStrategy.STEPS:
          commit_message = f"Training in progress, step {self.state.global_step}"
      else:
          commit_message = f"Training in progress, epoch {int(self.state.epoch)}"

      push_jobs = [
          upload_folder(
              repo_id=self.hub_model_id,
              folder_path=output_dir,
              commit_message=commit_message,
              token=self.args.hub_token,
              run_as_future=True,
              ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
              revision=self.args.hub_revision,
          )
      ]

      if self.args.hub_strategy in [HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS]:
          if self.args.hub_strategy == HubStrategy.CHECKPOINT:
              path_in_repo = "last-checkpoint"
          else:
              path_in_repo = Path(checkpoint_folder).name
          push_jobs.append(
              upload_folder(
                  repo_id=self.hub_model_id,
                  folder_path=checkpoint_folder,
                  path_in_repo=path_in_repo,
                  commit_message=commit_message + ", checkpoint",
                  token=self.args.hub_token,
                  run_as_future=True,
                  revision=self.args.hub_revision,
              )
          )

      # Start a fresh tracker, or attach the new jobs to the one that is still running.
      current_push = self.push_in_progress
      if current_push is None or current_push.is_done():
          self.push_in_progress = PushInProgress(push_jobs)
      else:
          current_push.jobs.extend(push_jobs)
  def _finish_current_push(self) -> None:
      """Block until the push to the Hub that is currently running, if any, has completed."""
      # A missing attribute is treated like "no push was ever started".
      pending_push = getattr(self, "push_in_progress", None)
      if pending_push is None or pending_push.is_done():
          return
      logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.")
      pending_push.wait_until_done()
  3597. # ---- Hyperparameter Search ----
  def hyperparameter_search(
      self,
      hp_space: Callable[["optuna.Trial"], dict[str, float]] | None = None,
      compute_objective: Callable[[dict[str, float]], float] | None = None,
      n_trials: int = 20,
      direction: str | list[str] = "minimize",
      backend: str | HPSearchBackend | None = None,
      hp_name: Callable[["optuna.Trial"], str] | None = None,
      **kwargs,
  ) -> BestRun | list[BestRun]:
      """
      Launch a hyperparameter search using `optuna` or `Ray Tune`. The optimized quantity is determined
      by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
      the sum of all metrics otherwise.

      <Tip warning={true}>

      To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
      reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
      subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
      optimizer/scheduler.

      </Tip>

      Args:
          hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*):
              A function that defines the hyperparameter search space. Will default to
              [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`]
              depending on your backend.
          compute_objective (`Callable[[dict[str, float]], float]`, *optional*):
              A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
              method. Will default to [`~trainer_utils.default_compute_objective`].
          n_trials (`int`, *optional*, defaults to 20):
              The number of trial runs to test.
          direction (`str` or `list[str]`, *optional*, defaults to `"minimize"`):
              If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you
              should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
              several metrics. If it's multi objectives optimization, direction is `list[str]`, can be List of
              `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss,
              `"maximize"` when optimizing one or several metrics.
          backend (`str` or [`~training_utils.HPSearchBackend`], *optional*):
              The backend to use for hyperparameter search. Will default to optuna or Ray Tune, depending
              on which one is installed. If all are installed, will default to optuna.
          hp_name (`Callable[["optuna.Trial"], str]]`, *optional*):
              A function that defines the trial/run name. Will default to None.
          kwargs (`dict[str, Any]`, *optional*):
              Additional keyword arguments for each backend:

              - `optuna`: parameters from
                [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
                and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from
                [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize)
              - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run).
                If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available).
                If `progress_reporter` is not set in the `kwargs`,
                [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used.

      Returns:
          [`trainer_utils.BestRun` or `list[trainer_utils.BestRun]`]: All the information about the best run or best
          runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
          backend.

      Raises:
          RuntimeError: If the trainer was created without a `model_init` function.
      """
      if backend is None:
          backend = default_hp_search_backend()
      backend = HPSearchBackend(backend)
      backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]()
      backend_obj.ensure_available()

      # Validate before touching any trainer state, so a rejected call leaves the trainer unchanged.
      if self.model_init is None:
          raise RuntimeError(
              "To use hyperparameter search, you need to pass your model through a model_init function."
          )

      self.hp_search_backend = backend
      self.hp_space = backend_obj.default_hp_space if hp_space is None else hp_space
      self.hp_name = hp_name
      self.compute_objective = default_compute_objective if compute_objective is None else compute_objective

      try:
          best_run = backend_obj.run(self, n_trials, direction, **kwargs)
      finally:
          # Always leave search mode, even if the backend raises or the search is interrupted; otherwise
          # `_hp_search_setup` and `_report_to_hp_search` would keep treating later runs as search trials.
          self.hp_search_backend = None
      return best_run
  def call_model_init(self, trial: "optuna.Trial | dict[str, Any] | None" = None) -> nn.Module:
      """
      Build a fresh model by calling `self.model_init`.

      `model_init` is called without arguments when it takes none, and with `trial` when it takes exactly one.

      Raises:
          RuntimeError: If `model_init` takes more than one argument or returns `None`.
      """
      argcount = number_of_arguments(self.model_init)
      if argcount not in (0, 1):
          raise RuntimeError("model_init should have 0 or 1 argument.")

      model = self.model_init() if argcount == 0 else self.model_init(trial)
      if model is None:
          raise RuntimeError("model_init should not return None.")
      return model
  def _hp_search_setup(self, trial: "optuna.Trial | dict[str, Any] | None") -> None:
      """
      Prepare the trainer for one hyperparameter-search trial.

      Stores `trial` on the trainer, copies the trial's hyperparameters onto `self.args` (cast to the type of the
      value they replace), rebuilds the DeepSpeed configuration when DeepSpeed is enabled, and recreates the
      accelerator. Only `self._trial` is set when no search backend is active or `trial` is `None`.

      Raises:
          ValueError: If DeepSpeed is enabled but `self.args.deepspeed` is not set.
      """
      self._trial = trial

      backend = self.hp_search_backend
      if backend is None or trial is None:
          return

      if backend == HPSearchBackend.OPTUNA:
          params = self.hp_space(trial)
      elif backend == HPSearchBackend.RAY:
          params = trial
          params.pop("wandb", None)
      elif backend == HPSearchBackend.WANDB:
          params = trial

      for key, value in params.items():
          if not hasattr(self.args, key):
              logger.warning(
                  f"Trying to set {key} in the hyperparameter search but there is no corresponding field in"
                  " `TrainingArguments`."
              )
              continue
          current_value = getattr(self.args, key, None)
          # Casting value to the proper type
          setattr(self.args, key, value if current_value is None else type(current_value)(value))

      if backend == HPSearchBackend.OPTUNA:
          logger.info(f"Trial: {trial.params}")
      if backend == HPSearchBackend.WANDB:
          logger.info(f"W&B Sweep parameters: {trial}")

      if self.is_deepspeed_enabled:
          if self.args.deepspeed is None:
              raise ValueError("For sweeps with deepspeed, `args.deepspeed` must be set")

          self.accelerator.free_memory()

          # Rebuild the deepspeed config to reflect the updated training parameters
          from accelerate.utils import DeepSpeedPlugin

          from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

          ds_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
          self.args.hf_deepspeed_config = ds_config
          self.args.hf_deepspeed_config.trainer_config_process(self.args)
          self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config)

          # From 1.0 on, we need to fully wipe the DS plugin when doing sweeps.
          # Simply calling `_reset_state` is enough and doesn't need a version pin.
          AcceleratorState()._reset_state()

      # `train_batch_size` might change when using HPO https://github.com/huggingface/transformers/pull/18918
      self._train_batch_size = self.args.train_batch_size
      self.create_accelerator_and_postprocess()
  def _report_to_hp_search(
      self, trial: "optuna.Trial | dict[str, Any] | None", step: int, metrics: dict[str, float]
  ) -> None:
      """
      Report intermediate metrics to the active hyperparameter search backend.

      The objective computed from a copy of `metrics` is stored in `self.objective`. With optuna it is reported
      for single-objective studies and the trial is pruned when optuna asks for it; with Ray Tune the metrics
      (plus the objective) are reported, together with a checkpoint when `self.control.should_save` is set.
      Does nothing when no search backend is active or `trial` is `None`.

      Raises:
          optuna.TrialPruned: If optuna decides the trial should be pruned.
      """
      backend = self.hp_search_backend
      if backend is None or trial is None:
          return

      # Work on a copy so the caller's metrics are not altered by the extra "objective" entry.
      metrics = metrics.copy()
      self.objective = self.compute_objective(metrics)

      if backend == HPSearchBackend.OPTUNA:
          import optuna

          single_objective = hasattr(trial, "study") and not trial.study._is_multi_objective()
          if not single_objective:
              return
          trial.report(self.objective, step)
          if trial.should_prune():
              self.callback_handler.on_train_end(self.args, self.state, self.control)
              raise optuna.TrialPruned()
      elif backend == HPSearchBackend.RAY:
          import ray.tune

          with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
              if self.control.should_save:
                  self._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir)
                  checkpoint = ray.tune.Checkpoint.from_directory(temp_checkpoint_dir)
              else:
                  checkpoint = None
              metrics["objective"] = self.objective
              ray.tune.report(metrics, checkpoint=checkpoint)
  def _tune_save_checkpoint(self, checkpoint_dir: str) -> None:
      """
      Save a checkpoint during a Ray Tune hyperparameter search trial.

      The model is saved into a `{PREFIX_CHECKPOINT_DIR}-{global_step}` sub-folder of `checkpoint_dir`. When
      `self.args.should_save` is set, the trainer state, optimizer state and scheduler state are written there too.
      """
      output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
      self.save_model(output_dir, _internal_call=True)
      if not self.args.should_save:
          return

      # Update the `TrainerControl` state to where we are currently
      self.state.stateful_callbacks["TrainerControl"] = self.control.state()
      self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
      for stateful, filename in ((self.optimizer, OPTIMIZER_NAME), (self.lr_scheduler, SCHEDULER_NAME)):
          torch.save(stateful.state_dict(), os.path.join(output_dir, filename))
  3760. # ---- Callbacks ----
  def add_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> None:
      """
      Register an additional [`~transformers.TrainerCallback`] with this trainer.

      Args:
          callback (`type` or [`~transformers.TrainerCallback`]):
              A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
              first case, will instantiate a member of that class.
      """
      handler = self.callback_handler
      handler.add_callback(callback)
  def pop_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> TrainerCallback | None:
      """
      Detach a callback from the current list of [`~transformers.TrainerCallback`] and hand it back.

      If the callback is not found, returns `None` (and no error is raised).

      Args:
          callback (`type` or [`~transformers.TrainerCallback`]):
              A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
              first case, will pop the first member of that class found in the list of callbacks.

      Returns:
          [`~transformers.TrainerCallback`]: The callback removed, if found.
      """
      removed = self.callback_handler.pop_callback(callback)
      return removed
  def remove_callback(self, callback: type[TrainerCallback] | TrainerCallback) -> None:
      """
      Drop a callback from the current list of [`~transformers.TrainerCallback`] without returning it.

      Args:
          callback (`type` or [`~transformers.TrainerCallback`]):
              A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
              first case, will remove the first member of that class found in the list of callbacks.
      """
      handler = self.callback_handler
      handler.remove_callback(callback)
  3791. # ---- Utilities ----
  def is_local_process_zero(self) -> bool:
      """
      Tell whether this process is the main process of its own machine.

      When training in a distributed fashion on several machines, there is one such process per machine. The
      answer is derived from `self.args.local_process_index`.
      """
      local_index = self.args.local_process_index
      return local_index == 0
  def is_world_process_zero(self) -> bool:
      """
      Tell whether this process is the global main process.

      When training in a distributed fashion on several machines, this is only going to be `True` for one process.
      """
      # Special case for SageMaker ModelParallel since there process_index is dp_process_index, not the global
      # process index.
      rank = smp.rank() if is_sagemaker_mp_enabled() else self.args.process_index
      return rank == 0
  def _move_model_to_device(self, model: nn.Module, device: torch.device) -> None:
      """
      Move `model` to `device`, re-tying its weights afterwards when running on TPU.

      A model that carries an `hf_device_map` is left where it is and a warning is logged instead.
      """
      already_dispatched = getattr(model, "hf_device_map", None) is not None
      if already_dispatched:
          logger.warning(
              "The model is already on multiple devices. Skipping the move to device specified in `args`."
          )
          return
      model = model.to(device)
      # Moving a model to an XLA device disconnects the tied weights, so we have to retie them.
      running_on_tpu = self.args.parallel_mode == ParallelMode.TPU
      if running_on_tpu and hasattr(model, "tie_weights"):
          model.tie_weights()