derivatives.yaml 179 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253
  1. # Defines derivative formulas and Python signatures of methods on Variable
  2. #
  3. # Note about possibly confusing nomenclature: An 'output gradient' is the
  4. # gradient of an output of a forward function. Output gradients are used as
  5. # the inputs to backward functions. `grads` is a vector of output gradients,
  6. # and `grad == grads[0]`, in all the derivative formulas in this file.
  7. # An 'input gradient' is the gradient of an input to a forward function.
  8. # Input gradients are the outputs of backward functions, corresponding to the
  9. # input names included in the derivative formulas defined in this file.
  10. # Also, every time we talk about computing a "gradient" we actually mean computing
  11. # the vector-Jacobian product using the given 'output gradient' as the vector.
  12. #
  13. # Each entry consists of:
  14. # - A 'name', which specifies the ATen name of the function you
  15. # are defining derivatives for, and an argument specification.
  16. # - An optional 'dispatch' entry which can be used to specify
  17. # per-autograd dispatch key derivatives. If this entry is not
  18. # specified, then the gradient entries will be taken as the
  19. # default gradients (i.e. registered for every backward dispatch
  20. # key). (see _test_autograd_multiple_dispatch for an example
  21. # of how to register separate derivatives for different dispatch keys).
  22. # The list of allowed dispatch keys (in addition to 'Default' which
  23. # represents the Autograd alias key) is torchgen/model.py:AUTOGRAD_KEYS.
  24. # - One or more gradients entries, mapping differentiable input
  25. # names to a formula specifying how to compute its gradient.
  26. # Note that a single gradient entry can specify the gradient
  27. # formula for multiple input names, by specifying a key
  28. # "input1, input2" (see atan2 for an example).
  29. # - An argument can be flagged as 'non_differentiable'.
  30. # - Optional entry with key 'output_differentiability' and value a list of the
  31. # same length as the number of outputs from the forward function. The list
  32. # should contain only booleans, specifying whether each output Tensor
  33. # is differentiable.
  34. # If it is not specified for a function that returns multiple elements but
  35. # uses `grad` instead of `grads[idx]`, then all but the first output will
  36. # be marked as non-differentiable.
  37. # If none of the outputs is differentiable, you can also add the function
  38. # name to `gen_variable_type.py`'s `DONT_REQUIRE_DERIVATIVE` list.
  39. #
  40. # There are two cases for Tensor and TensorList arguments here:
  41. # - If that argument is differentiable, in the sense that a gradient with respect
  42. # to that argument could exist, you should either:
  43. # - Specify the formula for that gradient
  44. # - Specify not_implemented("function_name") as a formula to say that this is not
  45. # implemented yet (but might be in the future and the user can request that on an issue)
  46. # - If that argument is not differentiable, because it is not a floating point dtype or the
  47. # function is not differentiable with respect to that argument, for
  48. # example, you should either:
  49. # - Not specify any formula for this argument
  50. # - Specify explicitly that this argument is "non_differentiable". Note that in this case,
  51. # we trust you that this argument will never have requires_grad=True and it will be silently
  52. # ignored if it does.
  53. #
  54. # If a function has out-of-place and in-place variants, then the derivative
  55. # definition for the in-place variant is optional. It will default to the
  56. # definition for the out-of-place variant. Note that _out variants are never
  57. # differentiable.
  58. #
  59. # Gradient expressions are standard C++ expressions operating on ATen
  60. # variables. In a gradient expression, the following variables/functions
  61. # are in scope:
  62. #
  63. # - 'grad', the gradient of the output (often spelled grad_output
  64. # in Python) which we are going to left-multiply.
  65. #
  66. # When a function returns multiple *differentiable* outputs,
  67. # you can refer to the gradient of each output using 'grads',
  68. # e.g., 'grads[0]', 'grads[1]'.
  69. #
  70. # When a function returns multiple *differentiable* outputs that
  71. # are named, you can refer to the gradient of each output using
  72. # 'grad_{name}', e.g., 'grad_x', 'grad_y'.
  73. #
  74. # When a function returns *one* differentiable output (the
  75. # first output) and some more nondifferentiable outputs,
  76. # you MUST refer to the gradient of the differentiable output with
  77. # 'grad' (this case is special-cased in our code generation).
  78. #
  79. # Note that the number of differentiable outputs can be modified by the
  80. # 'output_differentiability' entry (see above).
  81. #
  82. # Across a differentiable function's derivatives set, it is not
  83. # permitted to mix the use of "grad", "grads", and
  84. # "grad_{name}". You must be consistent for that differentiable
  85. # function.
  86. #
  87. # - Any of the input arguments, tensor or non-tensor, including
  88. # argument names that only appear in Declarations.yaml, e.g. 'output'.
  89. #
  90. # - 'result', representing the result of evaluating the forward
  91. # expression for ATen native function declarations. If the forward
  92. # expression outputs a tuple, use 'resultX' instead to access the
  93. # X-th entry
  94. #
  95. # - 'grad_input_mask', a std::array<bool, n>, specifies which input
  96. # gradients are actually needed. For example, in the entry
  97. # `input0, input1: foo(grad_input_mask)`, `grad_input_mask` is a size
  98. # two array, where `grad_input_mask[0]` is true if `input0` requires
  99. # grad, and `grad_input_mask[1]` is true if `input1` requires grad.
  100. #
  101. # (NB: if your function computes gradient for a list of tensors,
  102. # the `grad_input_mask` will only have a single entry for the list
  103. # specifying if either zero or at least one tensor from the list requires
  104. # grad. If we want to support more fine-grained signalling,
  105. # we'll need some alternate variable which is not a std::array)
  106. #
  107. # - 'retain_variables', a bool which is true if a user has specified
  108. # that saved variables should be retained in case the backwards is
  109. # run again later. This allows an optimization where we can
  110. # destroy saved buffers if we know variables are not going to be retained,
  111. # e.g., it is used by _cudnn_rnn
  112. #
  113. # - `wrap_opt_if`, is a 2-argument function that accepts a tensor
  114. # variable and a boolean condition that dictates whether to save that
  115. # variable in a graph. The result of this function is `std::optional<Tensor>`,
  116. # and it is `::std::nullopt` when the condition evaluates to `false`,
  117. # otherwise it is the variable wrapped in `std::optional<Tensor>`.
  118. # For example, wrap_opt_if(var_0, grad_input_mask[1] || grad_input_mask[2])
  119. # would mean that `var_0` is saved as long as the second (grad_input_mask[1])
  120. # or the third (grad_input_mask[2]) argument requires gradients.
  121. # Another interpretation of this expression would read as `var_0` is needed
  122. # in the backward computation of the second or the third argument.
  123. # NOTE: the usage of `var_i.requires_grad()` in the conditional expression
  124. # is not supported, use `grad_input_mask[i]` instead.
  125. # NOTE: `wrap_opt_if` could be used to prevent saving redundant variables
  126. # with multi-output backward formulas.
  127. # See https://github.com/pytorch/pytorch/issues/97575 for more details
  128. # on the issue.
  129. #
  130. # If you need a complex expression, e.g., with local variables,
  131. # write a _backward function in torch/csrc/autograd/FunctionsManual.cpp
  132. # and invoke it from here. By the way, go read
  133. # https://github.com/zdevito/ATen/issues/163; this describes an
  134. # important hazard that occurs when porting backwards from Python to C++
  135. #
  136. # Double backwards gradient expressions can be somewhat confusing;
  137. # the most important thing to remember is: (1) you need to define a
  138. # derivative formula for every input, including inputs named things
  139. # like 'grad_output', and (2) the gradient to multiply with is always
  140. # called 'grad' (even though it really is a grad-grad).
  141. #
  142. # You can also add a forward derivative definition by defining a formula for
  143. # a returned value (in general "result" if the name is not specified). This
  144. # formula works the same way as the backward one and advanced implementations
  145. # should also be placed in the FunctionsManual file.
  146. # This formula should compute a single Jacobian vector product using the (primal)
  147. # value of the argument "foo_p", its forward grad "foo_t" and the result of the
  148. # function as "result".
  149. # Note that the forward derivative can be automatically generated in two cases:
  150. # - if your function is linear (NOT affine or multi-linear), then you can
  151. # specify so by just using the string "auto_linear" for the formula.
  152. # - if your function is applied element wise (and has a single input), you
  153. # can specify so by just using the string "auto_element_wise" for the formula.
  154. #
  155. # Note that to avoid unpacking overhead, functions taking TensorList as inputs
  156. # will always have their forward grad formula called. This function is responsible
  157. # for checking if any computation is needed and should return an undefined Tensor when
  158. # there is nothing to do. You can check "cat_forward" for a full example.
  159. #
  160. # NB: There are a number of gradient definitions in here which are bogus
  161. # (implemented using zeros_like). These gradients are (hopefully) not
  162. # used by our frontend. You MUST check the frontend code; search for
  163. # OpName.apply to see if it's still using a legacy Python style API.
  164. #
  165. # Note: Returning views.
  166. # The following cases exist:
  167. # - If a function returns no view, it can have arbitrary outputs.
  168. # - If a function returns at least one Tensor that is a differentiable view
  169. # of one of its inputs:
  170. # - If there is only one differentiable output, this Tensor is marked as a
  171. # differentiable view. (alias or transpose for example)
  172. # - If there is more than one differentiable output, by default all the views are
  173. # marked as differentiable views and created with allow_rebase_history=false,
  174. # meaning that any inplace operation on them will raise an error. (unbind for example)
  175. #
  176. # Notes about undefined output gradients:
  177. # All backward functions must support all combinations of undefined output
  178. # gradient Tensors, where `grads[i].defined() == false`. Depending on the
  179. # number of input and output grads your derivative formula uses, code
  180. # generation may automatically add some level of undefined grad support,
  181. # according to these three cases:
  182. #
  183. # * 1 input grad and 1 output grad:
  184. # Complete undefined grad support is automatically added, so you
  185. # shouldn't have to think about it, unless there is a bug in the code
  186. # generation.
  187. #
  188. # * 1 input grad and multiple output grads:
  189. # Undefined grad support is automatically added ONLY in the case where
  190. # all output grads are undefined. You will have to add explicit support
  191. # for cases where a subset of output grads is undefined.
  192. #
  193. # * multiple input grads:
  194. # No automatic support, so you will need to add it.
  195. #
  196. # If your derivative formula uses more than one output grad, it is usually
  197. # preferable to add undefined grad support in the backward function itself
  198. # (if you're using one), rather than in the derivative formula in this file.
  199. #
  200. # Undefined Tensors are created with the default constructor `at::Tensor()`.
  201. # It is an efficient way to represent a Tensor filled with zeros because
  202. # the Tensor holds no sizing information and no Storage data is allocated.
  203. # But consequently, Tensor operations cannot be performed on them.
  204. # Therefore, your backward function should treat an undefined output grad as
  205. # a zero, and it needs to be a special case.
  206. #
  207. # If all output grads are undefined, then it should be correct for the
  208. # backward function to return undefined input grads. Since we use the chain
  209. # rule, output grads equal to zero should result in input grads equal to zero,
  210. # unless there is some rare special case.
  211. #
  212. # If a subset of output grads is undefined, then it may be acceptable for
  213. # the backward function to return undefined input grads--it depends on the
  214. # specific function, so you'll have to determine that yourself. If returning
  215. # an undefined Tensor is correct for a given input grad, it is also logically
  216. # correct to return a defined grad full of zeros, but that would not be
  217. # preferable since it would be less efficient.
  218. #
  219. # NB: The parameter names here MUST be consistent with the parameter names
  220. # in native_functions.yaml
  221. - name: abs(Tensor self) -> Tensor
  222. self: grad * self.sgn()
  223. result: handle_r_to_c(result.scalar_type(), self_t.conj() * self_p.sgn())
  224. - name: acos(Tensor self) -> Tensor
  225. self: grad * -((-self * self + 1).rsqrt()).conj()
  226. result: auto_element_wise
  227. - name: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  228. self: handle_r_to_c(self.scalar_type(), grad)
  229. other: handle_r_to_c(other.scalar_type(), maybe_multiply(grad, alpha.conj()))
  230. result: self_t + maybe_multiply(other_t, alpha)
  231. - name: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  232. self: handle_r_to_c(self.scalar_type(), grad)
  233. result: self_t.clone()
  234. - name: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  235. self: maybe_multiply(grad, beta.conj())
  236. batch1: maybe_multiply(grad.unsqueeze(0).expand_symint({ batch1.sym_size(0), batch1.sym_size(1), batch2.sym_size(2) }).bmm(batch2.transpose(1, 2).conj()), alpha.conj())
  237. batch2: maybe_multiply(batch1.transpose(1, 2).conj().bmm(grad.unsqueeze(0).expand_symint({ batch1.sym_size(0), batch1.sym_size(1), batch2.sym_size(2) })), alpha.conj())
  238. result: maybe_multiply(self_t, beta) + maybe_multiply(batch1_t.bmm(batch2_p).sum(0), alpha) + maybe_multiply(batch1_p.bmm(batch2_t).sum(0), alpha)
  239. - name: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  240. self: handle_r_to_c(self.scalar_type(), grad)
  241. tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (value / tensor2).conj())
  242. tensor2: handle_r_to_c(tensor2.scalar_type(), -grad * (value * tensor1 / (tensor2 * tensor2)).conj())
  243. result: self_t + maybe_multiply(tensor1_t / tensor2_p, value) - maybe_multiply(tensor2_t * (tensor1_p / tensor2_p) / tensor2_p, value)
  244. - name: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
  245. self: handle_r_to_c(self.scalar_type(), grad)
  246. tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (tensor2 * value).conj())
  247. tensor2: handle_r_to_c(tensor2.scalar_type(), grad * (tensor1 * value).conj())
  248. result: self_t + maybe_multiply(tensor1_t * tensor2_p, value) + maybe_multiply(tensor2_t * tensor1_p, value)
  249. - name: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  250. self: maybe_multiply(grad, beta.conj())
  251. mat1: mm_mat1_backward(grad, mat2, mat1.sym_sizes(), mat1.sym_strides(), mat1.layout(), alpha)
  252. mat2: mm_mat2_backward(grad, mat1, mat2.sym_sizes(), mat2.sym_strides(), mat2.layout(), alpha)
  253. result: maybe_multiply(self_t, beta) + maybe_multiply(mat1_t.mm(mat2_p), alpha) + maybe_multiply(mat1_p.mm(mat2_t), alpha)
  254. - name: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  255. self: maybe_multiply(grad, beta)
  256. mat1: mm_mat1_sparse_backward(grad, mat1, mat2, alpha)
  257. mat2: mm_mat2_backward(grad, mat1, mat2.sym_sizes(), mat2.sym_strides(), mat2.layout(), alpha)
  258. - name: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  259. self: maybe_multiply(grad, beta.conj())
  260. mat: maybe_multiply(grad.ger(vec.conj()), alpha.conj())
  261. vec: maybe_multiply(mat.t().conj().mv(grad), alpha.conj())
  262. result: maybe_multiply(self_t, beta) + maybe_multiply(mat_t.mv(vec_p), alpha) + maybe_multiply(mat_p.mv(vec_t), alpha)
  263. - name: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  264. self: maybe_multiply(grad, beta.conj())
  265. vec1: maybe_multiply(grad.mv(vec2.conj()), alpha.conj())
  266. vec2: maybe_multiply(grad.t().mv(vec1.conj()), alpha.conj())
  267. result: maybe_multiply(self_t, beta) + maybe_multiply(vec1_t.outer(vec2_p), alpha) + maybe_multiply(vec1_p.outer(vec2_t), alpha)
  268. - name: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
  269. theta: affine_grid_generator_backward_symint(grad, size, align_corners)
  270. result: auto_linear
  271. - name: alias(Tensor(a) self) -> Tensor(a)
  272. self: grad
  273. result: self_t
  274. - name: angle(Tensor self) -> Tensor
  275. self: angle_backward(grad, self)
  276. result: handle_r_to_c(result.scalar_type(), angle_backward(self_t.conj(), self_p).conj())
  277. # The any()/all() entries below are necessary because TensorIterator doesn't work on
  278. # Variables (codegen does not unwrap the input Tensor for all() and any()).
  279. - name: any(Tensor self) -> Tensor
  280. output_differentiability: [False]
  281. - name: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
  282. output_differentiability: [False]
  283. - name: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
  284. output_differentiability: [False]
  285. - name: _is_all_true(Tensor self) -> Tensor
  286. self: non_differentiable
  287. - name: _is_any_true(Tensor self) -> Tensor
  288. self: non_differentiable
  289. - name: all(Tensor self) -> Tensor
  290. output_differentiability: [False]
  291. - name: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
  292. output_differentiability: [False]
  293. - name: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
  294. output_differentiability: [False]
  295. - name: acosh(Tensor self) -> Tensor
  296. # Save one rsqrt in the real case by using the fact that, for x and y real and positive, sqrt(x*y) = sqrt(x)*sqrt(y) (not true in the complex case)
  297. self: "self.is_complex() ? grad * ((self + 1).rsqrt() * (self - 1).rsqrt()).conj() : grad * (self * self - 1).rsqrt()"
  298. result: auto_element_wise
  299. - name: acosh_(Tensor(a!) self) -> Tensor(a!)
  300. self: not_implemented("inplace version of acosh")
  301. - name: asinh(Tensor self) -> Tensor
  302. self: grad * (self.pow(2) + 1).rsqrt().conj()
  303. result: auto_element_wise
  304. - name: asinh_(Tensor(a!) self) -> Tensor(a!)
  305. self: not_implemented("inplace version of asinh")
  306. - name: atanh(Tensor self) -> Tensor
  307. self: grad * 1 / (1 - self.pow(2)).conj()
  308. result: auto_element_wise
  309. - name: atanh_(Tensor(a!) self) -> Tensor(a!)
  310. self: not_implemented("inplace version of atanh")
  # as_strided and its in-place variant share one backward formula:
  # as_strided_backward receives grad, a TensorGeometry built from self, and the
  # size / stride / storage_offset arguments of the forward call.
  311. - name: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
  312. self: as_strided_backward(grad, TensorGeometry(self), size, stride, storage_offset)
  313. result: auto_linear
  314. - name: as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
  315. self: as_strided_backward(grad, TensorGeometry(self), size, stride, storage_offset)
  316. result: auto_linear
  # Inverse trigonometric functions.
  # asin scales grad by conj(rsqrt(1 - x^2)); atan divides grad by conj(x^2 + 1).
  # atan2 computes both input gradients in atan2_backward (selected by grad_input_mask);
  # its result: formula is (other * d_self - self * d_other) / (self^2 + other^2).
  317. - name: asin(Tensor self) -> Tensor
  318. self: grad * (-self * self + 1).rsqrt().conj()
  319. result: auto_element_wise
  320. - name: atan(Tensor self) -> Tensor
  321. self: grad / (self * self + 1).conj()
  322. result: auto_element_wise
  323. - name: atan2(Tensor self, Tensor other) -> Tensor
  324. self, other: atan2_backward(grad, self, other, grad_input_mask)
  325. result: (-self_p * other_t + other_p * self_t) / (self_p.pow(2) + other_p.pow(2))
  # baddbmm: the gradient for self is grad scaled by conj(beta); each batch operand
  # receives grad batch-multiplied with the conjugate transpose (dims 1 and 2) of the
  # other operand, scaled by conj(alpha). The result: formula sums the beta-scaled
  # self_t term and the two alpha-scaled product-rule terms.
  326. - name: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  327. self: maybe_multiply(grad, beta.conj())
  328. batch1: maybe_multiply(grad.bmm(batch2.transpose(1, 2).conj()), alpha.conj())
  329. batch2: maybe_multiply(batch1.transpose(1, 2).conj().bmm(grad), alpha.conj())
  330. result: maybe_multiply(self_t, beta) + maybe_multiply(batch1_t.bmm(batch2_p), alpha) + maybe_multiply(batch1_p.bmm(batch2_t), alpha)
  # Bernoulli sampling: every input gradient is a zero tensor (zeros_like), and the
  # in-place overloads zero the tangent in place via self_t.zero_().
  331. - name: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor
  332. self: zeros_like(grad)
  333. result: auto_element_wise
  334. - name: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
  335. self: zeros_like(grad)
  336. p: zeros_like(p)
  337. result: self_t.zero_()
  338. - name: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
  339. self: zeros_like(grad)
  340. result: self_t.zero_()
  # bmm: each operand's gradient is grad batch-multiplied with the conjugate transpose
  # of the other operand; result: is the product rule over the two operands.
  # matmul delegates both gradients to matmul_backward (no result: formula is given here).
  # cat passes the per-input sizes and scalar types to cat_tensors_backward.
  341. - name: bmm(Tensor self, Tensor mat2) -> Tensor
  342. self: grad.bmm(mat2.transpose(1, 2).conj())
  343. mat2: self.transpose(1, 2).conj().bmm(grad)
  344. result: self_t.bmm(mat2_p) + self_p.bmm(mat2_t)
  345. - name: matmul(Tensor self, Tensor other) -> Tensor
  346. self, other: matmul_backward(grad, self, other, grad_input_mask)
  347. - name: cat(Tensor[] tensors, int dim=0) -> Tensor
  348. tensors: cat_tensors_backward(grad, to_args_sizes_symint(tensors), to_args_scalartypes(tensors), dim)
  349. result: cat_jvp(tensors, dim)
  # cauchy_ (in-place random fill) and ceil both return an all-zero gradient for self;
  # cauchy_ additionally zeroes the tangent in place.
  350. - name: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
  351. self: zeros_like(grad)
  352. result: self_t.zero_()
  353. - name: ceil(Tensor self) -> Tensor
  354. self: zeros_like(grad)
  355. result: auto_element_wise
  # Cholesky-related ops, with chunk listed in between.
  # cholesky and linalg_cholesky_ex share cholesky_backward (fed the factor: result / L).
  # chunk uses a dispatch table: the Default key is not_implemented, and only
  # AutogradNestedTensor has a real backward (chunk_backward_nested, which takes grads).
  # cholesky_solve and cholesky_inverse each have dedicated *_backward and *_jvp helpers.
  356. - name: cholesky(Tensor self, bool upper=False) -> Tensor
  357. self: cholesky_backward(grad, upper, result)
  358. - name: chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]
  359. dispatch:
  360. Default:
  361. # the default case will use the CompositeImplicitAutograd
  362. self: not_implemented("chunk")
  363. AutogradNestedTensor:
  364. self: chunk_backward_nested(grads, self, chunks, dim)
  365. - name: linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)
  366. self: cholesky_backward(grad, upper, L)
  367. L: cholesky_jvp(self_t, L, upper)
  368. - name: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
  369. self, input2: cholesky_solve_backward(grad, self, input2, result, upper, grad_input_mask)
  370. result: cholesky_solve_jvp(result, input2_p, input2_t, self_t, upper)
  371. - name: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
  372. self: cholesky_inverse_backward(grad, self, upper, result)
  373. result: cholesky_inverse_jvp(self_p, self_t, result, upper)
  # clamp family. For clamp_min / clamp_max the comparisons are inclusive for self
  # (self >= min, self <= max) and strict for the bound (self < min, self > max), so
  # at equality the whole gradient goes to self and none to the bound tensor.
  374. # For clamp, gradient is not defined at the boundaries. But empirically it's helpful
  375. # to be able to get gradient on min and max, so we return the subgradient 1 for these cases.
  376. - name: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
  377. self: clamp_backward(grad, self, min, max)
  378. min, max: clamp_backward_min_max(grad, self, min, max, grad_input_mask)
  379. result: clamp_jvp(self_p, self_t, min_p, min_t, max_p, max_t)
  380. - name: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
  381. self: clamp_backward(grad, self, min, max)
  382. result: auto_element_wise
  383. - name: clamp_min(Tensor self, Scalar min) -> Tensor
  384. self: where(self >= min, grad, at::scalar_tensor(0., grad.options()))
  385. result: auto_element_wise
  386. - name: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
  387. self: where(self >= min, grad, at::scalar_tensor(0., grad.options()))
  388. min: where(self < min, grad, at::scalar_tensor(0., grad.options()))
  389. result: where(self_p >= min_p, self_t, min_t)
  390. - name: clamp_max(Tensor self, Scalar max) -> Tensor
  391. self: where(self <= max, grad, at::scalar_tensor(0., grad.options()))
  392. result: auto_element_wise
  393. - name: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
  394. self: where(self <= max, grad, at::scalar_tensor(0., grad.options()))
  395. max: where(self > max, grad, at::scalar_tensor(0., grad.options()))
  396. result: where(self_p <= max_p, self_t, max_t)
  # Copy-like ops. clone, _lazy_clone and _coalesce pass grad through unchanged.
  # _to_copy routes grad through _to_copy_backward with self.options(), re-applies the
  # same conversion arguments to the tangent, and makes its output differentiable only
  # when no dtype is requested or the requested dtype is a differentiable type.
  397. - name: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
  398. self: grad
  399. result: auto_linear
  400. - name: _lazy_clone(Tensor self) -> Tensor
  401. self: grad
  402. result: auto_linear
  403. - name: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
  404. self: _to_copy_backward(grad, self.options())
  405. result: _to_copy(self_t, dtype, layout, device, pin_memory, non_blocking, memory_format)
  406. # The condition is: if dtype is not nullopt, then isDifferentiableType(*dtype)
  407. # (If dtype IS nullopt, we rely on the regular check that any input requires grad).
  408. output_differentiability: ["!dtype || isDifferentiableType(*dtype)"]
  409. - name: _coalesce(Tensor self) -> Tensor
  410. self: grad
  # Complex construction and conjugate/negation ops.
  # complex splits grad into its real and imaginary parts for the two inputs.
  # polar computes both gradients in polar_backward; its result: formula expands the
  # product rule for abs * (cos(angle) + i sin(angle)).
  # _conj, _neg_view, _conj_physical and conj_physical_ apply the same operation
  # (conj / neg / conj_physical) to grad and to the tangent.
  411. - name: complex(Tensor real, Tensor imag) -> Tensor
  412. real: at::real(grad)
  413. imag: at::imag(grad)
  414. result: at::complex(real_t, imag_t)
  415. - name: polar(Tensor abs, Tensor angle) -> Tensor
  416. abs, angle: polar_backward(grad, result)
  417. result: at::complex(abs_t*angle_p.cos() - angle_t*abs_p*angle_p.sin(), abs_t*angle_p.sin() + angle_t*abs_p*angle_p.cos())
  418. - name: _conj(Tensor(a) self) -> Tensor(a)
  419. self: grad.conj()
  420. result: self_t.conj()
  421. - name: _neg_view(Tensor(a) self) -> Tensor(a)
  422. self: grad.neg()
  423. result: self_t._neg_view()
  424. - name: _conj_physical(Tensor self) -> Tensor
  425. self: grad.conj_physical()
  426. result: self_t.conj_physical()
  427. - name: conj_physical_(Tensor(a!) self) -> Tensor(a!)
  428. self: grad.conj_physical()
  429. result: self_t.conj_physical_()
  # copysign: only self carries a gradient (via copysign_tensor_self_backward); the
  # sign-providing tensor other gets zeros. The Tensor overload reuses the same helper
  # on the tangent for its result: formula.
  # cos scales grad by -conj(sin(x)); cosh scales grad by conj(sinh(x)).
  430. - name: copysign.Tensor(Tensor self, Tensor other) -> Tensor
  431. self: copysign_tensor_self_backward(grad, self, result)
  432. other: zeros_like(other)
  433. result: copysign_tensor_self_backward(self_t, self_p, result)
  434. - name: copysign.Scalar(Tensor self, Scalar other) -> Tensor
  435. self: copysign_tensor_self_backward(grad, self, result)
  436. result: auto_element_wise
  437. - name: cos(Tensor self) -> Tensor
  438. self: grad * -self.sin().conj()
  439. result: auto_element_wise
  440. - name: cosh(Tensor self) -> Tensor
  441. self: grad * self.sinh().conj()
  442. result: auto_element_wise
  # count_nonzero: both overloads declare their single output non-differentiable.
  443. - name: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
  444. output_differentiability: [False]
  445. - name: count_nonzero(Tensor self, int? dim=None) -> Tensor
  446. output_differentiability: [False]
  # linalg_cross followed by the cumulative ops.
  # linalg_cross: each input's gradient is a cross product of grad with the conjugate
  # of the other input (argument order differs between self and other).
  # cumprod / cumsum cast grad to self's scalar type before the backward helper, since
  # the op accepts an optional dtype; the cumprod tangent is cast to dtype when given,
  # otherwise to self_p's scalar type.
  # cummax / cummin share cummaxmin_backward and gather the tangent at the saved indices.
  447. - name: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor
  448. self: at::linalg_cross(other.conj(), grad, dim)
  449. other: at::linalg_cross(grad, self.conj(), dim)
  450. result: "at::linalg_cross(self_t, other_p, dim) + at::linalg_cross(self_p, other_t, dim)"
  451. - name: logcumsumexp(Tensor self, int dim) -> Tensor
  452. self: logcumsumexp_backward(grad, self, result, dim)
  453. result: logcumsumexp_jvp(self_p, self_t, dim)
  454. - name: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
  455. self: cumprod_backward(grad.to(self.scalar_type()), self, dim, result)
  456. result: "cumprod_jvp(self_t, self_p, result, dim).to(dtype.has_value() ? *dtype : self_p.scalar_type())"
  457. - name: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
  458. self: cumsum_backward(grad.to(self.scalar_type()), dim)
  459. result: auto_linear
  460. - name: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)
  461. self: cummaxmin_backward(grad, self, indices, dim)
  462. values: self_t.gather(dim, indices)
  463. - name: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
  464. self: cummaxmin_backward(grad, self, indices, dim)
  465. values: self_t.gather(dim, indices)
  # conv_tbc: all three gradients come from one conv_tbc_backward call; when grad is
  # undefined an empty (default-constructed) tuple of Tensors is returned instead.
  # _ctc_loss: only log_probs has a formula; both overloads call _ctc_loss_backward
  # with the two forward outputs (result0, result1).
  466. - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
  467. self, weight, bias: "grad.defined() ? conv_tbc_backward(grad, self, weight, bias, pad) : std::tuple<Tensor, Tensor, Tensor>()"
  468. - name: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
  469. log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank, zero_infinity)
  470. - name: _ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
  471. log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank, zero_infinity)
  # deg2rad uses a dedicated deg2rad_backward helper.
  # _linalg_det / _linalg_slogdet return LU and pivots as extra outputs; these are
  # declared non-differentiable (trailing False entries) and are reused by the backward
  # and jvp helpers. The last jvp argument is true only for contiguous, non-complex A_p.
  472. - name: deg2rad(Tensor self) -> Tensor
  473. self: deg2rad_backward(grad)
  474. result: auto_element_wise
  475. - name: _linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots)
  476. A: linalg_det_backward(grad, result, A, LU, pivots)
  477. result: linalg_det_jvp(A_t, result, LU, pivots, A_p.is_contiguous() && !A_p.is_complex())
  478. output_differentiability: [True, False, False]
  479. - name: _linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots)
  480. A: slogdet_backward(grad_sign, grad_logabsdet, A, sign, LU, pivots)
  481. sign, logabsdet: slogdet_jvp(LU, pivots, A_t, sign, A_p.is_contiguous() && !A_p.is_complex())
  482. output_differentiability: [True, True, False, False]
  # Diagonal-structured ops.
  # block_diag passes the per-input sizes and scalar types to block_diag_backward.
  # diag_embed and diagonal are mutual inverses for gradients: diag_embed's backward
  # takes grad.diagonal(...), diagonal's backward calls diagonal_backward_symint, and
  # diagonal_backward is itself differentiated by taking grad.diagonal(...).
  483. - name: block_diag(Tensor[] tensors) -> Tensor
  484. tensors: block_diag_backward(grad, to_args_sizes(tensors), to_args_scalartypes(tensors))
  485. result: block_diag_jvp(tensors)
  486. - name: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
  487. self: grad.diagonal(offset, dim1, dim2)
  488. result: auto_linear
  489. - name: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
  490. self: diagonal_backward_symint(grad, self.sym_sizes(), offset, dim1, dim2)
  491. result: auto_linear
  492. - name: diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor
  493. grad_output: grad.diagonal(offset, dim1, dim2)
  494. result: auto_linear
  # dist is differentiated as a p-norm of (self - other): other receives the negated
  # norm_backward of the same difference, and the tangent uses norm_jvp on the
  # primal and tangent differences.
  495. - name: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
  496. self: norm_backward(grad, self - other, p, result)
  497. other: -norm_backward(grad, self - other, p, result)
  498. result: norm_jvp(self_p - other_p, self_t - other_t, p, result, {}, false)
  # div family. The backward helpers div_tensor_self_backward / div_tensor_other_backward
  # are shared by all overloads; the *_mode overloads forward rounding_mode to them.
  # When rounding_mode is set, the result: formulas of the *_mode overloads return a
  # zero tensor shaped like result; otherwise they use the plain quotient-rule expression.
  499. # The backward formula is done in this order to improve numerical stability
  500. # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414
  501. # Note that we don't use "result" because saving it would be BC-breaking when it is used in an inplace operation later
  502. - name: div.Tensor(Tensor self, Tensor other) -> Tensor
  503. self: div_tensor_self_backward(grad, other, self.scalar_type())
  504. other: div_tensor_other_backward(grad, self, other)
  505. result: (self_t - other_t * result) / other_p
  506. - name: div.Scalar(Tensor self, Scalar other) -> Tensor
  507. self: div_tensor_self_backward(grad, other, self.scalar_type())
  508. result: self_t / other
  509. - name: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
  510. self: div_tensor_self_backward(grad, other, self.scalar_type(), rounding_mode)
  511. other: div_tensor_other_backward(grad, self, other, rounding_mode)
  512. result: "rounding_mode.has_value() ? result.new_zeros_symint(result.sym_sizes()) : self_t / other_p - other_t * (self_p / other_p) / other_p"
  513. - name: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
  514. self: div_tensor_self_backward(grad, other, self.scalar_type(), rounding_mode)
  515. result: "rounding_mode.has_value() ? result.new_zeros_symint(result.sym_sizes()) : self_t / other"
  # dot: each operand's gradient is grad times the conjugate of the other operand.
  # vdot: the formulas differ from dot in where the conjugate is applied (grad.conj()
  # for self, no conjugate for other).
  # NOTE(review): presumably because vdot conjugates its first argument — confirm.
  516. - name: dot(Tensor self, Tensor tensor) -> Tensor
  517. self: grad * tensor.conj()
  518. tensor: grad * self.conj()
  519. result: at::dot(self_t, tensor_p) + at::dot(self_p, tensor_t)
  520. - name: vdot(Tensor self, Tensor other) -> Tensor
  521. self: grad.conj() * other
  522. other: grad * self
  523. result: at::vdot(self_t, other_p) + at::vdot(self_p, other_t)
  # _fused_dropout returns two tensors; the backward passes the second output (result1)
  # and p to _fused_dropout_backward.
  # NOTE(review): result1 is presumably the dropout mask — confirm.
  524. - name: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
  525. self: _fused_dropout_backward(grad, result1, p)
  # native_dropout: backward multiplies grad by the second output (result1) and a scale.
  # The scale is 1 only when train is explicitly false; otherwise it is 1 / (1 - p),
  # or 0 when p == 1 to avoid dividing by zero.
  # An unset train is treated as training, matching the predicate used by the result0
  # formula below. Previously the backward used scale 1 for an unset train while
  # result0 applied the 1 / (1 - p) scale, so the two formulas disagreed.
  # NOTE(review): confirm against the native_dropout kernels that train == nullopt means training.
  # The infinitely-differentiable backward is used only while GradMode is enabled.
  526. - name: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)
  527. input: "GradMode::is_enabled() ? infinitely_differentiable_native_dropout_backward(grad, result1, ((train.has_value() && !train.value()) ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p)))) : native_dropout_backward(grad, result1, ((train.has_value() && !train.value()) ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p))))"
  528. result0: "(!train.has_value() || train.value()) ? (p == 1 ? 0.0 : 1.0 / (1.0 - p)) * input_t * result1 : input_t"
  # native_dropout_backward is itself differentiable with respect to grad_output
  # (via native_dropout_double_backward); a gradient for mask is not_implemented.
  529. - name: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
  530. grad_output: "native_dropout_double_backward(grad, grad_output, mask, scale)"
  531. mask: 'not_implemented("native_dropout_backward: mask")'
  # In-place equality comparison: all input gradients are zero tensors and the tangent
  # is zeroed in place.
  532. - name: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  533. self: zeros_like(self)
  534. result: self_t.zero_()
  535. - name: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  536. self: zeros_like(self)
  537. other: zeros_like(other)
  538. result: self_t.zero_()
  # Error-function family.
  # erf / erfc: +/- 2 / sqrt(pi) * exp(-x^2) * grad.
  # special_erfcx reuses the forward result: (2 * x * result - 2 / sqrt(pi)) * grad.
  # erfinv: 0.5 * sqrt(pi) * exp(erfinv(x)^2) * grad; erfinv(x) is recomputed from self
  # rather than read from result.
  539. - name: erf(Tensor self) -> Tensor
  540. self: 2.0 / sqrt(M_PI) * exp(-(self.pow(2))) * grad
  541. result: auto_element_wise
  542. - name: erfc(Tensor self) -> Tensor
  543. self: -2.0 / sqrt(M_PI) * exp(-(self.pow(2))) * grad
  544. result: auto_element_wise
  545. - name: special_erfcx(Tensor self) -> Tensor
  546. self: (2.0 * self * result - 2.0 / sqrt(M_PI)) * grad
  547. result: auto_element_wise
  548. - name: erfinv(Tensor self) -> Tensor
  549. self: 0.5 * sqrt(M_PI) * exp(self.erfinv().pow(2)) * grad
  550. result: auto_element_wise
  # Exponentials. All three backward formulas reuse the forward result (conjugated):
  # exp -> result, exp2 -> result * ln(2) (M_LN2), expm1 -> result + 1.
  551. - name: exp(Tensor self) -> Tensor
  552. self: grad * result.conj()
  553. result: auto_element_wise
  554. - name: exp2(Tensor self) -> Tensor
  555. self: grad * result.conj() * M_LN2
  556. result: auto_element_wise
  557. - name: expm1(Tensor self) -> Tensor
  558. self: grad * (result.conj() + 1)
  559. result: auto_element_wise
  # expand: the backward reduces grad back to self's symbolic sizes with at::sum_to.
  # exponential_ (in-place random fill): zero gradient, tangent zeroed in place.
  560. # TODO: this derivative is not SymInt safe, need sum_to support
  561. - name: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
  562. self: at::sum_to(grad, self.sym_sizes())
  563. result: auto_linear
  564. - name: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
  565. self: zeros_like(grad)
  566. result: self_t.zero_()
  # Fake-quantization ops.
  # The *_cachemask variants and _fused_moving_avg_obs_fq_helper return (output, mask)
  # and feed the saved mask to a *_cachemask_backward helper; only self gets a gradient.
  # The _learnable_ variants also differentiate scale and zero_point, and return an
  # empty tuple of Tensors when grad is undefined.
  567. - name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
  568. self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask)
  569. - name: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
  570. self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask)
  571. - name: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
  572. self, scale, zero_point: "grad.defined() ? _fake_quantize_learnable_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max, grad_factor) : std::tuple<Tensor, Tensor, Tensor>()"
  573. - name: fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
  574. self: fake_quantize_per_channel_affine_cachemask_backward(grad, mask)
  575. - name: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
  576. self, scale, zero_point: "grad.defined() ? _fake_quantize_learnable_per_channel_affine_backward(grad, self, scale, zero_point, axis, quant_min, quant_max, grad_factor) : std::tuple<Tensor, Tensor, Tensor>()"
  577. - name: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask)
  578. self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask)
  # fill family: self always receives a zero gradient because its contents are
  # overwritten. For the Tensor overloads the fill value receives grad.sum() and the
  # tangent is filled with value_t; for the Scalar overloads the tangent is filled with 0.
  579. - name: fill.Scalar(Tensor self, Scalar value) -> Tensor
  580. self: zeros_like(grad)
  581. result: at::fill(self_t, 0)
  582. - name: fill.Tensor(Tensor self, Tensor value) -> Tensor
  583. self: zeros_like(grad)
  584. value: grad.sum()
  585. result: at::fill(self_t, value_t)
  586. - name: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
  587. self: zeros_like(grad)
  588. result: self_t.fill_(0)
  589. - name: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
  590. self: zeros_like(grad)
  591. value: grad.sum()
  592. result: self_t.fill_(value_t)
  # Rounding / remainder ops.
  # floor: zero gradient. fmod and frac: grad passes straight through to self.
  # fmod.Tensor: other receives -grad * trunc(self / other), using div with
  # rounding_mode "trunc".
  # frexp: only the mantissa output has formulas; grad and tangent are divided by
  # 2^exponent (exponent.exp2()).
  593. - name: floor(Tensor self) -> Tensor
  594. self: zeros_like(grad)
  595. result: auto_element_wise
  596. - name: fmod.Scalar(Tensor self, Scalar other) -> Tensor
  597. self: grad
  598. result: auto_element_wise
  599. - name: fmod.Tensor(Tensor self, Tensor other) -> Tensor
  600. self: grad
  601. other: -grad * self.div(other, /*rounding_mode=*/"trunc")
  602. result: self_t - other_t * self_p.div(other_p, /*rounding_mode=*/"trunc")
  603. - name: frac(Tensor self) -> Tensor
  604. self: grad
  605. result: self_t
  606. - name: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
  607. self: grad / exponent.exp2()
  608. mantissa: self_t / exponent.exp2()
  # gather: the gradient for self comes from gather_backward, which also receives the
  # sparse_grad flag; index is non-differentiable.
  609. - name: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
  610. self: gather_backward(grad, self, dim, index, sparse_grad)
  611. index: non_differentiable
  612. result: auto_linear
  # ge_ (in-place comparison) and geometric_ (in-place random fill): zero gradients,
  # tangent zeroed in place. geqrf has no derivative: self is not_implemented.
  613. - name: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  614. self: zeros_like(self)
  615. result: self_t.zero_()
  616. - name: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  617. self: zeros_like(self)
  618. other: zeros_like(other)
  619. result: self_t.zero_()
  620. - name: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
  621. self: zeros_like(grad)
  622. result: self_t.zero_()
  623. - name: geqrf(Tensor self) -> (Tensor a, Tensor tau)
  624. self: not_implemented("geqrf")
  # Index-accessor views: each declares its single output non-differentiable.
  625. - name: indices(Tensor(a) self) -> Tensor(a)
  626. output_differentiability: [False]
  627. - name: _indices(Tensor(a) self) -> Tensor(a)
  628. output_differentiability: [False]
  629. - name: crow_indices(Tensor(a) self) -> Tensor(a)
  630. output_differentiability: [False]
  631. - name: col_indices(Tensor(a) self) -> Tensor(a)
  632. output_differentiability: [False]
  633. - name: ccol_indices(Tensor(a) self) -> Tensor(a)
  634. output_differentiability: [False]
  635. - name: row_indices(Tensor(a) self) -> Tensor(a)
  636. output_differentiability: [False]
  # grid_sampler family: input and grid gradients come from one *_backward call, and
  # an empty tuple of Tensors is returned when grad is undefined. The 2d/3d variants
  # pass grad_input_mask to the helper; the CPU fallback variant does not.
  637. - name: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  638. input, grid: "grad.defined() ? grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple<Tensor, Tensor>()"
  639. - name: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  640. input, grid: "grad.defined() ? grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple<Tensor, Tensor>()"
  641. # See NOTE [ grid_sample CPU fallback ]
  642. - name: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
  643. input, grid: "grad.defined() ? _grid_sampler_2d_cpu_fallback_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners) : std::tuple<Tensor, Tensor>()"
  # In-place greater-than comparison: all input gradients are zero tensors and the
  # tangent is zeroed in place.
  644. - name: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  645. self: zeros_like(self)
  646. result: self_t.zero_()
  647. - name: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  648. self: zeros_like(self)
  649. other: zeros_like(other)
  650. result: self_t.zero_()
  # hardsigmoid / hardswish use dedicated *_backward helpers; histc is declared
  # non-differentiable.
  # hardswish_backward is differentiable too: it is linear in grad_output (so that
  # gradient reuses hardswish_backward), and its gradient for self is
  # grad * grad_output / 3 on the open interval (-3, 3) and zero elsewhere.
  # Its result: formula is a quoted string that spans two lines.
  651. - name: hardsigmoid(Tensor self) -> Tensor
  652. self: hardsigmoid_backward(grad, self)
  653. result: auto_element_wise
  654. - name: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
  655. output_differentiability: [False]
  656. - name: hardswish(Tensor self) -> Tensor
  657. self: hardswish_backward(grad, self)
  658. result: auto_element_wise
  659. - name: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
  660. grad_output: hardswish_backward(grad, self)
  661. self: at::where(at::logical_and(-3.0 < self, self < 3.0), grad * grad_output / 3.0, at::zeros({}, self.options()))
  662. result: "hardswish_backward(grad_output_t, self_p)
  663. + at::where(at::logical_and(-3.0 < self_p, self_p < 3.0), self_t * grad_output_p / 3.0, at::zeros({}, self_p.options()))"
  # hypot: each input's gradient is grad * input / result (this divides by result, so
  # the value where result == 0 is whatever that division yields).
  # Modified Bessel functions: i0 uses special_i1; special_i0e uses special_i1e minus
  # sgn(x) * result; special_i1 / special_i1e use dedicated helpers.
  # igamma / igammac: only other is differentiable, with opposite signs on the same
  # expression exp((self - 1) * log(other) - other - lgamma(self)); the gradient for
  # self is not_implemented.
  664. - name: hypot(Tensor self, Tensor other) -> Tensor
  665. self: grad * self / result
  666. other: grad * other / result
  667. result: self_t * self_p / result + other_t * other_p / result
  668. - name: i0(Tensor self) -> Tensor
  669. self: grad * at::special_i1(self)
  670. result: auto_element_wise
  671. - name: special_i0e(Tensor self) -> Tensor
  672. self: grad * (at::special_i1e(self) - self.sgn() * result)
  673. result: auto_element_wise
  674. - name: special_i1(Tensor self) -> Tensor
  675. self: i1_backward(grad, self, result)
  676. result: auto_element_wise
  677. - name: special_i1e(Tensor self) -> Tensor
  678. self: i1e_backward(grad, self, result)
  679. result: auto_element_wise
  680. - name: igamma(Tensor self, Tensor other) -> Tensor
  681. self: 'not_implemented("igamma: input")'
  682. other: grad * exp((self - 1) * log(other) - other - lgamma(self))
  683. - name: igammac(Tensor self, Tensor other) -> Tensor
  684. self: 'not_implemented("igammac: input")'
  685. other: -grad * exp((self - 1) * log(other) - other - lgamma(self))
  # Advanced indexing. Each backward starts from a zero tensor with self's symbolic
  # sizes and options, then writes grad into it at indices: index_backward for index,
  # _unsafe_index_put with accumulate=true for _unsafe_index, and
  # _unsafe_masked_index_put_accumulate for _unsafe_masked_index.
  # The masked index and masked put-accumulate ops are each other's backward; mask is
  # non-differentiable and the fill value used in derivative formulas is 0.
  686. - name: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
  687. self: index_backward(grad.new_zeros_symint(self.sym_sizes(), self.options()), indices, grad)
  688. result: auto_linear
  689. - name: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
  690. self: at::_unsafe_index_put(grad.new_zeros_symint(self.sym_sizes(), self.options()), indices, grad, true)
  691. result: auto_linear
  692. - name: _unsafe_masked_index(Tensor self, Tensor mask, Tensor?[] indices, Scalar fill) -> Tensor
  693. self: at::_unsafe_masked_index_put_accumulate(grad.new_zeros_symint(self.sym_sizes(), self.options()), mask, indices, grad)
  694. mask: non_differentiable
  695. result: _unsafe_masked_index(self_t, mask, indices, 0)
  696. - name: _unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor
  697. self: grad
  698. mask: non_differentiable
  699. values: at::_unsafe_masked_index(grad, mask, indices, 0)
  700. result: at::_unsafe_masked_index_put_accumulate(self_t, mask, indices, values_t)
  # index_add / index_reduce / index_copy; index is non-differentiable in all three.
  # index_add: self gets grad unchanged; source gets grad selected at index, scaled by alpha.
  # index_copy: self gets grad with the overwritten positions zeroed (index_fill with 0);
  # source gets grad selected at index.
  # index_reduce delegates both gradients to index_reduce_backward.
  701. - name: index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
  702. self: grad
  703. # The case source.dim() == 0 is necessary to support scalar tensors of the form
  704. # source.dim() == 0 and index.dim() == 1 and index.size() == (1,),
  705. # This is because source is not broadcastable to index, as source.dim() < index.dim()
  706. source: "maybe_multiply(source.dim() > 0 ? grad.index_select(dim, index).expand_as(source) : grad.index_select(dim, index.squeeze(0)), alpha)"
  707. index: non_differentiable
  708. result: at::index_add(self_t, dim, index, maybe_multiply(source_t, alpha))
  709. - name: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
  710. self, source: index_reduce_backward(grad, self, dim, index, source, reduce, include_self, result)
  711. index: non_differentiable
  712. - name: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
  713. self: grad.index_fill(dim, index, 0)
  714. # The case source.dim() == 0 is necessary to support scalar tensors of the form
  715. # source.dim() == 0 and index.dim() == 1 and index.size() == (1,),
  716. # This is because source is not broadcastable to index, as source.dim() < index.dim()
  717. source: "source.dim() > 0 ? grad.index_select(dim, index).expand_as(source) : grad.index_select(dim, index.squeeze(0))"
  718. index: non_differentiable
  719. result: self_t.index_copy(dim, index, source_t)
  # index_fill: self receives grad with the filled positions zeroed; index is
  # non-differentiable. For the Tensor overload, value receives the sum of grad over
  # the de-duplicated indices (at::_unique with sorted=false), so a position listed
  # more than once in index is counted once.
  720. - name: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
  721. self: grad.index_fill(dim, index, 0)
  722. index: non_differentiable
  723. result: self_t.index_fill(dim, index, 0)
  724. - name: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
  725. self: grad.index_fill(dim, index, 0)
  726. value: grad.index_select(dim, std::get<0>(at::_unique(index, /*sorted=*/false))).sum()
  727. index: non_differentiable
  728. result: self_t.index_fill(dim, index, value_t)
  # index_put family. With accumulate set, self receives grad unchanged; otherwise the
  # overwritten positions are zeroed by index-putting zeros_like(values) into grad.
  # values receives grad read back at indices. The tangent repeats the forward op on
  # self_t / values_t with the same flags.
  729. - name: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
  730. self: "accumulate ? grad : grad.index_put(indices, zeros_like(values), false)"
  731. values: grad.index(indices)
  732. result: self_t.index_put(indices, values_t, accumulate)
  733. - name: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
  734. self: "accumulate ? grad : at::_unsafe_index_put(grad, indices, zeros_like(values), false)"
  735. values: at::_unsafe_index(grad, indices)
  736. result: at::_unsafe_index_put(self_t, indices, values_t, accumulate)
  737. - name: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
  738. self: "accumulate ? grad : grad.index_put(indices, zeros_like(values), false)"
  739. values: grad.index(indices)
  740. result: at::_index_put_impl_(self_t, indices, values_t, accumulate, unsafe)
  # index_select: the backward helper receives self's symbolic sizes along with dim and
  # index; index is non-differentiable.
  741. - name: index_select(Tensor self, int dim, Tensor index) -> Tensor
  742. self: index_select_backward_symint(grad, self.sym_sizes(), dim, index)
  743. index: non_differentiable
  744. result: auto_linear
  # linalg_inv_ex: with X = inverse, the backward is -X^H grad X^H and the tangent is
  # -X A_t X; the info output is declared non-differentiable.
  # linalg_pinv uses dedicated pinv_backward / pinv_jvp helpers.
  745. - name: linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
  746. A: -at::matmul(inverse.mH(), at::matmul(grad, inverse.mH()))
  747. inverse: -at::matmul(at::matmul(inverse, A_t), inverse)
  748. output_differentiability: [True, False]
  749. - name: linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor
  750. self: pinv_backward(grad, result, self)
  751. result: pinv_jvp(self_p, result, self_t)
  # isnan: input is non-differentiable.
  # kthvalue: grad is routed back to the selected positions using the saved indices
  # and self's symbolic sizes; the values tangent is gathered at the same indices.
  752. - name: isnan(Tensor self) -> Tensor
  753. self: non_differentiable
  754. - name: kthvalue(Tensor self, SymInt k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
  755. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  756. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  # ldexp: the formulas treat the result as self * 2^other. The gradient for self is
  # grad * conj(2^other); the gradient for other is grad * conj(result) * ln(2) (M_LN2).
  757. - name: ldexp.Tensor(Tensor self, Tensor other) -> Tensor
  758. self: grad * at::pow(2, other).conj()
  759. other: grad * result.conj() * M_LN2
  760. result: self_t * at::pow(2, other_p) + other_t * result * M_LN2
  # In-place less-or-equal comparison: all input gradients are zero tensors and the
  # tangent is zeroed in place.
  761. - name: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  762. self: zeros_like(self)
  763. result: self_t.zero_()
  764. - name: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  765. self: zeros_like(self)
  766. other: zeros_like(other)
  767. result: self_t.zero_()
  # lerp: self receives grad * conj(1 - weight) and end receives grad * conj(weight).
  # The Scalar overload branches on weight.isComplex() to pick the complex or double
  # conversion of the scalar. The Tensor overload also differentiates weight, with
  # gradient grad * conj(end - self).
  768. - name: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
  769. self: "weight.isComplex() ? grad * (1 - weight.conj().toComplexDouble()) : grad * (1 - weight.toDouble())"
  770. end: grad * weight.conj()
  771. result: at::lerp(self_t, end_t, weight)
  772. - name: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
  773. self: grad * (1 - weight).conj()
  774. end: grad * weight.conj()
  775. weight: grad * (end - self).conj()
  776. result: at::lerp(self_t, end_t, weight_p) + weight_t * (end_p - self_p)
  # Gamma-function chain: lgamma -> digamma -> polygamma(1, .) -> polygamma(n + 1, .).
  # The in-place polygamma_ multiplies the tangent in place by polygamma(n + 1, .)
  # evaluated at original_self_p.
  # NOTE(review): original_self_p is presumably the pre-mutation primal — confirm.
  777. - name: lgamma(Tensor self) -> Tensor
  778. self: grad * digamma(self)
  779. result: auto_element_wise
  780. - name: digamma(Tensor self) -> Tensor
  781. self: grad * polygamma(1, self)
  782. result: auto_element_wise
  783. - name: polygamma(int n, Tensor self) -> Tensor
  784. self: grad * polygamma(n + 1, self)
  785. result: auto_element_wise
  786. - name: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
  787. self: grad * polygamma(n + 1, self)
  788. result: self_t.mul_(polygamma(n + 1, original_self_p))
  # Logarithms: grad is divided by conj(self), times the natural log of the base for
  # log10 / log2. The decimal literals 2.3025850929940456 and 0.6931471805599453 are
  # ln(10) and ln(2) written out. log1p uses a dedicated log1p_backward helper.
  789. - name: log(Tensor self) -> Tensor
  790. self: grad.div(self.conj())
  791. result: auto_element_wise
  792. - name: log10(Tensor self) -> Tensor
  793. self: grad / (self.conj() * 2.3025850929940456)
  794. result: auto_element_wise
  795. - name: log1p(Tensor self) -> Tensor
  796. self: log1p_backward(grad, self)
  797. result: auto_element_wise
  798. - name: log2(Tensor self) -> Tensor
  799. self: grad / (self.conj() * 0.6931471805599453)
  800. result: auto_element_wise
  # logaddexp / logaddexp2: each input's gradient is grad / (1 + b^(other_input - this_input))
  # with b = e (exp) or b = 2 (pow(2, .)). Only the logaddexp formulas conjugate the
  # denominator; the logaddexp2 formulas do not.
  801. - name: logaddexp(Tensor self, Tensor other) -> Tensor
  802. self: grad / (1 + exp(other - self)).conj()
  803. other: grad / (1 + exp(self - other)).conj()
  804. result: self_t / (1 + exp(other_p - self_p)) + other_t / (1 + exp(self_p - other_p))
  805. - name: logaddexp2(Tensor self, Tensor other) -> Tensor
  806. self: grad / (1 + pow(2, other - self))
  807. other: grad / (1 + pow(2, self - other))
  808. result: self_t / (1 + pow(2, other_p - self_p)) + other_t / (1 + pow(2, self_p - other_p))
  809. # Note [Gradient formula for xlogy at x = 0, y <= 0]
  810. # x * log(y) is not defined for y <= 0, so we cannot even talk about differentiability there.
  811. # However, xlogy(0, y) = 0 by definition.
  812. # This does not make it differentiable, as it is not defined in a neighbourhood of a point
  813. # (0, y) when y <= 0.
  814. # When a function is non-differentiable, we sometimes return "a relatively sensible value".
  815. # In this case, as per the discussion in https://github.com/pytorch/pytorch/issues/80770, we choose
  816. # this value to be zero, which is the directional derivative along the line {x = 0}.
  817. - name: xlogy.Tensor(Tensor self, Tensor other) -> Tensor
  818. self: at::xlogy(grad, other).masked_fill((self == 0.) & (other <= 0.), 0.)
  819. other: grad * self / other
  820. result: at::xlogy(self_t, other_p).masked_fill((self_p == 0.) & (other_p <= 0.), 0.) + other_t * self_p / other_p
  821. - name: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor
  822. other: grad * self / other
  823. result: auto_element_wise
  # Scalar-`other` overload. When other > 0 the gradient w.r.t. self is just
  # xlogy(grad, other). Otherwise the entries where self == 0 are forced to
  # zero, following Note [Gradient formula for xlogy at x = 0, y <= 0].
  824. - name: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
  825. self: "other.toDouble() > 0.
  826. ? at::xlogy(grad, other)
  827. : at::xlogy(grad, other).masked_fill(self == 0., 0.)"
  828. result: auto_element_wise
  829. # See Note [Gradient formula for xlogy at x = 0, y <= 0]
  830. # Same here but with y <= -1
  831. - name: special_xlog1py(Tensor self, Tensor other) -> Tensor
  832. self: at::special_xlog1py(grad, other).masked_fill((self == 0.) & (other <= -1.), 0.)
  833. other: grad * self / (other + 1)
  834. result: at::special_xlog1py(self_t, other_p).masked_fill((self_p == 0.) & (other_p <= -1.), 0.) + other_t * self_p / (other_p + 1)
  835. - name: special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor
  836. other: grad * self / (other + 1)
  837. result: auto_element_wise
  838. - name: special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor
  839. self: "other.toDouble() > -1.
  840. ? at::special_xlog1py(grad, other)
  841. : at::special_xlog1py(grad, other).masked_fill(self == 0., 0.)"
  842. result: auto_element_wise
  # Only the derivative w.r.t. `other` has a formula:
  #   grad * -self * special_zeta(self + 1, other).
  # The derivative w.r.t. `self` is marked not_implemented, and no
  # forward-mode (`result`) formula is declared for this entry.
  843. - name: special_zeta(Tensor self, Tensor other) -> Tensor
  844. self: not_implemented("zeta")
  845. other: grad * -self * special_zeta(self + 1., other)
  846. - name: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor
  847. other: grad * -self * special_zeta(self.toDouble() + 1., other)
  848. - name: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor
  849. self: not_implemented("zeta")
  850. - name: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
  851. self: zeros_like(grad)
  852. result: self_t.zero_()
  853. - name: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
  854. self: logsumexp_backward(grad, self, result, dim, keepdim)
  855. result: logsumexp_jvp(self_p, self_t, dim, keepdim)
  # Of the four outputs only `solution` and `residuals` are differentiable
  # (see output_differentiability); `rank` and `singular_values` are not.
  # A single backward helper produces the gradients of both `self` and `b`
  # from grads[0] and grads[1]. Each differentiable output has its own
  # forward-mode helper.
  856. - name: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
  857. self, b: linalg_lstsq_backward(grads[0], grads[1], self, b, solution, grad_input_mask)
  858. solution: linalg_lstsq_solution_jvp(self_p, b_p, self_t, b_t)
  859. residuals: linalg_lstsq_residuals_jvp(self_p, b_p, self_t, b_t, solution, residuals)
  860. output_differentiability: [True, True, False, False]
  861. - name: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  862. self: zeros_like(self)
  863. result: self_t.zero_()
  864. - name: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  865. self: zeros_like(self)
  866. other: zeros_like(other)
  867. result: self_t.zero_()
  868. - name: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)
  869. A: lu_factor_ex_backward(grad, LU, pivots, pivot)
  870. LU: lu_factor_ex_jvp(A_t, LU, pivots, pivot)
  871. output_differentiability: [True, False, False]
  # `P` is not differentiable; `L` and `U` are (see output_differentiability).
  # linalg_lu_jvp returns a tuple: element 0 is used as the tangent of L and
  # element 1 as the tangent of U.
  # NOTE(review): the helper call is spelled out once per output - confirm
  # whether the generated code evaluates it twice.
  872. - name: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
  873. A: linalg_lu_backward(grad_L, grad_U, P, L, U, pivot)
  874. L: std::get<0>(linalg_lu_jvp(A_t, P, L, U, pivot))
  875. U: std::get<1>(linalg_lu_jvp(A_t, P, L, U, pivot))
  876. output_differentiability: [False, True, True]
  877. - name: linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor
  878. LU: linalg_lu_solve_LU(grad, LU, pivots, result, left, adjoint)
  879. B: "at::linalg_lu_solve(LU, pivots, grad, left, !adjoint)"
  880. result: linalg_lu_solve_jvp(result, LU_p, pivots, LU_t, B_t, left, adjoint)
  # `P` and `LU_pivots` are not differentiable. The forward formulas read the
  # tangents of L and U straight out of the tangent of LU_data:
  #   L: strictly lower triangle (tril with diagonal -1). When LU_data has
  #      fewer rows than columns, it is first narrowed along the last dim to
  #      the first sym_size(-2) columns.
  #   U: upper triangle. When LU_data has fewer columns than rows, it is first
  #      narrowed along dim -2 to the first sym_size(-1) rows.
  881. - name: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
  882. LU_data: lu_unpack_backward(grad_L, grad_U, LU_data.sym_size(-2), LU_data.sym_size(-1))
  883. LU_pivots: non_differentiable
  884. L: "LU_data_t.sym_size(-2) >= LU_data_t.sym_size(-1) ? LU_data_t.tril_symint(-1) : LU_data_t.narrow_symint(-1, 0, LU_data_t.sym_size(-2)).tril_symint(-1)"
  885. U: "LU_data_t.sym_size(-1) >= LU_data_t.sym_size(-2) ? LU_data_t.triu_symint() : LU_data_t.narrow_symint(-2, 0, LU_data_t.sym_size(-1)).triu_symint()"
  886. output_differentiability: [False, True, True]
  # Positions selected by `mask` are overwritten with a constant, so they
  # carry no gradient or tangent from `self`: both formulas zero them out.
  887. - name: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
  888. self: grad.masked_fill(mask, 0)
  889. mask: non_differentiable
  890. result: self_t.masked_fill(mask, 0)
  # Tensor-`value` overload. Masked positions pass no gradient to `self`; the
  # gradient of `value` is computed by masked_fill_backward from grad and
  # mask. In forward mode the masked positions of self's tangent are replaced
  # by value's tangent.
  891. - name: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
  892. self: grad.masked_fill(mask, 0)
  893. value: masked_fill_backward(grad, mask)
  894. mask: non_differentiable
  895. result: self_t.masked_fill(mask, value_t)
  # Masked positions of `self` are overwritten by elements of `source`, so
  # self's gradient is zeroed there. The gradient of `source` is produced by
  # masked_scatter_backward_symint with the shape of `source`. Forward mode
  # applies the same masked_scatter to the tangents.
  896. - name: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
  897. self: grad.masked_fill(mask, 0)
  898. source: masked_scatter_backward_symint(grad, mask, source.sym_sizes())
  899. mask: non_differentiable
  900. result: self_t.masked_scatter(mask, source_t)
  # Derivatives of the backward helper of masked_scatter (used for double
  # backward).
  #   grad_output: scatter the incoming grad back into the masked positions of
  #                a zero tensor shaped like grad_output.
  #   result:      the op is linear in grad_output, so the tangent is the same
  #                op applied to grad_output's tangent with the SAME requested
  #                output shape `sizes`. The formula previously passed
  #                grad_output_t.sizes(), i.e. the shape of grad_output rather
  #                than the shape of the output, which yields a wrongly shaped
  #                tangent whenever the two differ. The _symint variant is
  #                used because `sizes` is declared as SymInt[].
  901. - name: masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor
  902. grad_output: zeros_like(grad_output).masked_scatter(mask, grad)
  903. mask: non_differentiable
  904. result: masked_scatter_backward_symint(grad_output_t, mask, sizes)
  905. - name: masked_select(Tensor self, Tensor mask) -> Tensor
  906. self: masked_select_backward(grad, self, mask)
  907. mask: non_differentiable
  908. result: auto_linear
  909. - name: linalg_matrix_exp(Tensor self) -> Tensor
  910. self: linalg_matrix_exp_differential(self, grad, /*adjoint*/ true)
  911. result: linalg_matrix_exp_differential(self_p, self_t, /*adjoint*/ false)
  912. - name: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  913. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  914. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  915. - name: max(Tensor self) -> Tensor
  916. self: evenly_distribute_backward(grad, self, result)
  917. result: evenly_read_jvp(self_t, self_p, result)
  # Where self == other the gradient is split evenly (grad / 2 to each input).
  # Elsewhere the full gradient goes to the larger input: the trailing
  # masked_fill_ zeroes the entries where this input is the smaller one.
  # Forward mode interpolates the tangents, other_t + w * (self_t - other_t),
  # with w = 0.5 on ties, 1 where self_p > other_p and 0 otherwise.
  918. - name: maximum(Tensor self, Tensor other) -> Tensor
  919. self: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0)
  920. other: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0)
  921. result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p > other_p).to(result.scalar_type())) * (self_t - other_t)
  # Backward: `self` receives the whole gradient wherever self >= other or
  # other is NaN; `other` receives it everywhere else.
  # Forward: other_t + m * (self_t - other_t), where m must be the same
  # selection mask as in backward. The mask previously used the strict
  # `self_p > other_p`, so on ties (self == other) the tangent was taken from
  # `other` while the gradient was routed to `self`, making forward and
  # reverse mode disagree. The fmin entry uses the non-strict comparison in
  # both directions; this now does too.
  922. - name: fmax(Tensor self, Tensor other) -> Tensor
  923. self: grad.masked_fill((self >= other).logical_or_(other.isnan()).logical_not_(), 0)
  924. other: grad.masked_fill((self >= other).logical_or_(other.isnan()), 0)
  925. result: other_t + (self_p >= other_p).logical_or_(other_p.isnan()) * (self_t - other_t)
  # Full reduction: every input element contributes 1 / numel to the output,
  # so the gradient is grad broadcast to the input shape and divided by
  # self.sym_numel(). The nested-tensor dispatch key multiplies ones_like(self)
  # by grad as a stand-in for the expand (see the TODO below).
  926. - name: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
  927. dispatch:
  928. Default:
  929. self: grad.expand_symint(self.sym_sizes()) / self.sym_numel()
  930. result: auto_linear
  931. AutogradNestedTensor:
  932. # TODO: replace this with grad.expand_as(self) / self.sym_numel() when that is supported
  933. self: (ones_like(self) * grad) / self.sym_numel()
  934. result: auto_linear
  935. - name: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  936. self: mean_backward(grad, self.sym_sizes(), dim, self.sym_numel(), keepdim)
  937. result: auto_linear
  938. - name: median(Tensor self) -> Tensor
  939. self: evenly_distribute_backward(grad, self, result)
  940. result: evenly_read_jvp(self_t, self_p, result)
  941. - name: nanmedian(Tensor self) -> Tensor
  942. self: evenly_distribute_backward(grad, self, result)
  943. result: evenly_read_jvp(self_t, self_p, result)
  944. # This is in theory incorrect in the following case:
  945. #   sorted list: [..., a, b, b, ..., b, b, c, ...] with median = b and the value
  946. #                               |                  at middle position of the
  947. #                               |                  list between two `b`s. E.g.,
  948. #                               |
  949. #                               ^the middle position
  950. # The gradient exists and is essentially 0 in this case.
  951. #
  952. # In case where the middle position is at the boundary of `b` range, e.g.,
  953. #   sorted list: [..., a, b, b, ..., b, b, c, ...]
  954. #                         |
  955. #                         ^the middle position
  956. # The backward implementation is correct in the sense that it returns the
  957. # subgradient on one side.
  958. - name: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  959. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  960. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  961. - name: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  962. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  963. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  964. - name: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
  965. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  966. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  967. - name: min(Tensor self) -> Tensor
  968. self: evenly_distribute_backward(grad, self, result)
  969. result: evenly_read_jvp(self_t, self_p, result)
  # Where self == other the gradient is split evenly (grad / 2 to each input).
  # Elsewhere the full gradient goes to the smaller input: the trailing
  # masked_fill_ zeroes the entries where this input is the larger one.
  # Forward mode interpolates the tangents, other_t + w * (self_t - other_t),
  # with w = 0.5 on ties, 1 where self_p < other_p and 0 otherwise.
  970. - name: minimum(Tensor self, Tensor other) -> Tensor
  971. self: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0)
  972. other: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0)
  973. result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p < other_p).to(result.scalar_type())) * (self_t - other_t)
  # `self` receives the whole gradient wherever self <= other or other is NaN;
  # `other` receives it everywhere else. Forward mode uses the same mask m:
  # other_t + m * (self_t - other_t).
  974. - name: fmin(Tensor self, Tensor other) -> Tensor
  975. self: grad.masked_fill((self <= other).logical_or_(other.isnan()).logical_not_(), 0)
  976. other: grad.masked_fill((self <= other).logical_or_(other.isnan()), 0)
  977. result: other_t + (self_p <= other_p).logical_or_(other_p.isnan()) * (self_t - other_t)
  978. - name: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
  979. self: scale_grad_by_count(restore_reduced_dims(grad, dim, keepdim), restore_reduced_dims(result, dim, keepdim) == self, dim)
  980. result: amaxamin_jvp(self_p, self_t, result, dim, keepdim)
  981. - name: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
  982. self: scale_grad_by_count(restore_reduced_dims(grad, dim, keepdim), restore_reduced_dims(result, dim, keepdim) == self, dim)
  983. result: amaxamin_jvp(self_p, self_t, result, dim, keepdim)
  984. - name: mm(Tensor self, Tensor mat2) -> Tensor
  985. self: mm_mat1_backward(grad, mat2, self.sym_sizes(), self.sym_strides(), self.layout(), 1)
  986. mat2: mm_mat2_backward(grad, self, mat2.sym_sizes(), mat2.sym_strides(), mat2.layout(), 1)
  987. result: at::mm(self_t, mat2_p) + at::mm(self_p, mat2_t)
  988. - name: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
  989. self: _grouped_mm_mat1_backward(grad, mat2, self.sym_sizes(), self.sym_strides(), self.layout(), offs, 1)
  990. mat2: _grouped_mm_mat2_backward(grad, self, mat2.sym_sizes(), mat2.sym_strides(), mat2.layout(), offs, 1)
  991. - name: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
  992. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), keepdim)
  993. values: gather_with_keepdimed_indices(self_t, dim, indices, keepdim)
  # Backward for each input goes through mul_tensor_backward, which receives
  # grad, the *other* factor and the dtype of the input being differentiated.
  # Forward mode is the product rule on primals (_p) and tangents (_t).
  994. - name: mul.Tensor(Tensor self, Tensor other) -> Tensor
  995. self: mul_tensor_backward(grad, other, self.scalar_type())
  996. other: mul_tensor_backward(grad, self, other.scalar_type())
  997. result: other_t * self_p + self_t * other_p
  998. - name: mul.Scalar(Tensor self, Scalar other) -> Tensor
  999. self: mul_tensor_backward(grad, other, self.scalar_type())
  1000. result: self_t * other
  # Matrix-vector product y = self @ vec.
  #   self: outer product (ger) of grad with conj(vec).
  #   vec:  conjugate transpose of self applied to grad.
  # Forward mode is the product rule.
  1001. - name: mv(Tensor self, Tensor vec) -> Tensor
  1002. self: grad.ger(vec.conj())
  1003. vec: self.conj().t().mv(grad)
  1004. result: mv(self_t, vec_p) + mv(self_p, vec_t)
  1005. - name: mvlgamma(Tensor self, int p) -> Tensor
  1006. self: mvlgamma_backward(grad, self, p)
  1007. result: auto_element_wise
  # The gradient passes through only where the input is finite; entries that
  # are NaN or +/-inf get zero gradient (grad is multiplied by the isfinite
  # mask).
  1008. - name: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
  1009. self: grad * at::isfinite(self)
  1010. result: auto_element_wise
  # One backward call yields the gradients of input, weight and bias. When the
  # incoming grad is undefined the kernel is skipped and a default-constructed
  # tuple is returned instead. result1 / result2 are the second and third
  # outputs of the forward op (presumably the saved mean and inverse std -
  # confirm against the kernel). Only result0 has a forward-mode formula.
  1011. - name: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
  1012. input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1013. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, eps)
  1014. - name: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
  1015. input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1016. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, eps)
  1017. - name: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
  1018. input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, /*training=*/false, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1019. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, /*training=*/false, eps)
  1020. - name: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
  1021. input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, Tensor(), Tensor(), result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1022. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, Tensor(), Tensor(), result1, result2, training, eps)
  1023. - name: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  1024. input, weight, grad_out: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_out, running_mean, running_var, train, eps, save_mean, save_invstd, grad_input_mask)
  1025. save_mean: not_implemented("native_batch_norm_backward save_mean")
  1026. save_invstd: not_implemented("native_batch_norm_backward save_invstd")
  1027. - name: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
  1028. input, weight, bias: "grad.defined() ? native_layer_norm_backward_symint(grad, input, normalized_shape, result1, result2, weight, bias, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1029. result0: layer_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, result1, result2, normalized_shape)
  1030. - name: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  1031. input, weight, grad_out: layer_norm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_out, mean, rstd, normalized_shape, grad_input_mask)
  1032. bias: Tensor()
  1033. mean: not_implemented("native_layer_norm_backward mean")
  1034. rstd: not_implemented("native_layer_norm_backward rstd")
  # Backward selection (`||` binds tighter than `?:`, so the condition is
  # `GradMode::is_enabled() || grads[1].defined()`):
  #   - grad mode is enabled, or a gradient flows into the second output
  #     (result1): use the infinitely differentiable backward;
  #   - otherwise, if grads[0] is defined: use the fused backward kernel;
  #   - otherwise: return a default-constructed tuple.
  # Both outputs have forward-mode formulas.
  1035. - name: _fused_rms_norm(Tensor input, int[] normalized_shape, Tensor? weight, float? eps) -> (Tensor, Tensor)
  1036. input, weight: "GradMode::is_enabled() || grads[1].defined() ? infinitely_differentiable_native_rms_norm_backward(grads[0], grads[1], input, normalized_shape, result1, weight, grad_input_mask) : (grads[0].defined() ? _fused_rms_norm_backward(grads[0], input, normalized_shape, result1, weight, grad_input_mask) : std::tuple<Tensor, Tensor>())"
  1037. result0: rms_norm_jvp(input_p, input_t, weight_p, weight_t, result1, normalized_shape)
  1038. result1: rms_norm_rstd_jvp(input_p, input_t, result1, normalized_shape)
  # Backward selection:
  #   - grad mode is enabled, or a gradient flows into result1 / result2: use
  #     the infinitely differentiable backward;
  #   - otherwise, if grads[0] is defined: call native_group_norm_backward_symint.
  #     On XPU devices grads[0] and input are passed through unchanged; on
  #     other devices they are first made contiguous (CPU: in
  #     input.suggest_memory_format(); otherwise MemoryFormat::Contiguous);
  #   - otherwise: return a default-constructed tuple.
  # All three outputs have forward-mode formulas.
  1039. - name: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
  1040. input, weight, bias: "GradMode::is_enabled() || grads[1].defined() || grads[2].defined() ? infinitely_differentiable_native_group_norm_backward(grads[0], grads[1], grads[2], input, result1, result2, weight, N, C, HxW, group, eps, grad_input_mask) : (grads[0].defined() ? native_group_norm_backward_symint(grads[0].device().is_xpu() ? grads[0] : grads[0].contiguous(grads[0].device().is_cpu() ? input.suggest_memory_format() : c10::MemoryFormat::Contiguous), input.device().is_xpu() ? input : input.contiguous(input.device().is_cpu() ? input.suggest_memory_format() : c10::MemoryFormat::Contiguous), result1, result2, weight, N, C, HxW, group, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>())"
  1041. result0: group_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, result1, result2, group)
  1042. result1: group_norm_mean_jvp(input_t, result1, group)
  1043. result2: group_norm_invstd_jvp(input_p, input_t, result1, result2, group)
  1044. - name: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  1045. self: zeros_like(self)
  1046. result: self_t.zero_()
  1047. - name: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  1048. self: zeros_like(self)
  1049. other: zeros_like(other)
  1050. result: self_t.zero_()
  1051. - name: neg(Tensor self) -> Tensor
  1052. self: grad.neg()
  1053. result: auto_element_wise
  # Backward is skipped (default-constructed tuple) when grad is undefined.
  # The fourth output, result3, is passed as the last argument of
  # batch_norm_backward; it is cloned first when retain_variables is set.
  # NOTE(review): presumably the clone protects result3 from being modified by
  # the kernel so backward can run again - confirm.
  # The forward-mode formula hard-codes training=true.
  1054. - name: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
  1055. input, weight, bias: "grad.defined() ? batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, /*update*/true, eps, grad_input_mask, retain_variables ? result3.clone() : result3) : std::tuple<Tensor, Tensor, Tensor>()"
  1056. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, true, eps)
  1057. - name: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
  1058. input, weight, bias: "grad.defined() ? batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, /*update*/false, eps, grad_input_mask, retain_variables ? result3.clone() : result3) : std::tuple<Tensor, Tensor, Tensor>()"
  1059. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, false, eps)
  1060. - name: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
  1061. input, weight, grad_out: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_out, running_mean, running_var, update, eps, save_mean, save_var, grad_input_mask)
  1062. save_mean: not_implemented("batch_norm_backward save_mean")
  1063. save_var: not_implemented("batch_norm_backward save_var")
  1064. reserve: not_implemented("batch_norm_backward reserve")
  1065. - name: nextafter(Tensor self, Tensor other) -> Tensor
  1066. self: not_implemented("nextafter")
  1067. other: not_implemented("nextafter")
  1068. - name: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
  1069. self: norm_backward(grad, self, p, result)
  1070. result: norm_jvp(self_p, self_t, p, result)
  1071. - name: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
  1072. self: norm_backward(grad, self, p, result, dim, keepdim)
  1073. result: norm_jvp(self_p, self_t, p, result, dim, keepdim)
  # Overload with an explicit output `dtype`: before calling norm_backward,
  # self is cast to grad's scalar type so that the backward computation runs
  # in the dtype of the incoming gradient.
  1074. - name: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
  1075. self: norm_backward(grad, self.to(grad.scalar_type()), p, result)
  1076. result: norm_jvp(self_p, self_t, p, result)
  1077. - name: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
  1078. self: norm_backward(grad, self.to(grad.scalar_type()), p, result, dim, keepdim)
  1079. result: norm_jvp(self_p, self_t, p, result, dim, keepdim)
  1080. - name: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  1081. self: linalg_vector_norm_backward(grad, self, ord, result, dim, keepdim)
  1082. result: linalg_vector_norm_jvp(self_p, self_t, ord, result, dim, keepdim)
  1083. - name: _pdist_forward(Tensor self, float p=2) -> Tensor
  1084. self: _pdist_backward(grad, self, p, result)
  1085. - name: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
  1086. grad: not_implemented("_pdist_backward")
  1087. self: not_implemented("_pdist_backward")
  1088. pdist: not_implemented("_pdist_backward")
  1089. - name: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor
  1090. x1, x2: _euclidean_dist_backward(grad, x1, x2, result)
  # Both gradients use the same _cdist_backward helper. For x2 the roles of
  # x1 and x2 are swapped, and grad and result are transposed (mT) and made
  # contiguous to match. No forward-mode formula is declared.
  1091. - name: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
  1092. x1: _cdist_backward(grad.contiguous(), x1, x2, p, result)
  1093. x2: _cdist_backward(grad.mT().contiguous(), x2, x1, p, result.mT().contiguous())
  1094. - name: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
  1095. grad: not_implemented("_cdist_backward")
  1096. x1: not_implemented("_cdist_backward")
  1097. x2: not_implemented("_cdist_backward")
  1098. cdist: not_implemented("_cdist_backward")
  1099. - name: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
  1100. self: zeros_like(grad)
  1101. result: self_t.zero_()
  1102. - name: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
  1103. mean: at::zeros_symint(mean.sym_sizes(), grad.options())
  1104. result: auto_element_wise
  1105. - name: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
  1106. std: at::zeros_symint(std.sym_sizes(), grad.options())
  1107. result: auto_element_wise
  1108. - name: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
  1109. mean: at::zeros_symint(mean.sym_sizes(), grad.options())
  1110. std: at::zeros_symint(std.sym_sizes(), grad.options())
  1111. result: zeros_like(mean_t)
  1112. - name: linalg_householder_product(Tensor input, Tensor tau) -> Tensor
  1113. input, tau: householder_product_backward(grad, result, input, tau)
  1114. result: householder_product_jvp(input_t, tau_t, result, input_p, tau_p)
  1115. - name: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
  1116. self, input2, input3: ormqr_backward(grad, result, self, input2, input3, left, transpose, grad_input_mask)
  1117. - name: permute(Tensor(a) self, int[] dims) -> Tensor(a)
  1118. self: permute_backwards(grad, dims)
  1119. result: auto_linear
  1120. - name: poisson(Tensor self, Generator? generator=None) -> Tensor
  1121. self: zeros_like(self)
  1122. result: auto_element_wise
  1123. - name: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
  1124. self: pow_backward(grad, self, exponent)
  1125. result: auto_element_wise
  # Backward uses one helper per input. The forward formula reuses those same
  # helpers instead of a dedicated JVP: each tangent is conjugated, fed in
  # where the helper expects the incoming grad, the two contributions are
  # summed, and the sum is conjugated back.
  1126. - name: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
  1127. self: pow_backward_self(grad, self, exponent)
  1128. exponent: pow_backward_exponent(grad, self, exponent, result)
  1129. result: (pow_backward_self(self_t.conj(), self_p, exponent_p) + pow_backward_exponent(exponent_t.conj(), self_p, exponent_p, result)).conj()
  1130. - name: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
  1131. exponent: pow_backward_exponent(grad, self, exponent, result)
  1132. result: auto_element_wise
  1133. - name: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
  1134. self: prod_backward(grad, self.to(grad.scalar_type()), result)
  1135. result: (prod_backward(at::ones({}, result.options()).expand_as(result), self_p.to(result.scalar_type()), result) * self_t.conj()).sum().conj()
  1136. - name: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  1137. self: prod_backward(grad, self.to(grad.scalar_type()), result, dim, keepdim)
  1138. result: (prod_backward(at::ones({}, result.options()).expand_as(result), self_p.to(result.scalar_type()), result, dim, keepdim) * self_t.conj()).sum(dim, keepdim).conj()
  #   self:   with accumulate=true the gradient passes through unchanged;
  #           otherwise the positions addressed by `index` are zeroed (a zero
  #           tensor shaped like source is put into grad).
  #   source: grad taken at `index`, reshaped to the shape of source.
  # Forward mode applies the same put to the tangents.
  1139. - name: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
  1140. self: "accumulate ? grad : grad.put(index, zeros_like(source), false)"
  1141. index: non_differentiable
  1142. source: grad.take(index).reshape_as(source)
  1143. result: self_t.put(index, source_t, accumulate)
  1144. - name: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
  1145. A: linalg_qr_backward(grad_Q, grad_R, Q, R, mode)
  1146. Q, R: linalg_qr_jvp(A_t, Q, R, mode)
  1147. - name: rad2deg(Tensor self) -> Tensor
  1148. self: rad2deg_backward(grad)
  1149. result: auto_element_wise
  1150. - name: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
  1151. self: zeros_like(grad)
  1152. result: self_t.zero_()
  1153. - name: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
  1154. self: zeros_like(grad)
  1155. result: self_t.zero_()
  1156. - name: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
  1157. self: zeros_like(grad)
  1158. result: self_t.zero_()
  # d/dx (1 / x) = -1 / x^2 = -(result)^2, so the saved output is reused
  # instead of the input.
  1159. - name: reciprocal(Tensor self) -> Tensor
  1160. self: -grad * (result * result).conj()
  1161. result: auto_element_wise
  1162. - name: remainder.Scalar(Tensor self, Scalar other) -> Tensor
  1163. self: grad
  1164. result: auto_element_wise
  # The formulas correspond to self - other * floor(self / other) with the
  # floored quotient treated as a constant: the derivative is 1 w.r.t. self
  # and -floor(self / other) w.r.t. other.
  1165. - name: remainder.Tensor(Tensor self, Tensor other) -> Tensor
  1166. self: grad
  1167. other: -grad * self.div(other, /*rounding_mode=*/"floor")
  1168. result: self_t - other_t * self_p.div(other_p, /*rounding_mode=*/"floor")
  1169. - name: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
  1170. self: renorm_backward(grad, self, p, dim, maxnorm)
  1171. result: renorm_jvp(self_p, self_t, p, dim, maxnorm)
  1172. - name: repeat(Tensor self, SymInt[] repeats) -> Tensor
  1173. self: repeat_backward(grad, repeats, self.sym_sizes())
  1174. result: auto_linear
  # The local derivative applied here is -(1 + log(self)), i.e. the derivative
  # of -x * log(x).
  1175. - name: special_entr(Tensor self) -> Tensor
  1176. self: grad * (-(1 + self.log()))
  1177. result: auto_element_wise
  # The local derivative is sqrt(2 * pi) * exp(result^2 / 2), i.e. the
  # reciprocal of the standard normal density evaluated at the output.
  1178. - name: special_ndtri(Tensor self) -> Tensor
  1179. self: grad * std::sqrt(2 * M_PI) * (result.square() / 2).exp()
  1180. result: auto_element_wise
  # The local derivative is exp(-(result + self^2 / 2)) / sqrt(2 * pi), i.e.
  # the standard normal density at self divided by exp(result).
  1181. - name: special_log_ndtr(Tensor self) -> Tensor
  1182. self: grad / std::sqrt(2 * M_PI) * (result + self.pow(2) / 2).neg().exp()
  1183. result: auto_element_wise
  1184. # [Note: Sometimes view derivatives]
  1185. # The following situation applies to other operations as well.
  1186. # TODO: This note is only referenced by to_dense and to_sparse*. Make
  1187. # this more generic if it's been referenced more than once.
  1188. #
  1189. # DO NOT define a backward for reshape!
  1190. # reshape is special in that it sometimes returns a view, and sometimes not.
  1191. # Defining a backward will make codegen spit out the forward call as
  1192. # as_variable(baseType->reshape(self)),
  1193. # making it impossible (hard) to detect when it is actually a view.
  1194. # - name: reshape(Tensor self, IntArrayRef shape)
  1195. - name: _reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
  1196. self: grad.reshape_symint(self.sym_sizes())
  1197. result: auto_linear
  1198. - name: round(Tensor self) -> Tensor
  1199. self: zeros_like(grad)
  1200. result: auto_element_wise
  1201. - name: round.decimals(Tensor self, *, int decimals) -> Tensor
  1202. self: zeros_like(grad)
  1203. result: auto_element_wise
  # d/dx x^(-1/2) = -1/2 * x^(-3/2) = -0.5 * result^3, so the saved output is
  # reused instead of the input.
  1204. - name: rsqrt(Tensor self) -> Tensor
  1205. self: -0.5 * grad * result.pow(3).conj()
  1206. result: auto_element_wise
  #   self: positions addressed by `index` are overwritten by src, so they are
  #         zeroed in self's gradient.
  #   src:  the gradient is grad gathered at `index`.
  # Forward mode scatters src's tangent into self's tangent.
  1207. - name: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
  1208. self: grad.scatter(dim, index, 0)
  1209. index: non_differentiable
  1210. src: grad.gather(dim, index)
  1211. result: self_t.scatter(dim, index, src_t)
  1212. - name: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
  1213. self: grad.scatter(dim, index, 0)
  1214. index: non_differentiable
  1215. result: self_t.scatter(dim, index, 0)
  # The gradient reaches `self` unchanged. The gradient of `src` is grad
  # gathered at `index`. Forward mode applies scatter_add to the tangents.
  1216. - name: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
  1217. self: grad
  1218. index: non_differentiable
  1219. src: grad.gather(dim, index)
  1220. result: scatter_add(self_t, dim, index, src_t)
  # Two dispatch keys. The default backward is built from the input's sizes
  # alone (select_backward_symint). The nested-tensor backward instead takes
  # `self` itself and declares no forward-mode (`result`) formula.
  1221. - name: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
  1222. dispatch:
  1223. Default:
  1224. self: select_backward_symint(grad, self.sym_sizes(), dim, index)
  1225. result: auto_linear
  1226. AutogradNestedTensor:
  1227. self: _nested_select_backward_symint(grad, self, dim, index)
  1228. - name: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
  1229. grad_output: grad.select_symint(dim, index)
  1230. result: auto_linear
  1231. - name: sigmoid(Tensor self) -> Tensor
  1232. self: sigmoid_backward(grad, result)
  1233. result: auto_element_wise
  1234. - name: logit(Tensor self, float? eps=None) -> Tensor
  1235. self: "GradMode::is_enabled() ? infinitely_differentiable_logit_backward(grad, self, eps) : logit_backward(grad, self, eps)"
  1236. result: auto_element_wise
  1237. - name: sign(Tensor self) -> Tensor
  1238. self: zeros_like(grad)
  1239. result: auto_element_wise
  1240. - name: sgn(Tensor self) -> Tensor
  1241. self: sgn_backward(self, grad, result)
  1242. # Cannot use auto_element_wise here because the Jacobian is *not* Hermitian (in fact, it is symmetric)
  1243. # The function is not holomorphic, so there's no reason for its Jacobian to be Hermitian
  1244. # auto_element_wise has a name that's a bit deceiving in the complex case
  1245. result: sgn_backward(self_p, self_t, result)
  1246. - name: sin(Tensor self) -> Tensor
  1247. self: grad * self.cos().conj()
  1248. result: auto_element_wise
  1249. - name: sinc(Tensor self) -> Tensor
  1250. self: sinc_backward(grad, self)
  1251. result: auto_element_wise
  1252. - name: sinh(Tensor self) -> Tensor
  1253. self: grad * self.cosh().conj()
  1254. result: auto_element_wise
  # slice.Tensor: the gradient for `self` is built by slice_backward_wrapper from `grad`,
  # self.sym_sizes() and the original dim/start/end/step arguments.
  1255. - name: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
  1256. self: slice_backward_wrapper(grad, self.sym_sizes(), dim, start, end, step)
  1257. result: auto_linear
  # slice_backward: the gradient for `grad_output` slices `grad` with the same dim/start/end/step.
  1258. - name: slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor
  1259. grad_output: grad.slice_symint(dim, start, end, step)
  1260. result: auto_linear
  # slice_inverse: `self` receives the slice of `grad`; `src` receives slice_scatter_symint called
  # with `grad` as the base tensor and zeros_like(self) as the scattered source.
  1261. - name: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
  1262. self: grad.slice_symint(dim, start, end, step)
  1263. src: slice_scatter_symint(grad, zeros_like(self), dim, start, end, step)
  1264. result: auto_linear
  # slice_scatter: `self` receives slice_scatter_symint called with `grad` as the base tensor and
  # zeros_like(src) as the scattered source; `src` receives the matching slice of `grad`.
  1265. - name: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
  1266. self: slice_scatter_symint(grad, zeros_like(src), dim, start, end, step)
  1267. src: grad.slice_symint(dim, start, end, step)
  1268. result: auto_linear
  # select_scatter: `self` receives select_scatter_symint called with `grad` as the base tensor and
  # zeros_like(src) as the scattered source; `src` receives grad.select_symint(dim, index).
  1269. - name: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
  1270. self: select_scatter_symint(grad, zeros_like(src), dim, index)
  1271. src: grad.select_symint(dim, index)
  1272. result: auto_linear
  # diagonal_scatter: `self` receives diagonal_scatter called with `grad` as the base tensor and
  # zeros_like(src) as the scattered source; `src` receives the matching diagonal of `grad`.
  1273. - name: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
  1274. self: diagonal_scatter(grad, zeros_like(src), offset, dim1, dim2)
  1275. src: grad.diagonal(offset, dim1, dim2)
  1276. result: auto_linear
  # as_strided_scatter: `self` uses as_strided_scatter_backward with the TensorGeometry of both
  # inputs; `src` applies as_strided_symint(size, stride, storage_offset) to grad.contiguous().
  1277. - name: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
  1278. self: as_strided_scatter_backward(grad, TensorGeometry(self), TensorGeometry(src), size, stride, storage_offset)
  1279. # See Note [as_strided_scatter backward support]
  1280. src: grad.contiguous().as_strided_symint(size, stride, storage_offset)
  1281. result: auto_linear
  # _linalg_solve_ex: A and B share a single linalg_solve_backward call (which also receives
  # grad_input_mask[1]); of the four outputs only `result` is marked differentiable.
  1282. - name: _linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info)
  1283. A, B: linalg_solve_backward(grad, result, A, LU, pivots, left, grad_input_mask[1])
  1284. result: "linalg_solve_jvp(A_t, B_t, result, LU, pivots, left)"
  1285. output_differentiability: [True, False, False, False] # LU is an auxiliary tensor not exposed to the user
  # sort: only `values` is differentiable. The backward passes `grad` and the returned `indices`
  # to value_selecting_reduction_backward_symint; the `values` formula gathers from `self_t`
  # with the same indices.
  1286. - name: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
  1287. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), true)
  1288. output_differentiability: [True, False]
  1289. values: gather_with_keepdimed_indices(self_t, dim, indices, true)
  # sort.stable: only `values` is differentiable. The backward passes `grad` and the returned
  # `indices` to value_selecting_reduction_backward_symint; the `values` formula gathers from
  # `self_t` with the same indices. The `stable` flag does not appear in any formula.
  1290. - name: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
  1291. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), true)
  1292. output_differentiability: [True, False]
  1293. values: gather_with_keepdimed_indices(self_t, dim, indices, true)
  # split.Tensor: the per-output gradients (`grads`) are passed to split_backward together with
  # split_size, dim, self.sym_sizes() and self.options().
  1294. - name: split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
  1295. self: split_backward(grads, split_size, dim, self.sym_sizes(), self.options())
  1296. result: auto_linear
  # unsafe_split.Tensor: the per-output gradients (`grads`) are passed to split_backward together
  # with split_size, dim, self.sym_sizes() and self.options().
  1297. - name: unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
  1298. self: split_backward(grads, split_size, dim, self.sym_sizes(), self.options())
  1299. result: auto_linear
  # split_with_sizes: formulas are split by dispatch key. Default passes self.sym_sizes() to
  # split_with_sizes_backward; AutogradNestedTensor passes the nested sizes read from the nested
  # tensor impl to _nested_split_with_sizes_backward and lists no `result` entry.
  1300. - name: split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
  1301. dispatch:
  1302. Default:
  1303. self: split_with_sizes_backward(grads, split_sizes, dim, self.sym_sizes(), self.options())
  1304. result: auto_linear
  1305. AutogradNestedTensor:
  1306. self: _nested_split_with_sizes_backward(grads, split_sizes, dim, at::native::get_nested_tensor_impl(self)->get_nested_sizes(), self.options())
  # unsafe_split_with_sizes: the per-output gradients (`grads`) are passed to
  # split_with_sizes_backward together with split_sizes, dim, self.sym_sizes() and self.options().
  1307. - name: unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
  1308. self: split_with_sizes_backward(grads, split_sizes, dim, self.sym_sizes(), self.options())
  1309. result: auto_linear
  # sqrt: the gradient for `self` is grad / (2 * conj(result)); it reuses the forward output.
  1310. - name: sqrt(Tensor self) -> Tensor
  1311. self: grad / (2 * result.conj())
  1312. result: auto_element_wise
  # squeeze: the gradient for `self` is unsqueeze_to(grad, self.sym_sizes()).
  1313. - name: squeeze(Tensor(a) self) -> Tensor(a)
  1314. self: unsqueeze_to(grad, self.sym_sizes())
  1315. result: auto_linear
  # squeeze.dim: formulas are split by dispatch key. Default uses unsqueeze_to with `dim` and
  # self.sym_sizes(); AutogradNestedTensor simply calls grad.unsqueeze(dim) and lists no `result`.
  1316. - name: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
  1317. dispatch:
  1318. Default:
  1319. self: unsqueeze_to(grad, dim, self.sym_sizes())
  1320. result: auto_linear
  1321. AutogradNestedTensor:
  1322. self: grad.unsqueeze(dim)
  # squeeze.dims: formulas are split by dispatch key. Default uses unsqueeze_to with `dim` and
  # self.sym_sizes(); AutogradNestedTensor uses unsqueeze_multiple(grad, dim, self.dim()) and
  # lists no `result`.
  1323. - name: squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)
  1324. dispatch:
  1325. Default:
  1326. self: unsqueeze_to(grad, dim, self.sym_sizes())
  1327. result: auto_linear
  1328. AutogradNestedTensor:
  1329. self: unsqueeze_multiple(grad, dim, self.dim())
  # squeeze_ (in-place): the gradient for `self` is unsqueeze_to(grad, self.sym_sizes()).
  1330. - name: squeeze_(Tensor(a!) self) -> Tensor(a!)
  1331. self: unsqueeze_to(grad, self.sym_sizes())
  1332. result: auto_linear
  # squeeze_.dim (in-place): the gradient for `self` is unsqueeze_to(grad, dim, self.sym_sizes()).
  1333. - name: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
  1334. self: unsqueeze_to(grad, dim, self.sym_sizes())
  1335. result: auto_linear
  # squeeze_.dims (in-place): the gradient for `self` is unsqueeze_to(grad, dim, self.sym_sizes()).
  1336. - name: squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)
  1337. self: unsqueeze_to(grad, dim, self.sym_sizes())
  1338. result: auto_linear
  # std.correction: the backward is std_backward. The `result` formula calls var_backward on the
  # conjugated `self_t` (keepdim forced to true), sums over `dim` (all dims when `dim` is unset),
  # takes the real part, divides by 2 * result, and writes 0 wherever result == 0.
  1339. - name: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  1340. self: std_backward(result, grad, self, dim, correction, keepdim)
  1341. # pointwise (variance) + sum + sqrt
  1342. result: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result)).masked_fill_(result == 0, 0)
  # std_mean.correction: the backward combines both output gradients in std_mean_backward.
  # `result0` (std) uses the var_backward-based expression with a masked fill where result0 == 0;
  # `result1` (mean) is the mean of `self_t` over the same dims.
  1343. - name: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  1344. self: std_mean_backward(grads[0], grads[1], self, result0, dim, correction, keepdim)
  1345. result0: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result0)).masked_fill_(result0 == 0, 0)
  1346. # linear
  1347. result1: mean(self_t, dim.value_or(IntArrayRef({})), keepdim)
  # sub.Tensor: `self` gets `grad`; `other` gets -grad scaled by conj(alpha) via maybe_multiply.
  # Both pass through handle_r_to_c keyed on the respective input's scalar type.
  1348. - name: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  1349. self: handle_r_to_c(self.scalar_type(), grad)
  1350. other: handle_r_to_c(other.scalar_type(), maybe_multiply(-grad, alpha.conj()))
  1351. result: self_t - maybe_multiply(other_t, alpha)
  # sub.Scalar: `self` gets `grad` passed through handle_r_to_c; no formula is given for the
  # Scalar `other`.
  1352. - name: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  1353. self: handle_r_to_c(self.scalar_type(), grad)
  1354. result: auto_element_wise
  # rsub.Tensor: `self` gets -grad scaled by conj(alpha) via maybe_multiply; `other` gets `grad`.
  # Both pass through handle_r_to_c keyed on the respective input's scalar type.
  1355. - name: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  1356. self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj()))
  1357. other: handle_r_to_c(other.scalar_type(), grad)
  1358. result: -maybe_multiply(self_t, alpha) + other_t
  # rsub.Scalar: `self` gets -grad scaled by conj(alpha), passed through handle_r_to_c; no formula
  # is given for the Scalar `other`.
  1359. - name: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
  1360. self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj()))
  1361. result: auto_element_wise
  # sum: formulas are split by dispatch key. Default expands `grad` to self.sym_sizes();
  # AutogradNestedTensor multiplies ones_like(self) by `grad` as a stand-in for expand_as
  # (see the TODO inside the entry).
  1362. - name: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
  1363. dispatch:
  1364. Default:
  1365. self: grad.expand_symint(self.sym_sizes())
  1366. result: auto_linear
  1367. AutogradNestedTensor:
  1368. # TODO: replace this with grad.expand_as(self) when that is supported
  1369. self: ones_like(self) * grad
  1370. result: auto_linear
  # sum.dim_IntList: formulas are split by dispatch key. Default uses sum_backward with
  # self.sym_sizes(); AutogradNestedTensor uses _nested_sum_backward with `self` itself and lists
  # no `result` entry.
  1371. - name: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  1372. dispatch:
  1373. Default:
  1374. self: sum_backward(grad, self.sym_sizes(), dim, keepdim)
  1375. result: auto_linear
  1376. AutogradNestedTensor:
  1377. # TODO: replace this function once semantics for nested tensor expand have been settled on
  1378. self: _nested_sum_backward(grad, self, dim, keepdim)
  # nansum: `grad` is converted to self's scalar type before nansum_backward. The `result` formula
  # substitutes 0 wherever `self_p` is NaN, keeps `self_t` elsewhere, and sums with the same
  # dim/keepdim/dtype.
  1379. - name: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  1380. self: nansum_backward(grad.to(self.scalar_type()), self, dim, keepdim)
  1381. result: at::where(self_p.isnan(), 0, self_t).sum(dim, keepdim, dtype)
  # _linalg_svd: when full_matrices is set, U (last dim) and Vh (second-to-last dim), along with
  # their gradients when defined, are narrowed to the first S.sym_size(-1) entries before
  # svd_backward is called. All three outputs share one linalg_svd_jvp formula.
  1382. # We never call _linalg_svd with compute_uv=False in an autograd context, so we don't even consider it here
  1383. - name: _linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
  1384. A: "svd_backward(full_matrices && grad_U.defined() ? grad_U.narrow_symint(-1, 0, S.sym_size(-1)) : grad_U,
  1385. grad_S,
  1386. full_matrices && grad_Vh.defined() ? grad_Vh.narrow_symint(-2, 0, S.sym_size(-1)) : grad_Vh,
  1387. full_matrices ? U.narrow_symint(-1, 0, S.sym_size(-1)) : U,
  1388. S,
  1389. full_matrices ? Vh.narrow_symint(-2, 0, S.sym_size(-1)) : Vh)"
  1390. U, S, Vh: linalg_svd_jvp(A_t, U, S, Vh, full_matrices)
  # _linalg_eigh: uses linalg_eig_backward / linalg_eig_jvp with is_hermitian=true.
  1391. - name: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
  1392. A: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/true)
  1393. eigenvalues, eigenvectors: linalg_eig_jvp(A_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)
  # linalg_eig: uses linalg_eig_backward / linalg_eig_jvp with is_hermitian=false; the backward
  # result additionally passes through handle_r_to_c keyed on self's scalar type.
  1394. - name: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)
  1395. self: handle_r_to_c(self.scalar_type(), linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/false))
  1396. eigenvalues, eigenvectors: linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false)
  # t: the gradient for `self` is grad.t().
  1397. - name: t(Tensor(a) self) -> Tensor(a)
  1398. self: grad.t()
  1399. result: auto_linear
  # t_ (in-place): the gradient for `self` is grad.t().
  1400. - name: t_(Tensor(a!) self) -> Tensor(a!)
  1401. self: grad.t()
  1402. result: auto_linear
  # one_hot: `self` is declared non_differentiable.
  1403. - name: one_hot(Tensor self, int num_classes=-1) -> Tensor
  1404. self: non_differentiable
  # flip: the gradient for `self` flips `grad` along the same dims.
  1405. - name: flip(Tensor self, int[] dims) -> Tensor
  1406. self: grad.flip(dims)
  1407. result: auto_linear
  # roll: the gradient for `self` rolls `grad` with the shifts reversed and negated, over the
  # reversed list of dims.
  1408. - name: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
  1409. self: grad.roll_symint(fmap(reverse_list_symint(shifts), [](c10::SymInt i){return -i;}), reverse_list(dims))
  1410. result: auto_linear
  # rot90: the gradient for `self` rotates `grad` by -k over the same dims.
  1411. - name: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
  1412. self: grad.rot90(-k, dims)
  1413. result: auto_linear
  # take: the gradient for `self` is delegated to take_backward; `index` is non_differentiable.
  1414. - name: take(Tensor self, Tensor index) -> Tensor
  1415. self: take_backward(grad, self, index)
  1416. index: non_differentiable
  1417. result: auto_linear
  # tan: the gradient for `self` is grad * conj(1 + result^2); it reuses the forward output.
  1418. - name: tan(Tensor self) -> Tensor
  1419. self: grad * (1 + result.pow(2)).conj()
  1420. result: auto_element_wise
  # tanh: the backward is computed from the forward output (`result`), not from `self`.
  1421. - name: tanh(Tensor self) -> Tensor
  1422. self: tanh_backward(grad, result)
  1423. result: auto_element_wise
  # topk: only `values` is differentiable. The backward passes `grad` and the returned `indices`
  # to value_selecting_reduction_backward_symint; the `values` formula gathers `self_t` at those
  # indices.
  1424. - name: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
  1425. self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), true)
  1426. output_differentiability: [True, False]
  1427. values: gather(self_t, dim, indices)
  # trace: the gradient for `self` is trace_backward_symint(grad, self.sym_sizes()).
  1428. - name: trace(Tensor self) -> Tensor
  1429. self: trace_backward_symint(grad, self.sym_sizes())
  1430. result: auto_linear
  # transpose.int: the gradient for `self` transposes `grad` over the same pair of dims.
  1431. - name: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
  1432. self: grad.transpose(dim0, dim1)
  1433. result: auto_linear
  # transpose_ (in-place): the gradient for `self` transposes `grad` over the same pair of dims.
  1434. - name: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
  1435. self: grad.transpose(dim0, dim1)
  1436. result: auto_linear
  # triangular_solve: `self` and `A` share one triangular_solve_backward call that receives both
  # output gradients and grad_input_mask. `solution` uses triangular_solve_jvp; the
  # `cloned_coefficient` formula is simply `A_t`.
  1437. - name: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
  1438. self, A: triangular_solve_backward(grad_solution, grad_cloned_coefficient, self, A, solution, upper, transpose, unitriangular, grad_input_mask)
  1439. solution: triangular_solve_jvp(solution, A_p, A_t, self_t, upper, transpose, unitriangular)
  1440. cloned_coefficient: A_t
  # linalg_solve_triangular: `self` and `B` share one linalg_solve_triangular_backward call;
  # `result` uses linalg_solve_triangular_forward_AD.
  1441. - name: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
  1442. self, B: linalg_solve_triangular_backward(grad, self, result, upper, left, unitriangular, grad_input_mask)
  1443. result: linalg_solve_triangular_forward_AD(self_t, B_t, self_p, result, upper, left, unitriangular)
  # tril: the gradient for `self` applies tril_symint with the same `diagonal` to `grad`.
  1444. - name: tril(Tensor self, SymInt diagonal=0) -> Tensor
  1445. self: grad.tril_symint(diagonal)
  1446. result: auto_linear
  # triu: the gradient for `self` applies triu_symint with the same `diagonal` to `grad`.
  1447. - name: triu(Tensor self, SymInt diagonal=0) -> Tensor
  1448. self: grad.triu_symint(diagonal)
  1449. result: auto_linear
  # trunc: the gradient for `self` is always zeros_like(grad).
  1450. - name: trunc(Tensor self) -> Tensor
  1451. self: zeros_like(grad)
  1452. result: auto_element_wise
  # hash_tensor: the single output is marked non-differentiable; no formulas are defined.
  1453. - name: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
  1454. output_differentiability: [False]
  1455. # DO NOT define a backward for to_dense
  1456. # See [Note: Sometimes view derivatives]
  1457. # - name: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
  1458. #
  # _to_dense: the underscore-prefixed op carries the formula, to_dense_backward(grad, self, masked_grad).
  1459. - name: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
  1460. self: to_dense_backward(grad, self, masked_grad)
  1461. # DO NOT define a backward for to_sparse.sparse_dim
  1462. # See [Note: Sometimes view derivatives]
  1463. # - name: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
  1464. #
  # _to_sparse.sparse_dim: the backward is to_sparse_backward, given the input's layout and
  # symbolic blocksize.
  1465. - name: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
  1466. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  1467. # DO NOT define a backward for to_sparse
  1468. # See [Note: Sometimes view derivatives]
  1469. # - name: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
  1470. #
  # _to_sparse: the backward is to_sparse_backward, given the input's layout and symbolic blocksize.
  1471. - name: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
  1472. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  1473. # DO NOT define a backward for to_sparse_csr
  1474. # See [Note: Sometimes view derivatives]
  1475. # - name: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
  1476. #
  # _to_sparse_csr: the backward is to_sparse_backward, given the input's layout and symbolic blocksize.
  1477. - name: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
  1478. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  1479. # DO NOT define a backward for to_sparse_csc
  1480. # See [Note: Sometimes view derivatives]
  1481. # - name: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
  1482. #
  # _to_sparse_csc: the backward is to_sparse_backward, given the input's layout and symbolic blocksize.
  1483. - name: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
  1484. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  1485. # DO NOT define a backward for to_sparse_bsr
  1486. # See [Note: Sometimes view derivatives]
  1487. # - name: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
  1488. #
  # _to_sparse_bsr: the backward is to_sparse_backward, given the input's layout and symbolic
  # blocksize (the `blocksize` argument itself is not referenced by the formula).
  1489. - name: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
  1490. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  1491. # DO NOT define a backward for to_sparse_bsc
  1492. # See [Note: Sometimes view derivatives]
  1493. # - name: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
  1494. #
  # _to_sparse_bsc: the backward is to_sparse_backward, given the input's layout and symbolic
  # blocksize (the `blocksize` argument itself is not referenced by the formula).
  1495. - name: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
  1496. self: to_sparse_backward(grad, self.layout(), self.sym_blocksize())
  # to_mkldnn: the gradient for `self` is delegated to to_mkldnn_backward(grad, self).
  1497. - name: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
  1498. self: to_mkldnn_backward(grad, self)
  # unfold: the gradient for `self` is unfold_backward_symint, given self.sym_sizes() and the
  # original dimension/size/step.
  1499. - name: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
  1500. self: unfold_backward_symint(grad, self.sym_sizes(), dimension, size, step)
  1501. result: auto_linear
  # unfold_backward: the gradient for `grad_in` unfolds `grad` with the same dim/size/step.
  1502. - name: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
  1503. grad_in: grad.unfold(dim, size, step)
  1504. result: auto_linear
  # uniform_: the gradient for `self` is zeros_like(grad), and the `result` formula zeroes
  # `self_t` in place.
  1505. - name: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
  1506. self: zeros_like(grad)
  1507. result: self_t.zero_()
  # _unique: only the first output is marked differentiable, and its backward is declared
  # not_implemented.
  1508. - name: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)
  1509. output_differentiability: [True, False]
  1510. self: not_implemented("_unique")
  # unique_dim: only the first output is marked differentiable, and its backward is declared
  # not_implemented.
  1511. - name: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  1512. output_differentiability: [True, False, False]
  1513. self: not_implemented("unique_dim")
  # unique_consecutive: only the first output is marked differentiable, and its backward is
  # declared not_implemented.
  1514. - name: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor)
  1515. output_differentiability: [True, False, False]
  1516. self: not_implemented("unique_consecutive")
  # unique_dim_consecutive: only the first output is marked differentiable, and its backward is
  # declared not_implemented.
  1517. - name: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  1518. output_differentiability: [True, False, False]
  1519. self: not_implemented("unique_dim_consecutive")
  # _unique2: only the first output is marked differentiable, and its backward is declared
  # not_implemented.
  1520. - name: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
  1521. output_differentiability: [True, False, False]
  1522. self: not_implemented("_unique2")
  # _unsafe_view: the gradient for `self` reshapes `grad` back to self.sym_sizes().
  1523. - name: _unsafe_view(Tensor self, SymInt[] size) -> Tensor
  1524. self: grad.reshape_symint(self.sym_sizes())
  1525. result: auto_linear
  # lift: the gradient for `self` is `grad` unchanged.
  1526. - name: lift(Tensor self) -> Tensor
  1527. self: grad
  1528. result: auto_linear
  # lift_fresh: the gradient for `self` is `grad` unchanged.
  1529. - name: lift_fresh(Tensor(a) self) -> Tensor(a)
  1530. self: grad
  1531. result: auto_linear
  # unsqueeze: the gradient for `self` squeezes `grad` along the same dim.
  1532. - name: unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
  1533. self: grad.squeeze(dim)
  1534. result: auto_linear
  # unsqueeze_ (in-place): the gradient for `self` squeezes `grad` along the same dim.
  1535. - name: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
  1536. self: grad.squeeze(dim)
  1537. result: auto_linear
  # var.correction: the backward is var_backward. The `result` formula calls var_backward on the
  # conjugated `self_t` (keepdim forced to true), sums over `dim` (all dims when `dim` is unset)
  # and takes the real part.
  1538. - name: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  1539. self: var_backward(grad, self, dim, correction, keepdim)
  1540. # pointwise + sum
  1541. result: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
  # var_mean.correction: the backward combines both output gradients in var_mean_backward.
  # `result0` (var) uses the var_backward-based expression; `result1` (mean) is the mean of
  # `self_t` over the same dims.
  1542. - name: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  1543. self: var_mean_backward(grads[0], grads[1], self, dim, correction, keepdim)
  1544. result0: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
  1545. # linear
  1546. result1: mean(self_t, dim.value_or(IntArrayRef({})), keepdim)
  # view: formulas are split by dispatch key. Default reshapes `grad` to self.sym_sizes();
  # AutogradNestedTensor uses grad.reshape_as(self). Both keys declare `result: auto_linear`.
  1547. - name: view(Tensor(a) self, SymInt[] size) -> Tensor(a)
  1548. dispatch:
  1549. Default:
  1550. self: grad.reshape_symint(self.sym_sizes())
  1551. result: auto_linear
  1552. AutogradNestedTensor:
  1553. self: grad.reshape_as(self)
  1554. result: auto_linear
  # view.dtype: the single output is marked non-differentiable; no formulas are defined.
  1555. - name: view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)
  1556. output_differentiability: [False]
  # view_as_real: the gradient for `self` re-views grad.contiguous() as complex; the `result`
  # formula views `self_t` as real.
  1557. - name: view_as_real(Tensor(a) self) -> Tensor(a)
  1558. self: at::view_as_complex(grad.contiguous()) # gx0 + 1j * gx1
  1559. result: at::view_as_real(self_t)
  # view_as_complex: the gradient for `self` makes `grad` contiguous, resolves its conjugate bit,
  # and views it as real; the `result` formula views `self_t` as complex.
  1560. - name: view_as_complex(Tensor(a) self) -> Tensor(a)
  1561. self: at::view_as_real(grad.contiguous().resolve_conj()) # [gx, gy]
  1562. result: at::view_as_complex(self_t)
  # where.self: `grad` is routed to `self` where `condition` holds and to `other` elsewhere, with
  # 0 in the remaining positions; `condition` is non_differentiable. The `result` formula applies
  # the same selection to `self_t` / `other_t`.
  1563. - name: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
  1564. condition: non_differentiable
  1565. self: where(condition, grad, 0)
  1566. other: where(condition, 0, grad)
  1567. result: where(condition, self_t, other_t)
  # _weight_norm_interface: `v` and `g` share one formula. With an undefined `grad` it yields an
  # empty tuple; otherwise it picks _weight_norm_differentiable_backward when GradMode is enabled
  # and _weight_norm_interface_backward when it is not, both on grad.contiguous().
  1568. # weight_norm_cuda_interface_backward does not have an explicitly defined derivative, so if we do happen
  1569. # to be running backward with create_graph=True, fall back to a backward function that uses
  1570. # differentiable ops.
  1571. - name: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
  1572. v, g: "grad.defined() ? (GradMode::is_enabled() ? _weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_interface_backward(grad.contiguous(), v, g, result1, dim)) : std::tuple<Tensor, Tensor>()"
  # zero_: the gradient for `self` is zeros_like(grad).
  1573. - name: zero_(Tensor(a!) self) -> Tensor(a!)
  1574. self: zeros_like(grad)
  1575. result: auto_linear
  # sparse_mask: the gradient for `self` is sparse_mask_backward(grad, mask, self.layout());
  # `mask` is non_differentiable.
  1576. - name: sparse_mask(Tensor self, Tensor mask) -> Tensor
  1577. self: sparse_mask_backward(grad, mask, self.layout())
  1578. mask: non_differentiable
  # _sparse_coo_tensor_with_dims_and_tensors: `indices` is non_differentiable; the gradient for
  # `values` masks `grad` with the forward `result` and takes its _values().
  1579. - name: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
  1580. indices: non_differentiable
  1581. values: grad.sparse_mask(result)._values()
  # sparse_compressed_tensor.comp_plain_value_size: both index tensors are non_differentiable.
  # The gradient for `values` densifies `grad`, masks it with the forward `result`, and takes
  # values(); the to_dense step is flagged as temporary by the TODO inside the entry.
  1582. - name: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
  1583. compressed_indices: non_differentiable
  1584. plain_indices: non_differentiable
  1585. # TODO: remove to_dense after gh-107381 is fixed
  1586. values: grad.to_dense().sparse_mask(result).values()
  # _sparse_sum.dim: the gradient for `self` is delegated to at::_sparse_sum_backward(grad, self, dim).
  1587. - name: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
  1588. self: at::_sparse_sum_backward(grad, self, dim)
  # _standard_gamma: the gradient for `self` is `grad` times _standard_gamma_grad(self, result).
  1589. - name: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor
  1590. self: grad * _standard_gamma_grad(self, result)
  # _standard_gamma_grad: the backward for `self` is declared not_implemented; no formula is
  # listed for `output`.
  1591. - name: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
  1592. self: not_implemented("_standard_gamma_grad")
  # values: formulas are split by dispatch key. Default uses values_backward(grad, self);
  # AutogradNestedTensor rebuilds a nested view from grad.contiguous() using self's nested
  # sizes, strides and storage offsets.
  1593. - name: values(Tensor(a) self) -> Tensor(a)
  1594. dispatch:
  1595. Default:
  1596. self: values_backward(grad, self)
  1597. AutogradNestedTensor:
  1598. self: at::_nested_view_from_buffer(grad.contiguous(), self._nested_tensor_size(), self._nested_tensor_strides(), self._nested_tensor_storage_offsets())
  # _values: the single output is marked non-differentiable; the rationale is in the note
  # referenced below.
  1599. # Why is _values() not differentiable?
  1600. # See NOTE [ Sparse: autograd and API ]
  1601. - name: _values(Tensor(a) self) -> Tensor(a)
  1602. output_differentiability: [False]
  1603. # NN
  # _trilinear: all three inputs share one _trilinear_backward call. Each input is wrapped with
  # wrap_opt_if so it is only passed when a gradient for one of the other two inputs is requested.
  # The `result` formula sums three _trilinear calls, each substituting one input's tangent.
  1604. - name: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
  1605. i1, i2, i3: "_trilinear_backward(grad,
  1606. wrap_opt_if(i1, grad_input_mask[1] || grad_input_mask[2]),
  1607. wrap_opt_if(i2, grad_input_mask[0] || grad_input_mask[2]),
  1608. wrap_opt_if(i3, grad_input_mask[0] || grad_input_mask[1]),
  1609. expand1, expand2, expand3, sumdim, grad_input_mask)"
  1610. result: "_trilinear(i1_t, i2_p, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) +
  1611. _trilinear(i1_p, i2_t, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) +
  1612. _trilinear(i1_p, i2_p, i3_t, expand1, expand2, expand3, sumdim, unroll_dim)"
  # constant_pad_nd: the backward is constant_pad_nd_backward(grad, pad). The `result` formula
  # pads `self_t` with the literal 0 rather than with `value`.
  1613. - name: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
  1614. self: constant_pad_nd_backward(grad, pad)
  1615. result: constant_pad_nd_symint(self_t, pad, 0)
  # binary_cross_entropy: `self` and `target` have separate backward functions. The `result`
  # formula evaluates both with at::Reduction::None on the tangents, adds them, and applies the
  # requested reduction once via apply_loss_reduction.
  1616. - name: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
  1617. self: binary_cross_entropy_backward(grad, self, target, weight, reduction)
  1618. target: binary_cross_entropy_target_backward(grad, self, target, weight, reduction)
  1619. result: "apply_loss_reduction(
  1620. binary_cross_entropy_backward(self_t, self_p, target_p, weight, at::Reduction::None)
  1621. + binary_cross_entropy_target_backward(target_t, self_p, target_p, weight, at::Reduction::None),
  1622. reduction)"
  # binary_cross_entropy_backward: formulas for differentiating the backward op itself. `self`,
  # `target` and `grad_output` each have a dedicated *_double_backward* helper, and the `result`
  # formula is the sum of the same three helpers evaluated on the tangents.
  1623. - name: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
  1624. self: binary_cross_entropy_double_backward(grad_output, grad, self, target, weight, reduction)
  1625. target: binary_cross_entropy_double_backward_target(grad, grad_output, self, target, weight, reduction)
  1626. grad_output: binary_cross_entropy_double_backward_grad_output(grad, self, target, weight, reduction)
  1627. result: " binary_cross_entropy_double_backward(grad_output_p, self_t, self_p, target_p, weight, reduction)
  1628. + binary_cross_entropy_double_backward_target(target_t, grad_output_p, self_p, target_p, weight, reduction)
  1629. + binary_cross_entropy_double_backward_grad_output(grad_output_t, self_p, target_p, weight, reduction)"
  # binary_cross_entropy_with_logits: `self` and `target` have separate backward functions. The
  # `result` formula evaluates both with at::Reduction::None on the tangents, adds them, and
  # applies the requested reduction once via apply_loss_reduction.
  1630. - name: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
  1631. self: binary_cross_entropy_with_logits_backward(grad, self, target, weight, pos_weight, reduction)
  1632. target: binary_cross_entropy_with_logits_target_backward(grad, self, target, weight, pos_weight, reduction)
  1633. result: "apply_loss_reduction(
  1634. binary_cross_entropy_with_logits_backward(self_t, self_p, target_p, weight, pos_weight, at::Reduction::None)
  1635. + binary_cross_entropy_with_logits_target_backward(target_t, self_p, target_p, weight, pos_weight, at::Reduction::None),
  1636. reduction)"
  # embedding: `indices` is non_differentiable; the gradient for `weight` is
  # embedding_backward_symint, given weight.sym_size(0) as the number of weights.
  1637. - name: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
  1638. indices: non_differentiable
  1639. weight: embedding_backward_symint(grad, indices, weight.sym_size(0), padding_idx, scale_grad_by_freq, sparse)
  1640. result: auto_linear
  # embedding_dense_backward: the gradient for `grad_output` is
  # embedding_dense_double_backward_symint(grad, indices, padding_idx); `indices` is
  # non_differentiable.
  1641. - name: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
  1642. grad_output: embedding_dense_double_backward_symint(grad, indices, padding_idx)
  1643. indices: non_differentiable
  1644. result: auto_linear
  # _embedding_bag: `indices` and `offsets` are non_differentiable. The `weight` backward
  # consumes the auxiliary outputs result1..result3; `per_sample_weights` has its own backward
  # that uses result1.
  1645. - name: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
  1646. indices: non_differentiable
  1647. offsets: non_differentiable
  1648. weight: _embedding_bag_backward_symint(grad, indices, offsets, result1, result2, result3, weight.sym_size(0), scale_grad_by_freq, mode, sparse, per_sample_weights, padding_idx)
  1649. per_sample_weights: _embedding_bag_per_sample_weights_backward(grad, weight, indices, offsets, result1, mode, padding_idx)
  # _embedding_bag_backward: differentiating this backward op is declared not_implemented for
  # `grad` and `per_sample_weights`; every index/bookkeeping tensor is non_differentiable.
  1650. - name: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
  1651. grad: not_implemented("_embedding_bag_backward")
  1652. indices: non_differentiable
  1653. offsets: non_differentiable
  1654. offset2bag: non_differentiable
  1655. bag_size: non_differentiable
  1656. maximum_indices: non_differentiable
  1657. per_sample_weights: not_implemented("_embedding_bag_backward")
  # _embedding_bag_dense_backward: differentiating this backward op is declared not_implemented
  # for `grad` and `per_sample_weights`; every index/bookkeeping tensor is non_differentiable.
  1658. - name: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
  1659. grad: not_implemented("_embedding_bag_dense_backward")
  1660. indices: non_differentiable
  1661. offset2bag: non_differentiable
  1662. bag_size: non_differentiable
  1663. maximum_indices: non_differentiable
  1664. per_sample_weights: not_implemented("_embedding_bag_dense_backward")
  # embedding_renorm_: `indices` is non_differentiable and the backward for `self` is declared
  # not_implemented (the message string is "embedding_renorm", without the trailing underscore).
  1665. - name: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
  1666. indices: non_differentiable
  1667. self: not_implemented("embedding_renorm")
  # mse_loss: `target` reuses mse_loss_backward with the self/target arguments swapped. The
  # `result` formula evaluates both directions unreduced on conjugated tangents, conjugates the
  # outputs back, adds them, and applies the reduction once.
  1668. - name: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  1669. self: mse_loss_backward(grad, self, target, reduction)
  1670. target: mse_loss_backward(grad, target, self, reduction)
  1671. result: apply_loss_reduction(mse_loss_backward(self_t.conj(), self_p, target_p, at::Reduction::None).conj() + mse_loss_backward(target_t.conj(), target_p, self_p, at::Reduction::None).conj(), reduction)
  # multi_margin_loss: the gradient for `self` is multi_margin_loss_backward; `target` is
  # non_differentiable.
  1672. - name: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor
  1673. self: multi_margin_loss_backward(grad, self, target, p, margin, weight, reduction)
  1674. target: non_differentiable
  # multilabel_margin_loss_forward: the backward reuses the auxiliary `is_target` output;
  # `target` is non_differentiable.
  1675. - name: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target)
  1676. self: multilabel_margin_loss_backward(grad, self, target, reduction, is_target)
  1677. target: non_differentiable
  # nll_loss_forward: the backward reuses the auxiliary `total_weight` output; `target` is
  # non_differentiable. The `output` formula re-runs the forward op on `self_t` and keeps its
  # first returned tensor.
  1678. - name: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
  1679. self: nll_loss_backward_symint(grad, self, target, weight, reduction, ignore_index, total_weight)
  1680. target: non_differentiable
  1681. output: std::get<0>(nll_loss_forward_symint(self_t, target, weight, reduction, ignore_index))
  # nll_loss2d_forward: the backward reuses the auxiliary `total_weight` output; `target` is
  # non_differentiable. The `output` formula re-runs the forward op on `self_t` and keeps its
  # first returned tensor.
  1682. - name: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
  1683. self: nll_loss2d_backward_symint(grad, self, target, weight, reduction, ignore_index, total_weight)
  1684. target: non_differentiable
  1685. output: std::get<0>(nll_loss2d_forward_symint(self_t, target, weight, reduction, ignore_index))
  # smooth_l1_loss: `target` reuses smooth_l1_loss_backward with the self/target arguments
  # swapped. The `result` formula evaluates both directions unreduced on conjugated tangents,
  # conjugates the outputs back, adds them, and applies the reduction once.
  1686. - name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
  1687. self: smooth_l1_loss_backward(grad, self, target, reduction, beta)
  1688. target: smooth_l1_loss_backward(grad, target, self, reduction, beta)
  1689. result: apply_loss_reduction(smooth_l1_loss_backward(self_t.conj(), self_p, target_p, at::Reduction::None, beta).conj() + smooth_l1_loss_backward(target_t.conj(), target_p, self_p, at::Reduction::None, beta).conj(), reduction)
  # huber_loss: `target` reuses huber_loss_backward with the self/target arguments swapped. The
  # `result` formula evaluates both directions unreduced on conjugated tangents, conjugates the
  # outputs back, adds them, and applies the reduction once.
  1690. - name: huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor
  1691. self: huber_loss_backward(grad, self, target, reduction, delta)
  1692. target: huber_loss_backward(grad, target, self, reduction, delta)
  1693. result: apply_loss_reduction(huber_loss_backward(self_t.conj(), self_p, target_p, at::Reduction::None, delta).conj() + huber_loss_backward(target_t.conj(), target_p, self_p, at::Reduction::None, delta).conj(), reduction)
  # soft_margin_loss: only `self` has a formula (none is listed for `target`). The `result`
  # formula evaluates soft_margin_loss_backward unreduced on the conjugated tangent, conjugates
  # the output back, and applies the reduction.
  1694. - name: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
  1695. self: soft_margin_loss_backward(grad, self, target, reduction)
  1696. result: apply_loss_reduction(soft_margin_loss_backward(self_t.conj(), self_p, target, at::Reduction::None).conj(), reduction)
  # relu: the backward is threshold_backward at threshold 0, evaluated on the forward output
  # (`result`) rather than on `self`.
  1697. - name: relu(Tensor self) -> Tensor
  1698. self: threshold_backward(grad, result, 0)
  1699. result: auto_element_wise
  # silu: uses infinitely_differentiable_silu_backward while GradMode is enabled and the plain
  # silu_backward otherwise.
  # NOTE(review): presumably so that the backward itself stays differentiable - confirm.
  1700. - name: silu(Tensor self) -> Tensor
  1701. self: "GradMode::is_enabled() ? infinitely_differentiable_silu_backward(grad, self) : silu_backward(grad, self)"
  1702. result: auto_element_wise
  # mish: uses infinitely_differentiable_mish_backward while GradMode is enabled and the plain
  # mish_backward otherwise.
  # NOTE(review): presumably so that the backward itself stays differentiable - confirm.
  1703. - name: mish(Tensor self) -> Tensor
  1704. self: "GradMode::is_enabled() ? infinitely_differentiable_mish_backward(grad, self) : mish_backward(grad, self)"
  1705. result: auto_element_wise
  # elu: the backward calls elu_backward with is_result=false and passes the input `self`.
  1706. - name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
  1707. self: elu_backward(grad, alpha, scale, input_scale, /* is_result */ false, self)
  1708. result: auto_element_wise
  # elu_ (in-place): the backward calls elu_backward with is_result=true and passes the output
  # `result`. The `result` formula copies elu_backward of `original_self_t` into `self_t`.
  1709. - name: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)
  1710. self: elu_backward(grad, alpha, scale, input_scale, /* is_result */ true, result)
  1711. result: self_t.copy_(elu_backward(original_self_t, alpha, scale, input_scale, /* is_result */ true, result))
  # celu: reuses elu_backward with scale fixed to 1 and input_scale = 1.0 / alpha.toFloat(),
  # is_result=false, passing the input `self`.
  1712. - name: celu(Tensor self, Scalar alpha=1.0) -> Tensor
  1713. self: elu_backward(grad, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ false, self)
  1714. result: auto_element_wise
  # celu_ (in-place): reuses elu_backward with scale fixed to 1 and input_scale =
  # 1.0 / alpha.toFloat(), is_result=true, passing the output `result`. The `result` formula
  # copies the same expression evaluated on `original_self_t` into `self_t`.
  1715. - name: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
  1716. self: elu_backward(grad, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ true, result)
  1717. result: self_t.copy_(elu_backward(original_self_t, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ true, result))
  1718. - name: gelu(Tensor self, *, str approximate='none') -> Tensor
  # `self` forwards grad, self and the `approximate` string to gelu_backward.
  1719. self: gelu_backward(grad, self, approximate)
  1720. result: auto_element_wise
  1721. - name: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
  # `result` is a sum of two terms: gelu_backward applied to grad_output_t, plus
  # gelu_double_backward applied to self_t (with grad_output_p and self_p).
  1722. grad_output: gelu_backward(grad, self, approximate)
  1723. self: gelu_double_backward(grad, grad_output, self, approximate)
  1724. result: gelu_backward(grad_output_t, self_p, approximate) + gelu_double_backward(self_t, grad_output_p, self_p, approximate)
  1725. - name: glu(Tensor self, int dim=-1) -> Tensor
  1726. # TODO: glu_backward can benefit from forward result,
  1727. # and forward ad/forward over reverse ad for that matter
  # As written, `self` calls glu_backward without the forward result, while `result` calls
  # glu_jvp, which does receive it as its first argument.
  1728. self: glu_backward(grad, self, dim)
  1729. result: glu_jvp(result, self_p, self_t, dim)
  1730. - name: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
  # `self` forwards grad, self and lambd to hardshrink_backward.
  1731. self: hardshrink_backward(grad, self, lambd)
  1732. result: auto_element_wise
  1733. - name: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
  # `self` gets zeros_like(grad). `result` keeps grad_out_t where self_p > lambd or
  # self_p < -lambd and uses an expanded scalar zero elsewhere.
  1734. grad_out: hardshrink_backward(grad, self, lambd)
  1735. self: zeros_like(grad)
  1736. result: at::where((self_p > lambd).logical_or(self_p < -lambd), grad_out_t, at::zeros({}, result.options()).expand_as(result))
  1737. - name: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
  # `self` forwards grad, self, min_val and max_val to hardtanh_backward.
  1738. self: hardtanh_backward(grad, self, min_val, max_val)
  1739. result: auto_element_wise
  1740. - name: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
  # `self` calls leaky_relu_backward on the input; the trailing false is its self_is_result argument.
  1741. self: leaky_relu_backward(grad, self, negative_slope, false)
  1742. result: auto_element_wise
  1743. - name: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
  # Both formulas pass the forward `result` with self_is_result=true. `result` conjugates
  # original_self_t before the call and the output after it, then copies into self_t.
  1744. self: leaky_relu_backward(grad, result, negative_slope, true)
  1745. result: self_t.copy_(leaky_relu_backward(original_self_t.conj(), result, negative_slope, true).conj())
  1746. - name: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
  # output_differentiability is [True, False] for (output, buffer); both formulas pass `buffer`
  # through to log_sigmoid_backward. The `output` formula conjugates self_t and the call's output.
  1747. self: log_sigmoid_backward(grad, self, buffer)
  1748. output: log_sigmoid_backward(self_t.conj(), self_p, buffer).conj()
  1749. output_differentiability: [True, False]
  1750. - name: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  # `self` is computed from the forward `result` and self.scalar_type(); half_to_float is not passed.
  # `result` subtracts logsumexp_jvp over {dim} (last argument true) from self_t.
  1751. self: _log_softmax_backward_data(grad, result, dim, self.scalar_type())
  1752. result: self_t - logsumexp_jvp(self_p, self_t, {dim}, true)
  1753. - name: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  # Only a `self` formula is given; it passes both the forward `result` and the input `self`.
  1754. self: _sparse_log_softmax_backward_data(grad, result, dim, self)
  1755. - name: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor
  # `mask` is declared non_differentiable; mask_type is not passed to the `self` formula.
  1756. self: _masked_softmax_backward(grad, result, mask, dim)
  1757. mask: non_differentiable
  1758. - name: _prelu_kernel(Tensor self, Tensor weight) -> Tensor
  # `self` and `weight` share one formula guarded by grad.defined(). `result` is self_t where
  # self_p >= 0 and weight_p * self_t + weight_t * self_p elsewhere.
  1759. self, weight: "grad.defined() ? _prelu_kernel_backward(grad, self, weight) : std::tuple<Tensor, Tensor>()"
  1760. result: at::where(self_p >= 0, self_t, weight_p * self_t + weight_t * self_p)
  1761. - name: _prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
  # The three input formulas branch on which of grads[0] / grads[1] are defined and select
  # with at::where on self >= 0. The grad_output value is one nested ternary spread over
  # four lines of a single quoted string. result0 / result1 use the same self_p >= 0 mask.
  1762. grad_output: "grads[0].defined() ?
  1763. (grads[1].defined() ? at::where(self >= 0, grads[0], grads[0] * weight + grads[1] * self)
  1764. : at::where(self >= 0, grads[0], grads[0] * weight))
  1765. : at::where(self >= 0, at::zeros({}, grad_output.options()), grads[1] * self)"
  1766. self: "grads[1].defined() ? at::where(self >= 0, at::zeros({}, self.options()), grad_output * grads[1]) : zeros_like(self)"
  1767. weight: "grads[0].defined() ? at::where(self >= 0, at::zeros({}, weight.options()), grad_output * grads[0]) : zeros_like(self)"
  1768. result0: at::where(self_p >= 0, grad_output_t, grad_output_t * weight_p + grad_output_p * weight_t)
  1769. result1: at::where(self_p >= 0, at::zeros({}, self_p.options()), grad_output_p * self_t + grad_output_t * self_p)
  1770. - name: rrelu_with_noise(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
  # `self` calls rrelu_with_noise_backward on the input with the final argument false; `generator` is not passed.
  1771. self: rrelu_with_noise_backward(grad, self, noise, lower, upper, training, false)
  1772. result: auto_element_wise
  1773. - name: rrelu_with_noise_(Tensor(a!) self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
  # Passes the forward `result` with the final argument true; no `result` formula is given.
  1774. self: rrelu_with_noise_backward(grad, result, noise, lower, upper, training, true)
  1775. - name: rrelu_with_noise_functional(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> (Tensor, Tensor noise_out)
  # `noise` is non_differentiable; `self` uses the same call as the rrelu_with_noise entry.
  1776. noise: non_differentiable
  1777. self: rrelu_with_noise_backward(grad, self, noise, lower, upper, training, false)
  1778. - name: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  # `self` is computed from the forward `result` and self.scalar_type(). The `result` formula
  # multiplies the forward `result` by (self_t - logsumexp_jvp(self_p, self_t, {dim}, true)).
  1779. self: _softmax_backward_data(grad, result, dim, self.scalar_type())
  1780. result: result * (self_t - logsumexp_jvp(self_p, self_t, {dim}, true))
  1781. - name: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
  # Only a `self` formula is given; it passes both the forward `result` and the input `self`.
  1782. self: _sparse_softmax_backward_data(grad, result, dim, self)
  1783. - name: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
  # Both inputs use sparse_sparse_matmul_backward; the last argument is 0 for `self` and 1 for `other`.
  1784. self: sparse_sparse_matmul_backward(grad, self, other, 0)
  1785. other: sparse_sparse_matmul_backward(grad, self, other, 1)
  1786. - name: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor
  # `self` forwards grad, self, beta and threshold to softplus_backward.
  1787. self: softplus_backward(grad, self, beta, threshold)
  1788. result: auto_element_wise
  1789. - name: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
  # `self` forwards grad, self and lambd to softshrink_backward.
  1790. self: softshrink_backward(grad, self, lambd)
  1791. result: auto_element_wise
  1792. - name: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
  # `self` calls threshold_backward on the input; `value` is not passed to the formula.
  1793. self: threshold_backward(grad, self, threshold)
  1794. result: auto_element_wise
  1795. - name: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)
  # `self` calls threshold_backward with `self`. `result` calls it with original_self_p,
  # conjugating self_t going in and the output coming out, then copies into self_t.
  1796. self: threshold_backward(grad, self, threshold)
  1797. result: self_t.copy_(threshold_backward(self_t.conj(), original_self_p, threshold).conj())
  1798. - name: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
  # `self` delegates to reflection_pad1d_backward_symint with grad, self and padding.
  1799. self: reflection_pad1d_backward_symint(grad, self, padding)
  1800. result: auto_linear
  1801. - name: reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor
  # `self` delegates to reflection_pad2d_backward_symint with grad, self and padding.
  1802. self: reflection_pad2d_backward_symint(grad, self, padding)
  1803. result: auto_linear
  1804. - name: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
  # `self` delegates to reflection_pad3d_backward_symint with grad, self and padding.
  1805. self: reflection_pad3d_backward_symint(grad, self, padding)
  1806. result: auto_linear
  1807. - name: replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
  # `self` delegates to replication_pad1d_backward_symint with grad, self and padding.
  1808. self: replication_pad1d_backward_symint(grad, self, padding)
  1809. result: auto_linear
  1810. - name: replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor
  # `self` delegates to replication_pad2d_backward_symint with grad, self and padding.
  1811. self: replication_pad2d_backward_symint(grad, self, padding)
  1812. result: auto_linear
  1813. - name: replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor
  # `self` delegates to replication_pad3d_backward_symint with grad, self and padding.
  1814. self: replication_pad3d_backward_symint(grad, self, padding)
  1815. result: auto_linear
  1816. - name: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and scales.
  1817. self: upsample_linear1d_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales)
  1818. result: auto_linear
  1819. - name: upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and both scales.
  1820. self: upsample_bilinear2d_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales_h, scales_w)
  1821. result: auto_linear
  1822. - name: _upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and both scales.
  1823. self: _upsample_bilinear2d_aa_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales_h, scales_w)
  1824. result: auto_linear
  1825. - name: upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and both scales.
  1826. self: upsample_bicubic2d_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales_h, scales_w)
  1827. result: auto_linear
  1828. - name: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and both scales.
  1829. self: _upsample_bicubic2d_aa_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales_h, scales_w)
  1830. result: auto_linear
  1831. - name: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size and self.sym_sizes() (not `self` itself), plus align_corners and all three scales.
  1832. self: upsample_trilinear3d_backward_symint(grad, output_size, self.sym_sizes(), align_corners, scales_d, scales_h, scales_w)
  1833. result: auto_linear
  1834. - name: upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and scales.
  1835. self: upsample_nearest1d_backward_symint(grad, output_size, self.sym_sizes(), scales)
  1836. result: auto_linear
  1837. - name: _upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and scales.
  1838. self: _upsample_nearest_exact1d_backward_symint(grad, output_size, self.sym_sizes(), scales)
  1839. result: auto_linear
  1840. - name: upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and both scales.
  1841. self: upsample_nearest2d_backward_symint(grad, output_size, self.sym_sizes(), scales_h, scales_w)
  1842. result: auto_linear
  1843. - name: _upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and both scales.
  1844. self: _upsample_nearest_exact2d_backward_symint(grad, output_size, self.sym_sizes(), scales_h, scales_w)
  1845. result: auto_linear
  1846. - name: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and all three scales.
  1847. self: upsample_nearest3d_backward_symint(grad, output_size, self.sym_sizes(), scales_d, scales_h, scales_w)
  1848. result: auto_linear
  1849. - name: _upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  # `self` passes output_size, self.sym_sizes() (not `self` itself) and all three scales.
  1850. self: _upsample_nearest_exact3d_backward_symint(grad, output_size, self.sym_sizes(), scales_d, scales_h, scales_w)
  1851. result: auto_linear
  1852. - name: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
  # `self` applies pixel_unshuffle to grad with the same factor.
  1853. self: pixel_unshuffle(grad, upscale_factor)
  1854. result: auto_linear
  1855. - name: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
  # `self` applies pixel_shuffle to grad with the same factor.
  1856. self: pixel_shuffle(grad, downscale_factor)
  1857. result: auto_linear
  1858. - name: channel_shuffle(Tensor self, SymInt groups) -> Tensor
  # `self` applies channel_shuffle_symint to grad, passing grad.sym_size(1) / groups as the groups argument.
  1859. self: channel_shuffle_symint(grad, grad.sym_size(1) / groups)
  1860. result: auto_linear
  1861. - name: _adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
  # `self` calls _adaptive_avg_pool2d_backward with grad and self only; output_size is not passed.
  1862. self: _adaptive_avg_pool2d_backward(grad, self)
  1863. result: auto_linear
  1864. - name: _adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
  # `self` calls _adaptive_avg_pool3d_backward with grad and self only; output_size is not passed.
  1865. self: _adaptive_avg_pool3d_backward(grad, self)
  1866. result: auto_linear
  1867. - name: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
  # result1 (second output, False in output_differentiability) is passed to the backward and
  # serves as the gather index: result0 gathers self_t with the last 2 dims flattened.
  1868. self: adaptive_max_pool2d_backward(grad, self, result1)
  1869. result0: gather(self_t.flatten(-2), -1, result1.flatten(-2)).view_as(result1)
  1870. output_differentiability: [True, False]
  1871. - name: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
  # result1 (second output, False in output_differentiability) is passed to the backward and
  # serves as the gather index: result0 gathers self_t with the last 3 dims flattened.
  1872. self: adaptive_max_pool3d_backward(grad, self, result1)
  1873. result0: gather(self_t.flatten(-3), -1, result1.flatten(-3)).view_as(result1)
  1874. output_differentiability: [True, False]
  1875. - name: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
  # `self` forwards grad followed by every forward argument to avg_pool2d_backward.
  1876. self: avg_pool2d_backward(grad, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1877. result: auto_linear
  1878. - name: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
  # `self` forwards grad followed by every forward argument to avg_pool3d_backward.
  1879. self: avg_pool3d_backward(grad, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1880. result: auto_linear
  1881. - name: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor)
  # random_samples is not passed to either formula; result1 goes to the backward and serves
  # as the gather index for result0 (last 2 dims flattened).
  1882. self: fractional_max_pool2d_backward(grad, self, kernel_size, output_size, result1)
  1883. result0: gather(self_t.flatten(-2), -1, result1.flatten(-2)).view_as(result1)
  1884. output_differentiability: [True, False]
  1885. - name: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor)
  # random_samples is not passed to either formula; result1 goes to the backward and serves
  # as the gather index for result0 (last 3 dims flattened).
  1886. self: fractional_max_pool3d_backward(grad, self, kernel_size, output_size, result1)
  1887. result0: gather(self_t.flatten(-3), -1, result1.flatten(-3)).view_as(result1)
  1888. output_differentiability: [True, False]
  1889. - name: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  # All three inputs share one formula guarded by grad.defined(); note that linear_backward
  # takes `input` before `grad`. An undefined grad yields a default-constructed tuple.
  1890. input, weight, bias: "grad.defined() ? linear_backward(input, grad, weight, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1891. - name: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  # self, grad_output and weight share linear_double_backward, which receives the whole
  # `grads` collection; output_mask is not passed and no `result` formula is given.
  1892. self, grad_output, weight: linear_double_backward(grads, self, grad_output, weight)
  1893. #mps
  1894. - name: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  # Only a `self` formula is given; it forwards grad and every forward argument to max_pool2d_backward.
  1895. self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode)
  1896. - name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
  # All three inputs share one formula guarded by grad.defined(). In the call, `self` precedes
  # `grad` and padding precedes stride; bias is not passed.
  1897. self, weight, bias: "grad.defined() ? mps_convolution_backward_symint(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1898. - name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  # Calls _convolution_double_backward_symint with a literal false and an all-zero vector of
  # length padding.size() in the slots where the convolution_backward entry passes
  # transposed and output_padding.
  1899. grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask)
  1900. - name: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
  # result1 (second output, False in output_differentiability) is the last argument of the
  # backward call and the gather index for result0 (last 2 dims flattened).
  1901. self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1)
  1902. result0: gather(self_t.flatten(-2), -1, result1.flatten(-2)).view_as(result1)
  1903. output_differentiability: [True, False]
  1904. - name: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
  # result1 (second output, False in output_differentiability) is the last argument of the
  # backward call and the gather index for result0 (last 3 dims flattened).
  1905. self: max_pool3d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1)
  1906. result0: gather(self_t.flatten(-3), -1, result1.flatten(-3)).view_as(result1)
  1907. output_differentiability: [True, False]
  1908. - name: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
  # `self` reuses max_pool_double_backward with grad, indices and the literal 2; output_size is not passed.
  1909. self: max_pool_double_backward(grad, indices, 2)
  1910. indices: non_differentiable
  1911. result: auto_linear
  1912. - name: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
  # `self` reuses max_pool_double_backward with grad, indices and the literal 3; output_size,
  # stride and padding are not passed.
  1913. self: max_pool_double_backward(grad, indices, 3)
  1914. indices: non_differentiable
  1915. result: auto_linear
  1916. - name: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
  # All three inputs share one formula guarded by grad.defined(); bias is passed only as
  # bias->sym_sizes(). `result` passes the _p and _t forms of input, weight and bias to convolution_jvp.
  1917. input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1918. result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups)
  1919. # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution.
  1920. # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context
  1921. # by convolution_backward instead of being passed along from the forward pass.
  1922. - name: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
  # The backward formula is identical to the one in the `convolution` entry; the four extra
  # flags appear only in the _convolution_jvp call.
  1923. input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1924. result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32)
  1925. - name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
  # result0 and result1 each sum two convolution_backward_symint calls and extract a single
  # output (std::get<0> with mask {true, false, false}; std::get<1> with {false, true, false}).
  # In each sum, one call substitutes weight_t (result0) or input_t (result1) and the other
  # substitutes grad_output_t. result2 is delegated to convolution_backward_jvp_grad_bias.
  1926. grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
  1927. result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false}))
  1928. result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false}))
  1929. result2: convolution_backward_jvp_grad_bias(grad_output_t, result2)
  1930. - name: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
  # All three inputs share one formula guarded by grad.defined(); unlike the `convolution`
  # entry, no bias sizes are passed and no `result` formula is given.
  1931. input, weight, bias: "grad.defined() ? convolution_backward_overrideable_symint(grad, input, weight, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1932. - name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  # Same _convolution_double_backward_symint call as the convolution_backward entry; no
  # `result` formulas are given here.
  1933. grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
  1934. - name: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor
  # Calls convolution_backward_symint with literal true and 1 in the transposed and groups
  # positions of the convolution_backward schema; kernel_size is not passed.
  1935. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1936. - name: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor
  # Calls convolution_backward_symint with literal true and 1 in the transposed and groups
  # positions of the convolution_backward schema; kernel_size is not passed.
  1937. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1938. - name: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor
  # All three inputs share one formula guarded by grad.defined(); it calls
  # _slow_conv2d_backward_symint with kernel_size and does not pass bias.
  1939. self, weight, bias: "grad.defined() ? _slow_conv2d_backward_symint(grad, self, weight, kernel_size, stride, padding, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1940. - name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
  # Calls _convolution_double_backward_symint with fixed {{1, 1}}, false, {{0, 0}} and 1 in the
  # slots where the convolution_backward entry passes dilation, transposed, output_padding and groups.
  1941. grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask)
  1942. - name: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor
  # Passes grad.contiguous() rather than grad; transposed, output_padding and groups are
  # fixed to the values shown next to the inline /*...*/ labels. kernel_size is not passed.
  1943. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1944. - name: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor
  # Passes grad.contiguous() rather than grad; transposed, output_padding and groups are
  # fixed to the values shown next to the inline /*...*/ labels. kernel_size is not passed.
  1945. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1946. - name: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor
  # dilation and output_padding are fixed to {{1, 1, 1}} and {{0, 0, 0}}; the unlabeled false
  # and 1 sit in the transposed and groups positions of the convolution_backward schema.
  1947. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1948. - name: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor
  # Calls convolution_backward_symint with false, an all-zero vector of length padding.size(),
  # and 1 in the transposed, output_padding and groups positions; kernel_size is not passed.
  1949. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1950. - name: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor
  # Calls convolution_backward_symint with false, an all-zero vector of length padding.size(),
  # and 1 in the transposed, output_padding and groups positions; kernel_size is not passed.
  1951. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  1952. - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  # `self` applies im2col to grad with the same kernel_size, dilation, padding and stride;
  # output_size is not passed.
  1953. self: im2col(grad, kernel_size, dilation, padding, stride)
  1954. result: auto_linear
  1955. - name: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
  # `self` applies col2im_symint to grad, passing the last two sizes of `self` as the second argument.
  1956. self: col2im_symint(grad, {self.sym_size(-2), self.sym_size(-1)}, kernel_size, dilation, padding, stride)
  1957. result: auto_linear
  1958. - name: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
  # grad_output applies _adaptive_avg_pool2d_symint to grad using the last two sizes of
  # grad_output; `self` gets zeros_like(self); `result` re-applies this op to grad_output_t.
  1959. grad_output: _adaptive_avg_pool2d_symint(grad, {grad_output.sym_size(-2), grad_output.sym_size(-1)})
  1960. self: zeros_like(self)
  1961. result: _adaptive_avg_pool2d_backward(grad_output_t, self_p)
  1962. - name: _adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor
  # grad_output applies _adaptive_avg_pool3d_symint to grad using the last three sizes of
  # grad_output; `self` gets zeros_like(self); `result` re-applies this op to grad_output_t.
  1963. grad_output: _adaptive_avg_pool3d_symint(grad, { grad_output.sym_size(-3), grad_output.sym_size(-2), grad_output.sym_size(-1) })
  1964. self: zeros_like(self)
  1965. result: _adaptive_avg_pool3d_backward(grad_output_t, self_p)
  1966. - name: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 2); `self` gets zeros_like(self).
  1967. grad_output: max_pool_double_backward(grad, indices, 2)
  1968. self: zeros_like(self)
  1969. result: auto_linear
  1970. - name: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 3); `self` gets zeros_like(self).
  1971. grad_output: max_pool_double_backward(grad, indices, 3)
  1972. self: zeros_like(self)
  1973. result: auto_linear
  1974. - name: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
  # grad_output applies avg_pool2d to grad with the same pooling arguments; `self` gets
  # zeros_like(self); `result` applies avg_pool2d_backward to grad_output_t with self_p.
  1975. grad_output: avg_pool2d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1976. self: zeros_like(self)
  1977. result: avg_pool2d_backward(grad_output_t, self_p, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1978. - name: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
  # grad_output applies avg_pool3d to grad with the same pooling arguments; `self` gets
  # zeros_like(self); `result` applies avg_pool3d_backward to grad_output_t with self_p.
  1979. grad_output: avg_pool3d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1980. self: zeros_like(self)
  1981. result: avg_pool3d_backward(grad_output_t, self_p, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
  1982. - name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
  # `result` is a sum of two terms: elu_backward applied to grad_output_t, plus
  # elu_double_backward applied to self_or_result_t. is_result is forwarded unchanged everywhere.
  1983. grad_output: elu_backward(grad, alpha, scale, input_scale, is_result, self_or_result)
  1984. self_or_result: elu_double_backward(grad, grad_output, alpha, scale, input_scale, is_result, self_or_result)
  1985. result: elu_backward(grad_output_t, alpha, scale, input_scale, is_result, self_or_result_p) + elu_double_backward(self_or_result_t, grad_output_p, alpha, scale, input_scale, is_result, self_or_result_p)
  1986. - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 2); `self` gets zeros_like(self).
  1987. grad_output: max_pool_double_backward(grad, indices, 2)
  1988. self: zeros_like(self)
  1989. result: auto_linear
  1990. - name: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 3); `self` gets zeros_like(self).
  1991. grad_output: max_pool_double_backward(grad, indices, 3)
  1992. self: zeros_like(self)
  1993. result: auto_linear
  1994. - name: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor
  # grad_output and `self` use separate double-backward helpers; `result` calls
  # glu_backward_jvp with the forward result plus the _p and _t forms of grad_output and self.
  1995. grad_output: glu_double_backward_grad_output(grad, self, dim)
  1996. self: glu_double_backward(grad, grad_output, self, dim)
  1997. result: glu_backward_jvp(result, grad_output_p, self_p, grad_output_t, self_t, dim)
  1998. - name: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
  # `self` gets zeros_like(grad). `result` keeps grad_output_t where min_val < self_p < max_val
  # (strict on both sides) and uses an expanded scalar zero elsewhere.
  1999. grad_output: hardtanh_backward(grad, self, min_val, max_val)
  2000. self: zeros_like(grad)
  2001. result: at::where((self_p > min_val).logical_and(self_p < max_val), grad_output_t, at::zeros({}, result.options()).expand_as(result))
  2002. - name: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
  # The `self` formula multiplies grad by grad_output before calling log_sigmoid_double_backward;
  # `result` adds the analogous term (self_t * grad_output_p) to log_sigmoid_backward on grad_output_t.
  2003. grad_output: log_sigmoid_backward(grad, self, buffer)
  2004. self: log_sigmoid_double_backward(grad * grad_output, self)
  2005. result: log_sigmoid_backward(grad_output_t, self_p, buffer) + log_sigmoid_double_backward(self_t * grad_output_p, self_p)
  2006. - name: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
  # Both formulas cast grad to output.dtype() and reduce with sum(dim, true); input_dtype is
  # not used by either formula, and no `result` formula is given.
  2007. grad_output: grad.to(output.dtype()) - (grad.to(output.dtype()) * output.exp()).sum(dim, true)
  2008. output: (-grad_output.sum(dim, true) * output.exp() * grad.to(output.dtype())).to(output.dtype())
  2009. - name: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
  2010. # self_is_result is always false here since double backward call is an out-of-place call, self is input itself
  # (This remark covers the grad_output formula, which hard-codes false. The `result`
  # formula at the end of this entry forwards self_is_result unchanged.)
  2011. grad_output: leaky_relu_backward(grad, self, negative_slope, false)
  2012. self: zeros_like(grad)
  2013. # leaky_relu_backward(grad_output, self, negative_slope, false)
  2014. # computes grad_output * at::where(self_p > 0, 1, negative_slope)
  2015. # so the jvp formula is the following:
  2016. # grad_output_t * at::where(self_p > 0, self_p.new_ones([]), negative_slope);
  2017. #
  2018. # leaky_relu_backward(grad_output, result, negative_slope, true)
  2019. # computes grad_output * at::where(result > 0, 1, negative_slope)
  2020. # under the assumption that `negative_slope` is positive (otherwise,
  2021. # it is not possible to compute the gradient).
  2022. #
  2023. # so the jvp formula is the following:
  2024. # grad_output_t * at::where(result_p > 0, result_p.new_ones([]), negative_slope);
  2025. # with the assumption that negative_slope is positive.
  2026. #
  2027. # Combined together that results in the following optimized kernel which
  2028. # also checks the assumption that negative_slope is positive when self_is_result
  2029. # is True:
  2030. result: leaky_relu_backward(grad_output_t, self_p, negative_slope, self_is_result)
  2031. # This derivative is mps-only, and `error_for_max_pool2d_double_backward` just raises an error.
  2032. - name: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  # grad_output calls error_for_max_pool2d_double_backward() with no arguments; `self` gets zeros_like(self).
  2033. grad_output: error_for_max_pool2d_double_backward()
  2034. self: zeros_like(self)
  2035. result: auto_linear
  2036. - name: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 2); `self` gets zeros_like(self)
  # and `indices` is non_differentiable.
  2037. grad_output: max_pool_double_backward(grad, indices, 2)
  2038. self: zeros_like(self)
  2039. indices: non_differentiable
  2040. result: auto_linear
  2041. - name: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
  # grad_output uses max_pool_double_backward(grad, indices, 3); `self` gets zeros_like(self)
  # and `indices` is non_differentiable.
  2042. grad_output: max_pool_double_backward(grad, indices, 3)
  2043. self: zeros_like(self)
  2044. indices: non_differentiable
  2045. result: auto_linear
  2046. - name: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  # `self` and `target` both use mse_loss_double_backward on grad * grad_output, with opposite
  # signs. The `result` value is one quoted string spanning four lines: the self_t term minus
  # the target_t term plus mse_loss_backward applied to grad_output_t.
  2047. grad_output: mse_loss_backward(grad, self, target, reduction)
  2048. self: mse_loss_double_backward(grad * grad_output, self, reduction)
  2049. target: -mse_loss_double_backward(grad * grad_output, target, reduction)
  2050. result: " mse_loss_double_backward(self_t * grad_output_p, self_p, reduction)
  2051. - mse_loss_double_backward(target_t * grad_output_p, target_p, reduction)
  2052. + mse_loss_backward(grad_output_t, self_p, target_p, reduction)
  2053. "
  2054. - name: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
  # grad_output applies nll_loss_symint to grad; total_weight is not passed. `self` gets
  # zeros_like(grad) and `target` is non_differentiable.
  2055. grad_output: nll_loss_symint(grad, target, weight, reduction, ignore_index)
  2056. self: zeros_like(grad)
  2057. target: non_differentiable
  2058. - name: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
  2059. grad_output: nll_loss2d_symint(grad, target, weight, reduction, ignore_index)
  2060. self: zeros_like(grad)
  2061. target: non_differentiable
  2062. - name: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
  2063. # self_is_result is always false here: the double-backward call is out-of-place, so self is the input itself.
  # Both grad_output and result therefore pass the literal `false` as the last argument; self receives zeros_like(grad).
  2064. grad_output: rrelu_with_noise_backward(grad, self, noise, lower, upper, training, false)
  2065. self: zeros_like(grad)
  2066. result: rrelu_with_noise_backward(grad_output_t, self_p, noise, lower, upper, training, false)
  # reflection_pad{1,2,3}d_backward: grad_output applies the matching
  # reflection_pad*_symint op to `grad` with the same padding, `self` receives
  # zeros_like(self), and `result` re-applies the same *_backward_symint op to
  # grad_output_t with self_p.
  2067. - name: reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
  2068. grad_output: reflection_pad1d_symint(grad, padding)
  2069. self: zeros_like(self)
  2070. result: reflection_pad1d_backward_symint(grad_output_t, self_p, padding)
  2071. - name: reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
  2072. grad_output: reflection_pad2d_symint(grad, padding)
  2073. self: zeros_like(self)
  2074. result: reflection_pad2d_backward_symint(grad_output_t, self_p, padding)
  2075. - name: reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
  2076. grad_output: reflection_pad3d_symint(grad, padding)
  2077. self: zeros_like(self)
  2078. result: reflection_pad3d_backward_symint(grad_output_t, self_p, padding)
  # replication_pad{1,2,3}d_backward: grad_output applies the matching
  # replication_pad*_symint op to `grad` with the same padding, `self` receives
  # zeros_like(self), and `result` re-applies the same *_backward_symint op to
  # grad_output_t with self_p.
  2079. - name: replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
  2080. grad_output: replication_pad1d_symint(grad, padding)
  2081. self: zeros_like(self)
  2082. result: replication_pad1d_backward_symint(grad_output_t, self_p, padding)
  2083. - name: replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
  2084. grad_output: replication_pad2d_symint(grad, padding)
  2085. self: zeros_like(self)
  2086. result: replication_pad2d_backward_symint(grad_output_t, self_p, padding)
  2087. - name: replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
  2088. grad_output: replication_pad3d_symint(grad, padding)
  2089. self: zeros_like(self)
  2090. result: replication_pad3d_backward_symint(grad_output_t, self_p, padding)
  # sparse_sampled_addmm: self, mat1 and mat2 share one backward call.
  # The mask indices are deliberately crossed: mat1 is wrapped with
  # grad_input_mask[2] and mat2 with grad_input_mask[1] (presumably because each
  # matrix is only needed to compute the other's gradient -- confirm in
  # sparse_sampled_addmm_backward). The call passes `alpha, beta`, the reverse of
  # the `beta, alpha` order in the signature.
  2091. - name: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
  2092. self, mat1, mat2: "sparse_sampled_addmm_backward(grad,
  2093. self,
  2094. wrap_opt_if(mat1, grad_input_mask[2]),
  2095. wrap_opt_if(mat2, grad_input_mask[1]),
  2096. alpha, beta, grad_input_mask)"
  # _sparse_mm_reduce_impl: only the first of the two outputs is flagged
  # differentiable. The shared self/other formula calls
  # _sparse_mm_reduce_impl_backward (forwarding result1) when grad is defined and
  # yields a default-constructed tuple otherwise.
  2097. - name: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
  2098. output_differentiability: [True, False]
  2099. self, other: "grad.defined() ? _sparse_mm_reduce_impl_backward(self, grad, other, reduce, result1, grad_input_mask) : std::tuple<Tensor, Tensor>()"
  # smooth_l1_loss_backward: grad_output reuses smooth_l1_loss_backward on `grad`.
  # self and target use the same smooth_l1_loss_double_backward call on
  # grad * grad_output (arguments stay in self, target order); the target formula
  # is negated. The quoted multi-line `result` sums the matching three terms using
  # the _p / _t suffixed inputs.
  2100. - name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
  2101. grad_output: smooth_l1_loss_backward(grad, self, target, reduction, beta)
  2102. self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
  2103. target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
  2104. result: " smooth_l1_loss_double_backward(self_t * grad_output_p, self_p, target_p, reduction, beta)
  2105. - smooth_l1_loss_double_backward(target_t * grad_output_p, self_p, target_p, reduction, beta)
  2106. + smooth_l1_loss_backward(grad_output_t, self_p, target_p, reduction, beta)
  2107. "
  # huber_loss_backward: grad_output uses the dedicated helper
  # huber_loss_double_backward_grad_output; self and target share
  # huber_loss_double_backward on grad * grad_output with opposite signs.
  # No `result` formula is listed for this entry.
  2108. - name: huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor
  2109. grad_output: huber_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, delta)
  2110. self: huber_loss_double_backward(grad * grad_output, self, target, reduction, delta)
  2111. target: -huber_loss_double_backward(grad * grad_output, self, target, reduction, delta)
  # softplus_backward: grad_output reuses softplus_backward on `grad`; self uses
  # softplus_double_backward on grad * grad_output. The quoted `result` is the sum
  # of the corresponding two terms written with the _p / _t suffixed inputs.
  2112. - name: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor
  2113. grad_output: softplus_backward(grad, self, beta, threshold)
  2114. self: softplus_double_backward(grad * grad_output, self, beta, threshold)
  2115. result: "softplus_backward(grad_output_t, self_p, beta, threshold)
  2116. + softplus_double_backward(self_t * grad_output_p, self_p, beta, threshold)"
  # _softmax_backward_data: both formulas convert `grad` to output.dtype() before
  # use; the `output` formula additionally converts the softmax_double_backward
  # result to output.dtype().
  2117. - name: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
  2118. grad_output: _softmax_backward_data(grad.to(output.dtype()), output, dim, input_dtype)
  2119. output: softmax_double_backward(grad.to(output.dtype()), grad_output, dim, output).to(output.dtype())
  # soft_margin_loss_backward: grad_output uses the dedicated helper
  # soft_margin_loss_double_backward_grad_output; self uses
  # soft_margin_loss_double_backward on grad * grad_output. No formula is listed
  # for target, and there is no `result` formula.
  2120. - name: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
  2121. grad_output: soft_margin_loss_double_backward_grad_output(grad, grad_output, self, target, reduction)
  2122. self: soft_margin_loss_double_backward(grad * grad_output, self, target, reduction)
  # softshrink_backward: grad_output reuses softshrink_backward on `grad`; self
  # receives zeros_like(grad). `result` selects grad_output_t where
  # self_p > lambd or self_p < -lambd, and a zero scalar (built with result's
  # options and expanded to result's shape) everywhere else.
  2123. - name: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
  2124. grad_output: softshrink_backward(grad, self, lambd)
  2125. self: zeros_like(grad)
  2126. result: at::where((self_p > lambd).logical_or(self_p < -lambd), grad_output_t, at::zeros({}, result.options()).expand_as(result))
  # threshold_backward: grad_output reuses threshold_backward on `grad`; self
  # receives zeros_like(grad). `result` adds zeros_like(self_t) to the
  # threshold_backward term.
  # NOTE(review): the zeros_like(self_t) addend contributes no value; it looks like
  # it exists only so the formula references self_t -- confirm before removing.
  2127. - name: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
  2128. grad_output: threshold_backward(grad, self, threshold)
  2129. self: zeros_like(grad)
  2130. result: zeros_like(self_t) + threshold_backward(grad_output_t, self_p, threshold)
  # Interpolating upsample backward ops (linear1d, bilinear2d, bicubic2d,
  # trilinear3d and the `_aa` variants): each entry lists only grad_output, which
  # calls the correspondingly named upsample *_symint op on `grad` with the same
  # output_size, align_corners and scales. input_size is not forwarded.
  # `result` is auto_linear for every entry.
  2131. - name: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
  2132. grad_output: upsample_linear1d_symint(grad, output_size, align_corners, scales)
  2133. result: auto_linear
  2134. - name: upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  2135. grad_output: upsample_bilinear2d_symint(grad, output_size, align_corners, scales_h, scales_w)
  2136. result: auto_linear
  2137. - name: _upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  2138. grad_output: _upsample_bilinear2d_aa_symint(grad, output_size, align_corners, scales_h, scales_w)
  2139. result: auto_linear
  2140. - name: upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  2141. grad_output: upsample_bicubic2d_symint(grad, output_size, align_corners, scales_h, scales_w)
  2142. result: auto_linear
  2143. - name: _upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  2144. grad_output: _upsample_bicubic2d_aa_symint(grad, output_size, align_corners, scales_h, scales_w)
  2145. result: auto_linear
  2146. - name: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  2147. grad_output: upsample_trilinear3d_symint(grad, output_size, align_corners, scales_d, scales_h, scales_w)
  2148. result: auto_linear
  # Nearest-neighbour upsample backward ops (1d/2d/3d plus the `_exact`
  # variants): each entry lists only grad_output, which calls the correspondingly
  # named upsample *_symint op on `grad` with the same output_size and scales.
  # input_size is not forwarded. `result` is auto_linear for every entry.
  2149. - name: upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
  2150. grad_output: upsample_nearest1d_symint(grad, output_size, scales)
  2151. result: auto_linear
  2152. - name: _upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
  2153. grad_output: _upsample_nearest_exact1d_symint(grad, output_size, scales)
  2154. result: auto_linear
  2155. - name: upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
  2156. grad_output: upsample_nearest2d_symint(grad, output_size, scales_h, scales_w)
  2157. result: auto_linear
  2158. - name: _upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
  2159. grad_output: _upsample_nearest_exact2d_symint(grad, output_size, scales_h, scales_w)
  2160. result: auto_linear
  2161. - name: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  2162. grad_output: upsample_nearest3d_symint(grad, output_size, scales_d, scales_h, scales_w)
  2163. result: auto_linear
  2164. - name: _upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
  2165. grad_output: _upsample_nearest_exact3d_symint(grad, output_size, scales_d, scales_h, scales_w)
  2166. result: auto_linear
  # sigmoid_backward: grad_output reuses sigmoid_backward with output.conj().
  # The `output` formula is grad.conj() * grad_output * (1 - 2 * output.conj()),
  # written as (-2 * output.conj() + 1). `result` sums the matching two terms using
  # the _p / _t suffixed inputs.
  2167. - name: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
  2168. grad_output: sigmoid_backward(grad, output.conj())
  2169. output: grad.conj() * grad_output * (-2 * output.conj() + 1)
  2170. result: sigmoid_backward(grad_output_t, output_p) + output_t.conj() * grad_output_p * (-2 * output_p.conj() + 1)
  # tanh_backward: grad_output reuses tanh_backward with output.conj().
  # The `output` formula is grad.conj() * (-2 * output.conj() * grad_output).
  # `result` sums the matching two terms using the _p / _t suffixed inputs.
  2171. - name: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
  2172. grad_output: tanh_backward(grad, output.conj())
  2173. output: grad.conj() * (-2 * output.conj() * grad_output)
  2174. result: tanh_backward(grad_output_t, output_p) + output_t.conj() * (-2 * output_p.conj() * grad_output_p)
  2175. # cudnn
  # _cudnn_ctc_loss: both overloads (int[] lengths and Tensor lengths) use the same
  # formula. Only log_probs has a gradient, computed by _cudnn_ctc_loss_backward
  # from grad, both forward outputs (result0, result1) and zero_infinity.
  2176. - name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
  2177. log_probs: _cudnn_ctc_loss_backward(grad, result0, result1, zero_infinity)
  2178. - name: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
  2179. log_probs: _cudnn_ctc_loss_backward(grad, result0, result1, zero_infinity)
  # cudnn_convolution_transpose: self and weight share _cudnn_convolution_backward.
  # The literal `true` after dilation is presumably the "transposed" flag (the
  # cudnn_convolution entry passes `false` in the same slot) -- confirm against the
  # helper. The final argument is a two-element mask built from grad_input_mask[0..1].
  2180. - name: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  2181. self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, output_padding, stride, dilation, true, groups, {grad_input_mask[0], grad_input_mask[1]})"
  # _mps_convolution_transpose: self and weight share
  # mps_convolution_transpose_backward_symint, guarded by grad.defined(); an
  # undefined grad yields a default-constructed tuple instead.
  2182. - name: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
  2183. self, weight: "grad.defined() ? mps_convolution_transpose_backward_symint(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple<Tensor, Tensor>()"
  # cudnn_convolution: self and weight share _cudnn_convolution_backward. A
  # zero-filled vector of length padding.size() is passed in the slot where the
  # transpose variant passes output_padding, and the literal `false` follows
  # dilation (presumably the "transposed" flag -- confirm against the helper).
  2184. - name: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  2185. self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector<c10::SymInt>(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})"
  # cudnn_grid_sampler: self and grid share cudnn_grid_sampler_backward, guarded by
  # grad.defined(); an undefined grad yields a default-constructed tuple.
  2186. - name: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output
  2187. self, grid: "grad.defined() ? cudnn_grid_sampler_backward(self, grid, grad) : std::tuple<Tensor, Tensor>()"
  # cudnn_affine_grid_generator: theta is the only input with a formula; its
  # gradient comes from cudnn_affine_grid_generator_backward with the same N, C, H, W.
  2188. - name: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid
  2189. theta: cudnn_affine_grid_generator_backward(grad, N, C, H, W)
  2190. # NB: Why is the backwards here so complicated? CuDNN cannot be used to compute
  2191. # backward in evaluation mode, because the math for backward in evaluation mode
  2192. # is different (since the forward math is different), and CuDNN does not support
  2193. # it. And in any case, you shouldn't be using this bn in evaluation mode,
  2194. # because it should be merged into the previous convolution (left for future
  2195. # work.)
  2196. # NB2: The quotes around the gradient are needed to appease YAML parsing rules.
  # Formula summary: with a defined grad and training == true, cudnn_batch_norm_backward
  # is called with grad made contiguous in input's suggested memory format, and result3
  # is cloned first when retain_variables is set. With training == false it calls
  # native_batch_norm_backward instead. An undefined grad yields a default-constructed
  # tuple. result0 is produced by batch_norm_jvp.
  2197. - name: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)
  2198. input, weight, bias: "grad.defined() ? (training ? cudnn_batch_norm_backward(input, grad.contiguous(input.suggest_memory_format()), weight, running_mean, running_var, result1, result2, epsilon, retain_variables ? result3.clone() : result3) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
  2199. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon)
  2200. # HACK: save_mean and save_var are going to be passed in as
  2201. # requires_grad variables (even though we'll never backprop through
  2202. # them) so we need to prevent the unpacking from triggering an error.
  # reserveSpace is given the same not_implemented treatment below. input, weight
  # and grad_output share batchnorm_double_backward, which receives the literal
  # `true` between running_var and epsilon (presumably the training flag -- confirm).
  2203. - name: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)
  2204. save_mean: not_implemented("cudnn_batch_norm_backward save_mean")
  2205. save_var: not_implemented("cudnn_batch_norm_backward save_var")
  2206. reserveSpace: not_implemented("cudnn_batch_norm_backward reserveSpace")
  2207. input, weight, grad_output: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_output, running_mean, running_var, true, epsilon, save_mean, save_var, grad_input_mask)
  2208. # nnpack
  2209. - name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor
  2210. # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here.
  # The generic convolution_backward_symint is called with a vector of ones and a
  # vector of zeros (both of length padding.size()) around the literal `false`, and
  # a literal 1 before grad_input_mask. Judging by the annotated mkldnn_convolution
  # call, these slots are dilation, transposed, output_padding and groups -- confirm.
  2211. input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<c10::SymInt>(padding.size(), 1), false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  2212. #LSTM MPS
  # _lstm_mps: only the first three of the six outputs are flagged differentiable.
  # The remaining outputs (result3, result4, result5) are forwarded to
  # lstm_mps_backward together with grads[0..2] and the original inputs.
  2213. - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
  2214. output_differentiability: [True, True, True, False, False, False]
  2215. input, hx, params: "lstm_mps_backward(grads[0], grads[1], grads[2], result3, result4, input, result5, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first)"
  # lstm_mps_backward: this entry consists of the signature only; no per-input
  # formulas are listed for it.
  2216. - name: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
  2217. # Only the first three _cudnn_rnn outputs can have gradients.
  2218. # _cudnn_rnn outputs: (output, hy, cy, reserve, weight_buf)
  # Accordingly the backward call receives result4 in the weight_buf position and
  # result3 (cloned first when retain_variables is set) as the reserve argument.
  2219. - name: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  2220. dropout_state: non_differentiable
  2221. output_differentiability: [True, True, True, False, False]
  2222. input, hx, cx, weight: "_cudnn_rnn_backward_symint(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)"
  # _cudnn_rnn_backward: every tensor input listed below maps to not_implemented
  # with kCudnnDoubleBackwardMsg (the Tensor[] `weight` uses not_implemented_list);
  # dropout_state is non_differentiable. weight_buf and reserve have no entry here.
  2223. - name: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  2224. dropout_state: non_differentiable
  2225. input: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2226. weight: not_implemented_list("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2227. hx: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2228. cx: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2229. output: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2230. grad_output: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2231. grad_hy: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2232. grad_cy: not_implemented("_cudnn_rnn_backward", kCudnnDoubleBackwardMsg)
  2233. # miopen
  # miopen_convolution_transpose: self, weight and bias share the generic
  # convolution_backward_symint, guarded by grad.defined(). The literal `true` after
  # dilation is presumably the "transposed" flag (the non-transposed miopen entries
  # pass `false` there) -- confirm. The real output_padding is forwarded.
  2234. - name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
  2235. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  # miopen_convolution / miopen_depthwise_convolution: identical formulas. self,
  # weight and bias share convolution_backward_symint with the literal `false` after
  # dilation and a zero-filled vector of length padding.size() in the following
  # slot; an undefined grad yields a default-constructed tuple.
  2236. - name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
  2237. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  2238. - name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
  2239. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  # miopen_batch_norm: with a defined grad and training == true,
  # miopen_batch_norm_backward is called with grad made contiguous in input's
  # suggested memory format; with training == false it calls
  # native_batch_norm_backward instead. An undefined grad yields a
  # default-constructed tuple. result0 is produced by batch_norm_jvp.
  2240. - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
  2241. input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(input.suggest_memory_format()), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
  2242. result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon)
  # miopen_batch_norm_backward: save_mean and save_var map to not_implemented.
  # input, weight and grad_output share batchnorm_double_backward, which receives
  # the literal `true` between running_var and epsilon (presumably the training
  # flag -- confirm against the helper).
  2243. - name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)
  2244. save_mean: not_implemented("miopen_batch_norm_backward save_mean")
  2245. save_var: not_implemented("miopen_batch_norm_backward save_var")
  2246. input, weight, grad_output: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_output, running_mean, running_var, true, epsilon, save_mean, save_var, grad_input_mask)
  # miopen_rnn: only the first three of the five outputs are flagged
  # differentiable and dropout_state is non_differentiable. The backward call
  # receives result4 in the weight_buf position of miopen_rnn_backward and result3
  # (cloned first when retain_variables is set) in its reserve position.
  2247. - name: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
  2248. dropout_state: non_differentiable
  2249. output_differentiability: [True, True, True, False, False]
  2250. input, hx, cx, weight: "miopen_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)"
  # miopen_rnn_backward: the only annotation is that dropout_state is
  # non_differentiable; no formulas are listed for the other inputs.
  2251. - name: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  2252. dropout_state: non_differentiable
  # miopen_ctc_loss: both overloads (int[] lengths and Tensor lengths) use the same
  # formula. Only log_probs has a gradient, computed by _miopen_ctc_loss_backward
  # from grad, both forward outputs (result0, result1) and zero_infinity.
  2253. - name: miopen_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
  2254. log_probs: _miopen_ctc_loss_backward(grad, result0, result1, zero_infinity)
  2255. - name: miopen_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
  2256. log_probs: _miopen_ctc_loss_backward(grad, result0, result1, zero_infinity)
  # mkldnn_rnn_layer: only the first three of the four outputs are flagged
  # differentiable. The shared formula picks
  # mkldnn_rnn_layer_differentiable_backward when GradMode::is_enabled() and
  # mkldnn_rnn_layer_backward otherwise; both receive the same argument list.
  # The call orders the trailing options as (..., has_biases, train, bidirectional,
  # batch_sizes, batch_first, result3), which differs from the forward signature's
  # parameter order.
  2257. - name: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
  2258. output_differentiability: [True, True, True, False]
  2259. input, weight0, weight1, weight2, weight3, hx_, cx_: "GradMode::is_enabled() ? mkldnn_rnn_layer_differentiable_backward(input, weight0, weight1, weight2, weight3, hx_, cx_, result0, result1, result2, grads[0], grads[1], grads[2], reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, batch_first, result3) : mkldnn_rnn_layer_backward(input, weight0, weight1, weight2, weight3, hx_, cx_, result0, result1, result2, grads[0], grads[1], grads[2], reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, batch_first, result3)"
  # mkldnn_rnn_layer_backward: this entry consists of the signature only; no
  # per-input formulas are listed for it. Its weights are named weight1..weight4,
  # whereas the forward op names them weight0..weight3.
  2260. - name: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
  2261. # mkldnn
  # mkldnn_convolution: self, weight and bias share convolution_backward_symint
  # with transposed = false and an all-zero output_padding of length
  # padding.size() (see the inline /*...*/ annotations); an undefined grad yields a
  # default-constructed tuple.
  2262. - name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
  2263. self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
  # mkldnn_linear: self, weight and bias share a single mkldnn_linear_backward
  # call; bias itself is not passed, only grad_input_mask.
  2264. - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
  2265. self, weight, bias: mkldnn_linear_backward(self, grad, weight, grad_input_mask)
  # mkldnn_max_pool2d / mkldnn_max_pool3d: the self gradient comes from the
  # matching mkldnn_max_pool*_backward, which receives grad, the forward result,
  # self and the unchanged pooling parameters.
  2266. - name: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
  2267. self: mkldnn_max_pool2d_backward(grad, result, self, kernel_size, stride, padding, dilation, ceil_mode)
  2268. - name: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
  2269. self: mkldnn_max_pool3d_backward(grad, result, self, kernel_size, stride, padding, dilation, ceil_mode)
  # mkldnn_adaptive_avg_pool2d: the self gradient comes from
  # mkldnn_adaptive_avg_pool2d_backward(grad, self); output_size is not forwarded.
  2270. - name: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
  2271. self: mkldnn_adaptive_avg_pool2d_backward(grad, self)
  # _mkldnn_reshape: the gradient is grad reshaped to self's symbolic sizes.
  2272. - name: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
  2273. self: grad.reshape_symint(self.sym_sizes())
  2274. # NestedTensor
  # _nested_tensor_from_tensor_list: when grad is defined the per-element
  # gradients are at::unbind(grad); otherwise a vector of list.size()
  # default-constructed Tensors is returned.
  2275. - name: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  2276. list: "grad.defined()? at::unbind(grad) : std::vector<Tensor>(list.size())"
  # _nested_tensor_from_mask: the gradient for t is grad converted with
  # to_padded_tensor_symint using padding value 0 and t's symbolic sizes as the
  # output size; mask is non_differentiable.
  2277. - name: _nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor
  2278. t: grad.to_padded_tensor_symint(0, t.sym_sizes())
  2279. mask: non_differentiable
  # _nested_from_padded: the gradient for padded comes from
  # _nested_from_padded_backward(grad, padded, fuse_transform_0213);
  # cpu_nested_shape_example is non_differentiable.
  2280. - name: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
  2281. padded: _nested_from_padded_backward(grad, padded, fuse_transform_0213)
  2282. cpu_nested_shape_example: non_differentiable
  # to_padded_tensor: the self formula branches on layout. For c10::kJagged it
  # calls at::_nested_from_padded_tensor_symint with self's offsets, jagged dummy
  # and ragged index, wraps min/max seqlen in std::optional only when they are
  # defined, and passes sym_size(0) of self's values as the last argument (the
  # sum_S parameter of _nested_from_padded_tensor). For any other layout it calls
  # at::_nested_from_padded(grad, self._nested_tensor_size()).
  # padding is non_differentiable.
  2283. - name: to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
  2284. self: "self.layout() == c10::kJagged ? at::_nested_from_padded_tensor_symint(grad, at::_nested_get_offsets(self), at::_nested_get_jagged_dummy(self), at::_nested_get_ragged_idx(self), at::_nested_get_min_seqlen(self).defined() ? std::optional<Tensor>(at::_nested_get_min_seqlen(self)) : ::std::nullopt, at::_nested_get_max_seqlen(self).defined() ? std::optional<Tensor>(at::_nested_get_max_seqlen(self)) : ::std::nullopt, std::optional<c10::SymInt>(at::_nested_get_values(self).sym_size(0))) : at::_nested_from_padded(grad, self._nested_tensor_size())"
  2285. padding: non_differentiable
  # _nested_from_padded_tensor: the gradient for padded is grad converted with
  # to_padded_tensor_symint using padding value 0.0 and padded's symbolic sizes as
  # the output size; offsets and dummy are non_differentiable. min_seqlen and
  # max_seqlen have no entry here.
  2286. - name: _nested_from_padded_tensor(Tensor padded, Tensor offsets, Tensor dummy, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None, SymInt? sum_S=None) -> Tensor
  2287. padded: grad.to_padded_tensor_symint(0.0, at::OptionalArrayRef<c10::SymInt>(padded.sym_sizes()))
  2288. offsets: non_differentiable
  2289. dummy: non_differentiable
  # _nested_view_from_buffer: the self gradient is grad.values(); nested_size and
  # nested_strides are non_differentiable.
  # NOTE(review): `offsets` is a Tensor input with no entry in this block -- confirm
  # whether it should also be marked non_differentiable.
  2290. - name: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
  2291. self: grad.values()
  2292. nested_size: non_differentiable
  2293. nested_strides: non_differentiable
  # _nested_view_from_jagged: the self gradient is grad.values(); every other
  # tensor input (offsets, lengths, dummy, min_seqlen, max_seqlen) is
  # non_differentiable.
  2294. - name: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None) -> Tensor(a)
  2295. self: grad.values()
  2296. offsets: non_differentiable
  2297. lengths: non_differentiable
  2298. dummy: non_differentiable
  2299. min_seqlen: non_differentiable
  2300. max_seqlen: non_differentiable
  # _nested_get_values: the self gradient is built by calling
  # _nested_view_from_jagged on grad with self's offsets, jagged dummy, lengths and
  # ragged index; min/max seqlen are wrapped in std::optional only when defined,
  # otherwise ::std::nullopt is passed.
  2301. - name: _nested_get_values(Tensor(a) self) -> Tensor(a)
  2302. self: "_nested_view_from_jagged(grad, at::_nested_get_offsets(self), at::_nested_get_jagged_dummy(self), at::_nested_get_lengths(self), at::_nested_get_ragged_idx(self), at::_nested_get_min_seqlen(self).defined() ? std::optional<Tensor>(at::_nested_get_min_seqlen(self)) : ::std::nullopt, at::_nested_get_max_seqlen(self).defined() ? std::optional<Tensor>(at::_nested_get_max_seqlen(self)) : ::std::nullopt)"
  2303. # Transformer
  # _safe_softmax: the self gradient reuses _softmax_backward_data with the forward
  # result and self's scalar type. The `result` formula is
  # result * (self_t - safe_logsumexp_jvp(self_p, self_t, {dim}, true)).
  2304. - name: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
  2305. self: _softmax_backward_data(grad, result, dim, self.scalar_type())
  2306. result: result * (self_t - safe_logsumexp_jvp(self_p, self_t, {dim}, true))
  # _scaled_dot_product_efficient_attention: only `output` is flagged
  # differentiable. query, key, value and attn_bias share one backward call that
  # also receives the non-differentiable outputs (log_sumexp, philox_seed,
  # philox_offset) and grad_input_mask. compute_log_sumexp is not forwarded.
  2307. - name: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  2308. output_differentiability: [True, False, False, False]
  2309. query, key, value, attn_bias: _scaled_dot_product_efficient_attention_backward(grad, query, key, value, attn_bias, output, log_sumexp, philox_seed, philox_offset, dropout_p, grad_input_mask, is_causal, scale)
  # _scaled_dot_product_flash_attention: only `output` is flagged differentiable.
  # query, key and value share one backward call that receives the outputs
  # logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, rng_state and unused.
  # return_debug_mask and debug_attn_mask are not forwarded.
  2310. - name: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
  2311. output_differentiability: [True, False, False, False, False, False, False, False, False]
  2312. query, key, value: _scaled_dot_product_flash_attention_backward_symint(grad, query, key, value, output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale)
  # _scaled_dot_product_flash_attention_for_cpu: only `output` is flagged
  # differentiable. query, key and value share one backward call that receives
  # output, logsumexp and attn_mask; attn_mask itself has no gradient formula.
  2313. - name: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
  2314. output_differentiability: [True, False]
  2315. query, key, value: _scaled_dot_product_flash_attention_for_cpu_backward(grad, query, key, value, output, logsumexp, dropout_p, is_causal, attn_mask, scale)
  # _flash_attention_forward: only `output` is flagged differentiable. query, key
  # and value share one backward call that receives softmax_logsumexp, rng_state
  # and unused from the outputs, plus window_size_left / window_size_right.
  # seqused_k, alibi_slopes and return_debug_mask are not forwarded.
  2316. - name: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
  2317. output_differentiability: [True, False, False, False, False]
  2318. query, key, value: _flash_attention_backward_symint(grad, query, key, value, output, softmax_logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right)
  # _efficient_attention_forward: only `output` is flagged differentiable. query,
  # key, value and bias share one backward call. It is given the *output*
  # max_seqlen_batch_q / max_seqlen_batch_k rather than the max_seqlen_q /
  # max_seqlen_k inputs, and passes bias.requires_grad() as the flag after
  # custom_mask_type. seqlen_k, window_size and compute_log_sumexp are not forwarded.
  2319. - name: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
  2320. output_differentiability: [True, False, False, False, False, False]
  2321. query, key, value, bias: _efficient_attention_backward_symint(grad, query, key, value, bias, output, cu_seqlens_q, cu_seqlens_k, max_seqlen_batch_q, max_seqlen_batch_k, logsumexp, dropout_p, philox_seed, philox_offset, custom_mask_type, bias.requires_grad(), scale)
  # _cudnn_attention_forward: only `output` is flagged differentiable. query, key
  # and value share one backward call; attn_bias is passed to it but has no
  # gradient formula of its own.
  # NOTE(review): cum_seq_q, cum_seq_k, max_q and max_k appear both as inputs and
  # as named outputs of this op -- confirm which binding the formula resolves to.
  2322. - name: _cudnn_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  2323. output_differentiability: [True, False, False, False, False, False, False, False, False]
  2324. query, key, value: _cudnn_attention_backward_symint(grad, query, key, value, output, logsumexp, philox_seed, philox_offset, attn_bias, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, scale)
  # Autograd entry for _scaled_dot_product_cudnn_attention.
  # Only the first return value (`output`) is marked differentiable.
  # Gradients are defined for query, key and value only. The backward call reuses the
  # forward's returned logsumexp, philox_seed/philox_offset, cum_seq_q/cum_seq_k and max_q/max_k.
  2325. - name: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  2326. output_differentiability: [True, False, False, False, False, False, False, False, False]
  2327. query, key, value: _scaled_dot_product_cudnn_attention_backward_symint(grad, query, key, value, output, logsumexp, philox_seed, philox_offset, attn_bias, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, scale)
  # Autograd entry for _scaled_dot_product_fused_attention_overrideable.
  # Only the first return value (`output`) is marked differentiable.
  # Unlike the cudnn entries, attn_bias also receives a gradient here. grad_input_mask is not
  # part of the forward signature (presumably supplied by the code generator to indicate which
  # of the listed inputs need gradients; confirm against torchgen).
  2328. - name: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  2329. output_differentiability: [True, False, False, False, False, False, False, False, False]
  2330. query, key, value, attn_bias: _scaled_dot_product_fused_attention_overrideable_backward_symint(grad, query, key, value, attn_bias, grad_input_mask, output, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale)
  2331. # fft
  # The three FFT primitives below each define a backward formula for `self` and set
  # `result: auto_linear` (presumably the forward-mode formula is derived automatically
  # because the op is linear in `self`; confirm against the torchgen documentation).
  # _fft_r2c: the backward additionally receives the symbolic size of `self` along the last
  # entry of `dim`.
  2332. - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
  2333. self: fft_r2c_backward(grad, dim, normalization, onesided, self.sym_size(dim.back()))
  2334. result: auto_linear
  # _fft_c2r: last_dim_size is a forward argument only; it is not passed to the backward.
  2335. - name: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
  2336. self: fft_c2r_backward(grad, dim, normalization)
  2337. result: auto_linear
  # _fft_c2c: the backward is the same op applied to grad with the `forward` flag negated.
  2338. - name: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
  2339. self: _fft_c2c_symint(grad, dim, normalization, !forward)
  2340. result: auto_linear
  # unbind.int has two dispatch-specific formula sets:
  #   Default: unbind_backward over all output grads.
  #   AutogradNestedTensor: picks unbind_backward_nested_jagged when self's layout is
  #   c10::kJagged, otherwise unbind_backward_nested using the nested sizes read from
  #   self's nested tensor impl.
  2341. - name: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
  2342. dispatch:
  2343. Default:
  2344. self: unbind_backward(grads, dim)
  2345. result: auto_linear
  2346. AutogradNestedTensor:
  2347. self: "self.layout() == c10::kJagged ? unbind_backward_nested_jagged(grads, self, dim) : unbind_backward_nested(grads, at::native::get_nested_tensor_impl(self)->get_nested_sizes(), dim, self.options())"
  2348. result: auto_linear
  # stack: the backward receives the scalar types of the input tensors (via
  # to_args_scalartypes) alongside grad and dim; `result` uses a dedicated stack_jvp helper
  # rather than auto_linear.
  2349. - name: stack(Tensor[] tensors, int dim=0) -> Tensor
  2350. tensors: stack_tensors_backward(grad, dim, to_args_scalartypes(tensors))
  2351. result: stack_jvp(tensors, dim)
  2352. # fused RNN kernels
  2353. # Only the first two of _thnn_fused_lstm_cell outputs can have gradients.
  2354. # _thnn_fused_lstm_cell outputs: (hy, cy, workspace)
  # The formula selects the backward at runtime: when GradMode is enabled it calls
  # _thnn_differentiable_lstm_cell_backward (presumably so the backward itself can be
  # differentiated again; confirm), otherwise the fused backward, which takes result2
  # (the workspace output) and whether input_bias is defined.
  2355. - name: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
  2356. output_differentiability: [True, True, False]
  2357. input_gates, hidden_gates, cx, input_bias, hidden_bias: "GradMode::is_enabled() ? _thnn_differentiable_lstm_cell_backward(grads[0], grads[1], input_gates, hidden_gates, input_bias, hidden_bias, cx, result1) : _thnn_fused_lstm_cell_backward(grads[0], grads[1], cx, result1, result2, input_bias.defined())"
  # _thnn_fused_gru_cell: if grad is undefined the formula yields a default-constructed
  # 5-tuple of Tensors. Otherwise it uses the differentiable backward when GradMode is
  # enabled and the fused backward (taking result1 and whether input_bias is defined)
  # when it is not.
  2358. - name: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)
  2359. input_gates, hidden_gates, hx, input_bias, hidden_bias: "grad.defined() ? (GradMode::is_enabled() ? _thnn_differentiable_gru_cell_backward(grad, input_gates, hidden_gates, hx, input_bias, hidden_bias) : _thnn_fused_gru_cell_backward(grad, result1, input_bias.defined())) : std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>()"
  2360. # PackedSequence helpers
  # Only `input` has a gradient formula; the backward takes input's symbolic sizes, the
  # second forward output (result1) and batch_first. `lengths` has no formula here.
  2361. - name: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)
  2362. input: _pack_padded_sequence_backward_symint(grad, input.sym_sizes(), result1, batch_first)
  2363. # TH wrappers
  # Comparison ops (eq/ge/gt/le/lt/ne, Scalar and Tensor overloads): each entry only marks
  # its single output as non-differentiable and defines no gradient formula.
  2364. - name: eq.Scalar(Tensor self, Scalar other) -> Tensor
  2365. output_differentiability: [False]
  2366. - name: eq.Tensor(Tensor self, Tensor other) -> Tensor
  2367. output_differentiability: [False]
  2368. - name: ge.Scalar(Tensor self, Scalar other) -> Tensor
  2369. output_differentiability: [False]
  2370. - name: ge.Tensor(Tensor self, Tensor other) -> Tensor
  2371. output_differentiability: [False]
  2372. - name: gt.Scalar(Tensor self, Scalar other) -> Tensor
  2373. output_differentiability: [False]
  2374. - name: gt.Tensor(Tensor self, Tensor other) -> Tensor
  2375. output_differentiability: [False]
  2376. - name: le.Scalar(Tensor self, Scalar other) -> Tensor
  2377. output_differentiability: [False]
  2378. - name: le.Tensor(Tensor self, Tensor other) -> Tensor
  2379. output_differentiability: [False]
  2380. - name: lt.Scalar(Tensor self, Scalar other) -> Tensor
  2381. output_differentiability: [False]
  2382. - name: lt.Tensor(Tensor self, Tensor other) -> Tensor
  2383. output_differentiability: [False]
  2384. - name: ne.Scalar(Tensor self, Scalar other) -> Tensor
  2385. output_differentiability: [False]
  2386. - name: ne.Tensor(Tensor self, Tensor other) -> Tensor
  2387. output_differentiability: [False]
  # multinomial and nonzero: the single output of each is marked non-differentiable;
  # no gradient formula is defined.
  2388. - name: multinomial(Tensor self, SymInt num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
  2389. output_differentiability: [False]
  2390. - name: nonzero(Tensor self) -> Tensor
  2391. output_differentiability: [False]
  # segment_reduce: only `data` has a gradient formula.
  # NOTE(review): `indices` and `unsafe` are forward arguments that are not passed to
  # _segment_reduce_backward; confirm this is intentional.
  2392. - name: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor
  2393. data: _segment_reduce_backward(grad, result, data, reduce, lengths, offsets, axis, initial)
  # _pin_memory: the incoming gradient is passed through to `self` unchanged.
  2394. - name: _pin_memory(Tensor self, Device? device=None) -> Tensor
  2395. self: grad
  # _new_zeros_with_same_feature_meta: both tensor inputs and the output are declared
  # non-differentiable.
  2396. - name: _new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor
  2397. self: non_differentiable
  2398. other: non_differentiable
  2399. output_differentiability: [False]
  # Test-only op (per its _test_ prefix): the gradient for `self` is produced by
  # warn_backwards(grad) (presumably emits a warning during backward; confirm in the helper).
  2400. - name: _test_warn_in_autograd(Tensor self) -> Tensor
  2401. self: warn_backwards(grad)
  # Test-only op with a different backward formula per dispatch key, so each key is
  # distinguishable by its result:
  #   Default: grad expanded to self's sizes, plus 1 (also sets result: auto_linear)
  #   AutogradNestedTensor: grad * grad
  #   AutogradCUDA: grad expanded to self's sizes, times 2
  2402. - name: _test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor
  2403. dispatch:
  2404. Default:
  2405. self: grad.expand_symint(self.sym_sizes()) + 1
  2406. result: auto_linear
  2407. AutogradNestedTensor:
  2408. self: grad.mul(grad)
  2409. AutogradCUDA:
  2410. self: grad.expand_symint(self.sym_sizes()) * 2
  # Test-only op that defines a formula for the AutogradNestedTensor key only
  # (no Default entry): grad * grad + grad.
  2411. - name: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
  2412. dispatch:
  2413. AutogradNestedTensor:
  2414. self: grad.mul(grad).add(grad)
  # Test-only view op (the return aliases self, per the Tensor(a) annotation) with
  # per-key formulas: Default reshapes grad like self; AutogradCUDA does the same and adds 1.
  2415. - name: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
  2416. dispatch:
  2417. Default:
  2418. self: grad.reshape_as(self)
  2419. AutogradCUDA:
  2420. self: grad.reshape_as(self) + 1
  # _efficientzerotensor: factory op with no tensor inputs; its output is marked
  # non-differentiable.
  2421. - name: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  2422. output_differentiability: [False]
  # scatter_reduce.two: self and src share one backward call; index is declared
  # non-differentiable. The `result` formula uses the _p/_t suffixed forms of self and src
  # (presumably primal and tangent values for forward-mode AD; confirm against torchgen).
  2423. - name: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
  2424. self, src: scatter_reduce_backward(grad, self, dim, index, src, reduce, include_self, result)
  2425. index: non_differentiable
  2426. result: scatter_reduce_jvp(self_p, self_t, dim, index, src_p, src_t, reduce, include_self, result)
  # Airy and Bessel special functions: the single tensor input of each op is declared
  # non_differentiable; no derivative formula is provided.
  2427. - name: special_airy_ai(Tensor x) -> Tensor
  2428. x: non_differentiable
  2429. - name: special_bessel_j0(Tensor self) -> Tensor
  2430. self: non_differentiable
  2431. - name: special_bessel_j1(Tensor self) -> Tensor
  2432. self: non_differentiable
  2433. - name: special_bessel_y0(Tensor self) -> Tensor
  2434. self: non_differentiable
  2435. - name: special_bessel_y1(Tensor self) -> Tensor
  2436. self: non_differentiable
  # Chebyshev polynomials (t, u, v, w). Each has three overloads: Tensor/Tensor, x_scalar
  # and n_scalar. Every Tensor argument is declared non_differentiable; Scalar arguments
  # get no entry.
  2437. - name: special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
  2438. x: non_differentiable
  2439. n: non_differentiable
  2440. - name: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
  2441. n: non_differentiable
  2442. - name: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
  2443. x: non_differentiable
  2444. - name: special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
  2445. x: non_differentiable
  2446. n: non_differentiable
  2447. - name: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
  2448. n: non_differentiable
  2449. - name: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
  2450. x: non_differentiable
  2451. - name: special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
  2452. x: non_differentiable
  2453. n: non_differentiable
  2454. - name: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
  2455. n: non_differentiable
  2456. - name: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
  2457. x: non_differentiable
  2458. - name: special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
  2459. x: non_differentiable
  2460. n: non_differentiable
  2461. - name: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
  2462. n: non_differentiable
  2463. - name: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
  2464. x: non_differentiable
  # Hermite (h, he), Laguerre and Legendre polynomials. Same pattern as the Chebyshev
  # entries: three overloads each, every Tensor argument declared non_differentiable.
  2465. - name: special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor
  2466. x: non_differentiable
  2467. n: non_differentiable
  2468. - name: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor
  2469. n: non_differentiable
  2470. - name: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor
  2471. x: non_differentiable
  2472. - name: special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor
  2473. x: non_differentiable
  2474. n: non_differentiable
  2475. - name: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor
  2476. n: non_differentiable
  2477. - name: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor
  2478. x: non_differentiable
  2479. - name: special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor
  2480. x: non_differentiable
  2481. n: non_differentiable
  2482. - name: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor
  2483. n: non_differentiable
  2484. - name: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor
  2485. x: non_differentiable
  2486. - name: special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor
  2487. x: non_differentiable
  2488. n: non_differentiable
  2489. - name: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor
  2490. n: non_differentiable
  2491. - name: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor
  2492. x: non_differentiable
  # Modified Bessel functions (i0, i1, k0, k1) and the scaled k0/k1 variants: the single
  # tensor input of each op is declared non_differentiable.
  2493. - name: special_modified_bessel_i0(Tensor self) -> Tensor
  2494. self: non_differentiable
  2495. - name: special_modified_bessel_i1(Tensor self) -> Tensor
  2496. self: non_differentiable
  2497. - name: special_modified_bessel_k0(Tensor self) -> Tensor
  2498. self: non_differentiable
  2499. - name: special_modified_bessel_k1(Tensor self) -> Tensor
  2500. self: non_differentiable
  2501. - name: special_scaled_modified_bessel_k0(Tensor x) -> Tensor
  2502. x: non_differentiable
  2503. - name: special_scaled_modified_bessel_k1(Tensor x) -> Tensor
  2504. x: non_differentiable
  # Shifted Chebyshev polynomials (t, u, v, w). Each has three overloads: Tensor/Tensor,
  # x_scalar and n_scalar. Every Tensor argument is declared non_differentiable.
  2505. - name: special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
  2506. x: non_differentiable
  2507. n: non_differentiable
  2508. - name: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
  2509. n: non_differentiable
  2510. - name: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
  2511. x: non_differentiable
  2512. - name: special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
  2513. x: non_differentiable
  2514. n: non_differentiable
  2515. - name: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
  2516. n: non_differentiable
  2517. - name: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
  2518. x: non_differentiable
  2519. - name: special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
  2520. x: non_differentiable
  2521. n: non_differentiable
  2522. - name: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
  2523. n: non_differentiable
  2524. - name: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
  2525. x: non_differentiable
  2526. - name: special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
  2527. x: non_differentiable
  2528. n: non_differentiable
  2529. - name: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
  2530. n: non_differentiable
  2531. - name: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
  2532. x: non_differentiable
  # special_spherical_bessel_j0: its tensor input is declared non_differentiable.
  2533. - name: special_spherical_bessel_j0(Tensor x) -> Tensor
  2534. x: non_differentiable
  # _reshape_copy: the backward reshapes grad back to self's symbolic sizes.
  2535. - name: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
  2536. self: grad.reshape_symint(self.sym_sizes())
  2537. result: auto_linear
  2538. # note(crcrpar): the `torchgen/api/autograd` logic would undesirably replace the `self` and `other` substrings inside function names.
  # Formulas are written per list element: grads, self, other and result are indexed
  # with [i]. The unindexed _t/_p names in `result` are presumably expanded per element
  # by the code generator (confirm against torchgen).
  2539. - name: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
  2540. self: div_tensor_self_backward(grads[i], other[i], self[i].scalar_type())
  2541. other: div_tensor_other_backward(grads[i], self[i], other[i])
  2542. result: (self_t - other_t * result[i]) / other_p
  # _foreach_pow.List: per-element gradients for both self and exponent. The `result`
  # formula reuses the two backward helpers on conjugated tangents and conjugates the sum.
  2543. - name: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
  2544. self: pow_backward_self(grads[i], self[i], exponent[i])
  2545. exponent: pow_backward_exponent(grads[i], self[i], exponent[i], result[i])
  2546. result: (pow_backward_self(self_t.conj(), self_p, exponent_p) + pow_backward_exponent(exponent_t.conj(), self_p, exponent_p, result[i])).conj()
  # _foreach_pow.ScalarList: exponents are Scalars, so only self has a gradient.
  # `result` applies pow_backward to the conjugated tangent and conjugates the output.
  2547. - name: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
  2548. self: pow_backward(grads[i], self[i], exponent[i])
  2549. result: pow_backward(self_t.conj(), self_p, exponent[i]).conj()
  # _foreach_pow.ScalarAndTensor: the base `self` is a single Scalar (not indexed), so only
  # the exponent list has a gradient. No `result` formula is defined for this overload.
  2550. - name: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
  2551. exponent: pow_backward_exponent(grads[i], self, exponent[i], result[i])
  2552. # note(crcrpar): the following definitions seem necessary because the reference native functions
  2553. # `maximum` and `minimum` have no overload taking a Scalar as the second argument.
  # Backward: the gradient is halved where self[i] == scalar and zeroed (masked_fill_)
  # where self[i] > scalar; elsewhere it passes through unchanged.
  # `result`: scalar + w * (self_t - scalar), where w is 0.5 at ties and otherwise the
  # (self_p < scalar) mask cast to the result dtype.
  2554. - name: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
  2555. self: at::where(self[i] == scalar, grads[i] / 2, grads[i]).masked_fill_(self[i] > scalar, 0)
  2556. result: scalar + at::where(self_p == scalar, at::scalar_tensor(0.5, result[i].options()), (self_p < scalar).to(result[i].scalar_type())) * (self_t - scalar)
  # _foreach_minimum.ScalarList: same formulas as the Scalar overload, with a per-element
  # scalars[i] in place of the single scalar.
  2557. - name: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
  2558. self: at::where(self[i] == scalars[i], grads[i] / 2, grads[i]).masked_fill_(self[i] > scalars[i], 0)
  2559. result: scalars[i] + at::where(self_p == scalars[i], at::scalar_tensor(0.5, result[i].options()), (self_p < scalars[i]).to(result[i].scalar_type())) * (self_t - scalars[i])
  # _foreach_maximum.Scalar: mirror image of _foreach_minimum.Scalar. The gradient is halved
  # where self[i] == scalar and zeroed where self[i] < scalar; `result` uses the
  # (self_p > scalar) mask.
  2560. - name: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
  2561. self: at::where(self[i] == scalar, grads[i] / 2, grads[i]).masked_fill_(self[i] < scalar, 0)
  2562. result: scalar + at::where(self_p == scalar, at::scalar_tensor(0.5, result[i].options()), (self_p > scalar).to(result[i].scalar_type())) * (self_t - scalar)
  # _foreach_maximum.ScalarList: same formulas as the Scalar overload, with a per-element
  # scalars[i] in place of the single scalar.
  2563. - name: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
  2564. self: at::where(self[i] == scalars[i], grads[i] / 2, grads[i]).masked_fill_(self[i] < scalars[i], 0)
  2565. result: scalars[i] + at::where(self_p == scalars[i], at::scalar_tensor(0.5, result[i].options()), (self_p > scalars[i]).to(result[i].scalar_type())) * (self_t - scalars[i])
  2566. # note(crcrpar): forward-mode AD is tricky for a simple string replace to handle:
  2567. # formula.replace("p", "ord") produces `norm_jvord(self_ord, self_t, ord, result)`
  # Both formulas are therefore spelled out with `ord` explicitly.
  # NOTE(review): the `dtype` argument is not passed to either formula; confirm this is
  # intentional.
  2568. - name: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
  2569. self: norm_backward(grads[i], self[i], ord, result[i])
  2570. result: norm_jvp(self_p, self_t, ord, result[i])