FunctionalStorageImpl.h 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <ATen/Tensor.h>
  4. #include <utility>
  5. namespace at::functionalization {
  6. // See Note [Functionalization Pass In Core]
  enum class InverseReturnMode {
    /// Functional inverses must always hand back a view.
    AlwaysView,
    /// Functional inverses must never hand back a view; they always return a
    /// copy (non-view) instead.
    NeverView,
    /// Functional inverses hand back a view, except when a (copying) scatter
    /// inverse exists -- in that case the scatter inverse is used instead.
    /// This avoids as_strided() calls, which can be difficult for subclasses
    /// to handle.
    ViewOrScatterInverse,
  };
  // Expands to a static member function `name()` that returns the macro
  // argument as a string literal. Meant to be placed inside the body of a
  // ViewMeta specialization, passing that specialization's own type name.
  #define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
    static const char* name() {                 \
      return #TYPE;                             \
    }
  // Expands to the declaration of a `SerializableTuple` alias: an `std::tuple`
  // whose element types are the macro arguments, in order. The expansion has
  // no trailing semicolon, so the use site must supply it.
  #define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
    using SerializableTuple = std::tuple<__VA_ARGS__>
  25. // ViewMeta is a class used by the functionalization pass to navigate between
  26. // a base tensor and a view tensor.
  27. // For example, if I call `b = a.view1(...)`
  28. // the functionalization pass will generate and store a ViewMeta specialization
  29. // for `view1` operation on b that looks like:
  30. //
  31. // struct TORCH_API view1_ViewMeta : public ViewMeta {
  32. // FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
  33. // FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
  34. // bool /* reapply_views */,
  35. // const std::vector<int64_t>&);
  36. //
  37. // view1_ViewMeta(const SerializableTuple& tpl)
  38. // : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
  39. //
  40. // view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
  41. // : ViewMeta(/*has_symbolic_inputs=*/false),
  42. // reapply_views(reapply_views),
  43. // size(size) {}
  44. //
  45. // Tensor forward(const Tensor& base) override {
  46. // return base.view1(...);
  47. // }
  48. //
  49. // Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
  50. // return at::functionalization::impl::view1_inverse(base, mutated_view,
  51. // ...);
  52. // }
  53. //
  54. // SerializableTuple to_serializable_tuple() {
  55. // return std::make_tuple(reapply_views, size);
  56. // }
  57. //
  58. // bool reapply_views;
  59. // std::vector<int64_t> size;
  60. // };
  61. //
  62. // The forward function describes how to replay view1 on a tensor.
  63. //
  64. // The reverse function describes how, given a tensor that is already a view,
  65. // to get the corresponding base tensor. See Note [Functionalization Pass:
  66. // View Inverses] for details.
  67. //
  68. // `SerializableTuple` is a typedef that defines an `std::tuple<...>` type
  69. // representing the `ViewMeta` instance state. Methods that take in/return such
  70. // a type are used for supporting pickle serialization.
  71. struct ViewMeta {
  72. ViewMeta(
  73. bool has_symbolic_inputs,
  74. bool is_multi_output = false,
  75. bool is_as_strided = false,
  76. int64_t out_idx = 0)
  77. : out_index(out_idx),
  78. is_multi_output(is_multi_output),
  79. is_as_strided(is_as_strided),
  80. has_symbolic_inputs(has_symbolic_inputs) {}
  81. virtual ~ViewMeta() = default;
  82. virtual Tensor forward(const Tensor& base) = 0;
  83. virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
  84. // See Note [out_idx in ViewMeta]
  85. int64_t out_index;
  86. // Tells us if this is a multi-output view
  87. bool is_multi_output;
  88. bool is_as_strided;
  89. // Tells us if this view operation has any symbolic inputs
  90. bool has_symbolic_inputs;
  91. // Returns a new ViewMeta with the same forward/reverse
  92. // functions, but a new out index.
  93. //
  94. // This method should be implemented by those `ViewMeta` that have more than
  95. // one output.
  96. virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
  97. TORCH_CHECK_NOT_IMPLEMENTED(
  98. false,
  99. "ViewMeta::to_out_index not implemented. ",
  100. "Likely because there's only one output.");
  101. }
  102. };
  103. // FunctionalStorageImpl is a subclass of StorageImpl used by the
  104. // functionalization pass. It has no underlying data (similar to meta storage).
  105. // It also knows how to reflect mutations to tensors in the absence of a valid
  106. // data pointer.
  107. //
  108. // A storage represents the state shared by (potentially multiple) views of the
  109. // same tensor. For example, in the following code:
  110. //
  111. // b = a.view1(...)
  112. // c = b.view2(...)
  113. // b.add_(1)
  114. // --> storage.add_update(b, {view1_meta})
  115. //
  116. // The call to add_(1) will result in a call to alias.add_update(b,
  117. // {view1_meta}), queueing up the mutation from b onto the alias. Later, suppose
  118. // c is used in an expression (e.g. you try to print c, or pass it to an
  119. // operator). Doing so will involve "syncing" c. First we apply any pending
  120. // updates to the alias, and then we regenerate c by replaying its views off of
  121. // the updated alias. E.g:
  122. //
  123. // print(str(c))
  124. // --> c.sync_()
  125. // --> alias.apply_updates() // after this, the alias will be updated to
  126. // reflect the mutation to b
  127. struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  128. public:
  129. struct Update {
  130. // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  131. const at::Tensor new_val;
  132. // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
  133. const std::vector<std::shared_ptr<ViewMeta>> view_metas;
  134. };
  135. explicit FunctionalStorageImpl(const Tensor& value);
  136. void add_update(
  137. const Tensor& updated_val,
  138. const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
  139. bool apply_updates();
  140. const Tensor& base() {
  141. return base_;
  142. }
  143. size_t generation() const {
  144. return generation_;
  145. }
  146. void freeze() {
  147. frozen_ = true;
  148. }
  149. c10::SymInt get_storage_size(bool before) {
  150. if (before) {
  151. return original_storage_size_;
  152. } else {
  153. return curr_storage_size_;
  154. }
  155. }
  156. ~FunctionalStorageImpl() override = default;
  157. uint64_t mutation_counter() {
  158. return mutation_counter_;
  159. }
  160. void mark_mutation() {
  161. mutation_counter_++;
  162. }
  163. void mark_mutation_during_no_grad_or_inference_mode() {
  164. mutation_counter_during_no_grad_or_inference_mode_++;
  165. }
  166. void mark_mutation_hidden_from_autograd() {
  167. mutation_counter_hidden_from_autograd_++;
  168. }
  169. bool are_all_mutations_under_no_grad_or_inference_mode() const {
  170. auto non_autograd_mutations =
  171. mutation_counter_during_no_grad_or_inference_mode_ +
  172. mutation_counter_hidden_from_autograd_;
  173. // The <= is because both counters will technically be incremented, if we
  174. // perform e.g. a triton kernel mutation under no_grad
  175. return mutation_counter_ <= non_autograd_mutations;
  176. }
  177. bool are_all_mutations_hidden_from_autograd() const {
  178. // mutations under no_grad / inference_mode are technically not hidden from
  179. // autograd - they change the version counter
  180. return mutation_counter_ <= mutation_counter_hidden_from_autograd_;
  181. }
  182. void mark_inductor_storage_resize(c10::SymInt new_size) {
  183. inductor_storage_resized_ = true;
  184. curr_storage_size_ = std::move(new_size);
  185. inductor_storage_resized_counter_++;
  186. }
  187. bool was_inductor_storage_resized() {
  188. return inductor_storage_resized_;
  189. }
  190. uint64_t inductor_storage_resized_counter() {
  191. return inductor_storage_resized_counter_;
  192. }
  193. private:
  194. // NB: base_ should always point to a tensor BELOW the current
  195. // functionalization layer. This is mainly to avoid reference cycles. e.g.
  196. // given `b = a.view(...)` Both a.storage_ and b.storage_ are a
  197. // FunctionStorageImpl containing an Walualias, with contains a Tensor
  198. // `base_`. In this case (where a and b are FunctionalTensorWrapper's), base_
  199. // should point not to a, but to a's unwrapped value, a.value_` See Note
  200. // [Functionalization: Walualias Removal] for a diagram that shows this
  201. // visually.
  202. at::Tensor base_;
  203. std::vector<Update> updates_;
  204. // generation_ gets incremented every time a mutation is queued onto the
  205. // alias. It is used to determine if a given tensor is "up to date", or if it
  206. // needs to be regenerated from the alias.
  207. size_t generation_ = 0;
  208. // If frozen, no more mutations are allowed on this storage. Once frozen, a
  209. // storage cannot be unfrozen.
  210. bool frozen_ = false;
  211. // These mutation counters are bumped on the storage
  212. // whenever a FunctionalTensorWrapper experiences a mutation.
  213. // When the mutation is under no_grad, or comes from a triton kernel, we also
  214. // bump the corresponding during_no_grad or hidden_from_autograd counters. Why
  215. // do we need to detect these two situations separately from "normal" input
  216. // mutations? (1) "normal" input mutations can mutate autograd metadata like
  217. // .grad_fn,
  218. // in which case they need to be replayed outside of the compiled graph
  219. // (2) "no_grad" input mutations are generally safe to keep in the graph (and
  220. // compile),
  221. // but they bump the tensor's VC, so we need to mark_dirty() on the inputs
  222. // in torch.compile
  223. // (3) mutations that are fully hidden from autograd (e.g. from a triton
  224. // kernel)
  225. // do not mutate any autograd state, and be fully kept in the graph
  226. // When we detect that an input was mutated, we need to be able to tell if:
  227. // (1) all of the mutations were from triton kernels
  228. // (2) all of the mutations were under no_grad
  229. uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0;
  230. uint64_t mutation_counter_ = 0;
  231. uint64_t mutation_counter_hidden_from_autograd_ = 0;
  232. // Used to tell if:
  233. // (1) There were any storage resizes on a graph input
  234. // (2) The original/curr storage size tell us if these resizes result in a nop
  235. bool inductor_storage_resized_ = false;
  236. uint64_t inductor_storage_resized_counter_ = 0;
  237. c10::SymInt original_storage_size_;
  238. c10::SymInt curr_storage_size_;
  239. };
  240. } // namespace at::functionalization
  241. #else
  242. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  243. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)