Parallel.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. #pragma once
  3. #include <ATen/Config.h>
  4. #include <c10/macros/Macros.h>
  5. #include <functional>
  6. #include <string>
  7. namespace at {
  // Integer "divide and round up": evaluates (x + y - 1) / y using int64_t
  // arithmetic. For x >= 0 and y > 0 this equals the ceiling of x / y, i.e. the
  // number of chunks of size y needed to cover x elements.
  // No checks are performed: y == 0 and overflow of x + y - 1 are the caller's
  // responsibility.
  inline int64_t divup(int64_t x, int64_t y) {
    // Bias the numerator so that truncating division rounds upwards.
    const int64_t biased_numerator = x + y - 1;
    return biased_numerator / y;
  }
  11. // Called during new thread initialization. Within this header it is
      // invoked by internal::lazy_init_num_threads() the first time that helper
      // runs on a given thread.
  12. TORCH_API void init_num_threads();
  13. // Sets the number of threads to be used in a parallel region.
  14. TORCH_API void set_num_threads(int /*nthreads*/);
  15. // Returns the maximum number of threads that may be used in a parallel region.
  16. TORCH_API int get_num_threads();
  17. // Returns the current thread number (starting from 0)
  18. // in the current parallel region, or 0 in the sequential region.
      // internal::ThreadIdGuard reads this value at construction so that it can
      // put it back when the guard is destroyed.
  19. TORCH_API int get_thread_num();
  20. // Returns true when the calling code is running inside a parallel region.
  21. TORCH_API bool in_parallel_region();
  22. namespace internal {
  23. // Initialise num_threads lazily at first parallel call
  24. inline void lazy_init_num_threads() {
  25. thread_local bool init = false;
  26. if (C10_UNLIKELY(!init)) {
  27. at::init_num_threads();
  28. init = true;
  29. }
  30. }
  31. TORCH_API void set_thread_num(int /*id*/);
  32. class TORCH_API ThreadIdGuard {
  33. public:
  34. ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) {
  35. set_thread_num(new_id);
  36. }
  37. ~ThreadIdGuard() {
  38. set_thread_num(old_id_);
  39. }
  40. private:
  41. int old_id_;
  42. };
  43. } // namespace internal
  44. /*
  45. parallel_for
      Applies the user function f, in parallel, to chunks of the index range
      that runs from begin to end. Only the declaration appears at this point
      in the header.
  46. begin: index at which to start applying the user function
  47. end: index at which to stop applying the user function
      (NOTE(review): presumably exclusive, i.e. the range is [begin, end) -
      confirm against the definition)
  48. grain_size: number of elements per chunk. Impacts the degree of parallelization.
  49. f: user function applied in parallel to the chunks, signature:
  50. void f(int64_t begin, int64_t end)
  51. Warning: parallel_for does NOT copy thread-local
  52. state from the current thread to the worker threads.
  53. This means, for example, that Tensor operations CANNOT be used in the
  54. body of your function, only data pointers.
  55. */
  56. template <class F>
  57. inline void parallel_for(
  58. const int64_t begin,
  59. const int64_t end,
  60. const int64_t grain_size,
  61. const F& f);
  62. /*
  63. parallel_reduce
  64. begin: index at which to start applying the reduction
  65. end: index at which to stop applying the reduction
  66. grain_size: number of elements per chunk. Impacts the number of elements in
  67. the intermediate results tensor and the degree of parallelization.
  68. ident: identity for the binary combination function sf. sf(ident, x) needs to return
  69. x.
  70. f: function for reduction over a chunk. f needs to be of signature scalar_t
  71. f(int64_t partial_begin, int64_t partial_end, scalar_t ident)
  72. sf: function to combine two partial results. sf needs to be of signature
  73. scalar_t sf(scalar_t x, scalar_t y)
      Returns the combined result as a scalar_t.
  74. For example, you might have a tensor of 10000 entries and want to sum together
  75. all the elements. parallel_reduce with a grain_size of 2500 will then allocate
  76. an intermediate result tensor with 4 elements. Then it will execute the function
  77. "f" you provide and pass the beginning and end index of these chunks, so
  78. 0-2499, 2500-4999, etc. and the combination identity. It will then write out
  79. the result from each of these chunks into the intermediate result tensor. After
  80. that it'll reduce the partial results from each chunk into a single number using
  81. the combination function sf and the identity ident. For a total summation this
  82. would be "+" and 0 respectively. This is similar to tbb's approach [1], where
  83. you need to provide a function to accumulate a subrange, a function to combine
  84. two partial results and an identity.
      NOTE(review): the chunk ranges above are written as inclusive (0-2499);
      confirm whether the partial_end actually passed to f is exclusive (2500
      for the first chunk).
  85. Warning: parallel_reduce does NOT copy thread-local
  86. state from the current thread to the worker threads.
  87. This means, for example, that Tensor operations CANNOT be used in the
  88. body of your function, only data pointers.
  89. [1] https://software.intel.com/en-us/node/506154
  90. */
  91. template <class scalar_t, class F, class SF>
  92. inline scalar_t parallel_reduce(
  93. const int64_t begin,
  94. const int64_t end,
  95. const int64_t grain_size,
  96. const scalar_t ident,
  97. const F& f,
  98. const SF& sf);
  99. // Returns a detailed string describing the parallelization settings.
  100. TORCH_API std::string get_parallel_info();
  101. // Sets the number of threads used for inter-op parallelism.
  102. TORCH_API void set_num_interop_threads(int /*nthreads*/);
  103. // Returns the number of threads used for inter-op parallelism.
       // Note the type asymmetry: the setter takes an int, the getter returns size_t.
  104. TORCH_API size_t get_num_interop_threads();
  105. // Launches an inter-op parallel task. func is taken by value.
  106. TORCH_API void launch(std::function<void()> func);
  107. namespace internal {
       // Declared without TORCH_API, unlike the neighbouring functions.
       // NOTE(review): judging by the name, this launches fn without carrying
       // over the caller's thread state - confirm against the definition.
  108. void launch_no_thread_state(std::function<void()> fn);
  109. } // namespace internal
  110. // Launches an intra-op parallel task. func is taken by const reference.
  111. TORCH_API void intraop_launch(const std::function<void()>& func);
  112. // Returns the number of intra-op threads used by default.
  113. TORCH_API int intraop_default_num_threads();
  114. } // namespace at
  115. #if AT_PARALLEL_OPENMP
  116. #include <ATen/ParallelOpenMP.h> // IWYU pragma: keep
  117. #elif AT_PARALLEL_NATIVE
  118. #include <ATen/ParallelNative.h> // IWYU pragma: keep
  119. #endif
  120. #include <ATen/Parallel-inl.h> // IWYU pragma: keep
  121. #else
  122. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  123. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)