// common.h
  1. #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
  2. // !!!! PLEASE READ !!!!
  // Minimize the headers that _avx*.cc files include, directly or transitively.
  // Functions defined in those headers are compiled with platform-dependent
  // compiler options, and other translation units can end up reusing that code,
  // which leads to illegal-instruction errors at run time.
  7. // Common utilities for writing performance kernels and easy dispatching of
  8. // different backends.
  9. /*
  The general workflow is as follows. Say we want to implement a function
  called void foo(int a, float b).
  12. In foo.h, do:
  13. void foo(int a, float b);
  14. In foo_avx512.cc, do:
  15. void foo__avx512(int a, float b) {
  16. [actual avx512 implementation]
  17. }
  18. In foo_avx2.cc, do:
  19. void foo__avx2(int a, float b) {
  20. [actual avx2 implementation]
  21. }
  22. In foo_avx.cc, do:
  23. void foo__avx(int a, float b) {
  24. [actual avx implementation]
  25. }
  26. In foo.cc, do:
  27. // The base implementation should *always* be provided.
  28. void foo__base(int a, float b) {
  29. [base, possibly slow implementation]
  30. }
  31. decltype(foo__base) foo__avx512;
  32. decltype(foo__base) foo__avx2;
  33. decltype(foo__base) foo__avx;
  34. void foo(int a, float b) {
  35. // You should always order things by their preference, faster
  36. // implementations earlier in the function.
  37. AVX512_DO(foo, a, b);
  38. AVX2_DO(foo, a, b);
  39. AVX_DO(foo, a, b);
  40. BASE_DO(foo, a, b);
  41. }
  42. */
  43. // Details: this functionality basically covers the cases for both build time
  44. // and run time architecture support.
  45. //
  46. // During build time:
  // The build system should provide flags CAFFE2_PERF_WITH_AVX512,
  // CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that correspond to the
  // __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
  // compiler provides. This header also honors CAFFE2_PERF_WITH_SVE, which
  // enables the SVE_DO macro. Note that we do not use the compiler flags but
  // rely on the build system flags, because the common files (like foo.cc
  // above) will always be built without __AVX512F__, __AVX512DQ__,
  // __AVX512VL__, __AVX2__ and __AVX__.
  54. // During run time:
  55. // we use cpuinfo to identify cpu support and run the proper functions.
  56. #pragma once
  57. #if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \
  58. defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX)
  59. #include <cpuinfo.h>
  60. #endif
  // Dispatch ("DO") macros. Place them in an entry function, such as foo() in
  // the example above, to route the call to an implementation chosen by CPU
  // capability.
  //
  // BASE_DO(funcname, args...) unconditionally returns the result of
  // funcname##__base(args...) from the enclosing function, so it is the last
  // entry in a dispatch sequence. The expansion already ends with a semicolon.
  #define BASE_DO(funcname, ...) \
    return funcname##__base(__VA_ARGS__);
  #if defined(CAFFE2_PERF_WITH_SVE)
  // SVE_DO(funcname, args...): when cpuinfo_initialize() and
  // cpuinfo_has_arm_sve() both return true, return funcname##__sve(args...)
  // from the enclosing function; otherwise control falls through to whatever
  // follows the macro. The check result is held in a static const local that
  // lives in the expansion's own brace block.
  #define SVE_DO(funcname, ...)                              \
    {                                                        \
      static const bool caffe2_perf_use_sve =                \
          cpuinfo_initialize() && cpuinfo_has_arm_sve();     \
      if (caffe2_perf_use_sve) {                             \
        return funcname##__sve(__VA_ARGS__);                 \
      }                                                      \
    }
  #else // !CAFFE2_PERF_WITH_SVE
  // CAFFE2_PERF_WITH_SVE is not defined: SVE_DO expands to nothing.
  #define SVE_DO(funcname, ...)
  #endif // CAFFE2_PERF_WITH_SVE
  #if defined(CAFFE2_PERF_WITH_AVX512)
  // AVX512_DO(funcname, args...): when cpuinfo_initialize() returns true and
  // cpuinfo reports all of avx512f, avx512dq and avx512vl, return
  // funcname##__avx512(args...) from the enclosing function; otherwise control
  // falls through to whatever follows the macro. The check result is held in a
  // static const local that lives in the expansion's own brace block.
  #define AVX512_DO(funcname, ...)                           \
    {                                                        \
      static const bool caffe2_perf_use_avx512 =             \
          cpuinfo_initialize() &&                            \
          cpuinfo_has_x86_avx512f() &&                       \
          cpuinfo_has_x86_avx512dq() &&                      \
          cpuinfo_has_x86_avx512vl();                        \
      if (caffe2_perf_use_avx512) {                          \
        return funcname##__avx512(__VA_ARGS__);              \
      }                                                      \
    }
  #else // !CAFFE2_PERF_WITH_AVX512
  // CAFFE2_PERF_WITH_AVX512 is not defined: AVX512_DO expands to nothing.
  #define AVX512_DO(funcname, ...)
  #endif // CAFFE2_PERF_WITH_AVX512
  #if defined(CAFFE2_PERF_WITH_AVX2)
  // AVX2_DO(funcname, args...): when cpuinfo_initialize() and
  // cpuinfo_has_x86_avx2() both return true, return funcname##__avx2(args...)
  // from the enclosing function; otherwise control falls through to whatever
  // follows the macro. The check result is held in a static const local that
  // lives in the expansion's own brace block.
  #define AVX2_DO(funcname, ...)                             \
    {                                                        \
      static const bool caffe2_perf_use_avx2 =               \
          cpuinfo_initialize() && cpuinfo_has_x86_avx2();    \
      if (caffe2_perf_use_avx2) {                            \
        return funcname##__avx2(__VA_ARGS__);                \
      }                                                      \
    }
  // AVX2_FMA_DO(funcname, args...): same shape as AVX2_DO, but additionally
  // requires cpuinfo_has_x86_fma3() and dispatches to funcname##__avx2_fma.
  #define AVX2_FMA_DO(funcname, ...)                         \
    {                                                        \
      static const bool caffe2_perf_use_avx2_fma =           \
          cpuinfo_initialize() &&                            \
          cpuinfo_has_x86_avx2() &&                          \
          cpuinfo_has_x86_fma3();                            \
      if (caffe2_perf_use_avx2_fma) {                        \
        return funcname##__avx2_fma(__VA_ARGS__);            \
      }                                                      \
    }
  #else // !CAFFE2_PERF_WITH_AVX2
  // CAFFE2_PERF_WITH_AVX2 is not defined: both macros expand to nothing.
  #define AVX2_DO(funcname, ...)
  #define AVX2_FMA_DO(funcname, ...)
  #endif // CAFFE2_PERF_WITH_AVX2
  #if defined(CAFFE2_PERF_WITH_AVX)
  // AVX_DO(funcname, args...): when cpuinfo_initialize() and
  // cpuinfo_has_x86_avx() both return true, return funcname##__avx(args...)
  // from the enclosing function; otherwise control falls through to whatever
  // follows the macro. The check result is held in a static const local that
  // lives in the expansion's own brace block.
  #define AVX_DO(funcname, ...)                              \
    {                                                        \
      static const bool caffe2_perf_use_avx =                \
          cpuinfo_initialize() && cpuinfo_has_x86_avx();     \
      if (caffe2_perf_use_avx) {                             \
        return funcname##__avx(__VA_ARGS__);                 \
      }                                                      \
    }
  // AVX_F16C_DO(funcname, args...): same shape as AVX_DO, but additionally
  // requires cpuinfo_has_x86_f16c() and dispatches to funcname##__avx_f16c.
  #define AVX_F16C_DO(funcname, ...)                         \
    {                                                        \
      static const bool caffe2_perf_use_avx_f16c =           \
          cpuinfo_initialize() &&                            \
          cpuinfo_has_x86_avx() &&                           \
          cpuinfo_has_x86_f16c();                            \
      if (caffe2_perf_use_avx_f16c) {                        \
        return funcname##__avx_f16c(__VA_ARGS__);            \
      }                                                      \
    }
  #else // !CAFFE2_PERF_WITH_AVX
  // CAFFE2_PERF_WITH_AVX is not defined: both macros expand to nothing.
  #define AVX_DO(funcname, ...)
  #define AVX_F16C_DO(funcname, ...)
  #endif // CAFFE2_PERF_WITH_AVX
  128. #else
  129. #error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
  130. #endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)