#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
// !!!! PLEASE READ !!!!
// Minimize the headers (transitively) included from _avx*.cc files: functions
// defined in those headers get compiled with platform-dependent compiler
// options, and other translation units can end up reusing them, which
// produces illegal-instruction errors at run time.

// Common utilities for writing performance kernels and easy dispatching of
// different backends.
/*
The general workflow shall be as follows, say we want to
implement a functionality called void foo(int a, float b).

In foo.h, do:
   void foo(int a, float b);

In foo_avx512.cc, do:
   void foo__avx512(int a, float b) {
     [actual avx512 implementation]
   }

In foo_avx2.cc, do:
   void foo__avx2(int a, float b) {
     [actual avx2 implementation]
   }

In foo_avx.cc, do:
   void foo__avx(int a, float b) {
     [actual avx implementation]
   }

In foo.cc, do:
   // The base implementation should *always* be provided.
   void foo__base(int a, float b) {
     [base, possibly slow implementation]
   }
   decltype(foo__base) foo__avx512;
   decltype(foo__base) foo__avx2;
   decltype(foo__base) foo__avx;
   void foo(int a, float b) {
     // You should always order things by their preference, faster
     // implementations earlier in the function.
     AVX512_DO(foo, a, b);
     AVX2_DO(foo, a, b);
     AVX_DO(foo, a, b);
     BASE_DO(foo, a, b);
   }
*/
// Details: this functionality basically covers the cases for both build time
// and run time architecture support.
//
// During build time:
//    The build system should provide the flags CAFFE2_PERF_WITH_AVX512,
//    CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX, which correspond to the
//    __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
//    compiler provides. Note that we do not use the compiler flags but rely on
//    the build system flags, because the common files (like foo.cc above) will
//    always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
//    and __AVX__.
//    CAFFE2_PERF_WITH_SVE is a build system flag of the same kind; it enables
//    SVE_DO below.
// During run time:
//    we use cpuinfo to identify cpu support and run the proper functions.

#pragma once
// cpuinfo is only needed when at least one dispatching macro is enabled.
#if defined(CAFFE2_PERF_WITH_SVE) || defined(CAFFE2_PERF_WITH_AVX512) || \
    defined(CAFFE2_PERF_WITH_AVX2) || defined(CAFFE2_PERF_WITH_AVX)
#include <cpuinfo.h>
#endif

// DO macros: these should be used in your entry function, similar to foo()
// above, that routes implementations based on CPU capability.
//
// Every *_DO macro takes the entry function's name followed by the arguments
// to forward. An enabled macro expands to a plain brace block that returns
// funcname##__<suffix>(args...) when its run-time check passes and otherwise
// falls through to the next statement; a disabled macro expands to nothing.
// The check result is stored in a function-local `static const bool`, so it
// is computed once per expansion site. Because the expansion is either a bare
// block or empty, use these macros as standalone statements in the entry
// function body, not as the unbraced body of an if/else.

// Unconditional fallback: returns funcname##__base(args...). The expansion
// already ends with its own semicolon.
#define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);

// SVE_DO: dispatches to funcname##__sve when cpuinfo initializes successfully
// and reports ARM SVE support.
#ifdef CAFFE2_PERF_WITH_SVE
#define SVE_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_arm_sve(); \
    if (isDo) { \
      return funcname##__sve(__VA_ARGS__); \
    } \
  }
#else // CAFFE2_PERF_WITH_SVE
#define SVE_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_SVE

// AVX512_DO: dispatches to funcname##__avx512 when cpuinfo initializes
// successfully and reports AVX512F, AVX512DQ and AVX512VL support.
#ifdef CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && \
        cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512dq() && \
        cpuinfo_has_x86_avx512vl(); \
    if (isDo) { \
      return funcname##__avx512(__VA_ARGS__); \
    } \
  }
#else // CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX512

// AVX2_DO: dispatches to funcname##__avx2 when AVX2 is reported.
// AVX2_FMA_DO: dispatches to funcname##__avx2_fma when both AVX2 and FMA3 are
// reported. Both are gated on CAFFE2_PERF_WITH_AVX2.
#ifdef CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2(); \
    if (isDo) { \
      return funcname##__avx2(__VA_ARGS__); \
    } \
  }
#define AVX2_FMA_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx2() && \
        cpuinfo_has_x86_fma3(); \
    if (isDo) { \
      return funcname##__avx2_fma(__VA_ARGS__); \
    } \
  }
#else // CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX2

// AVX_DO: dispatches to funcname##__avx when AVX is reported.
// AVX_F16C_DO: dispatches to funcname##__avx_f16c when both AVX and F16C are
// reported. Both are gated on CAFFE2_PERF_WITH_AVX.
#ifdef CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx(); \
    if (isDo) { \
      return funcname##__avx(__VA_ARGS__); \
    } \
  }
#define AVX_F16C_DO(funcname, ...) \
  { \
    static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx() && \
        cpuinfo_has_x86_f16c(); \
    if (isDo) { \
      return funcname##__avx_f16c(__VA_ARGS__); \
    } \
  }
#else // CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX

#else // defined(TORCH_STABLE_ONLY) || defined(TORCH_TARGET_VERSION)
#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)