compile.c 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. /* clang-format off */
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <cuda.h>
  // Helper to check for CUDA errors: evaluates a driver-API call once and hands
  // the resulting CUresult to gpuAssert together with the call site.
  // The expansion is kept as a plain brace block so that call sites written
  // without a trailing semicolon keep compiling.
  // The closing line deliberately has no line continuation; a continuation
  // there would pull the following source line into the macro body.
  #define CUDA_CHECK(ans) {{\
    gpuAssert((ans), __FILE__, __LINE__);\
  }}
  11. static inline void gpuAssert(CUresult code, const char *file, int line) {{
  12. if (code != CUDA_SUCCESS) {{
  13. const char *prefix = "Triton Error [CUDA]: ";
  14. const char *str;
  15. cuGetErrorString(code, &str);
  16. char err[1024] = {{0}};
  17. strcat(err, prefix);
  18. strcat(err, str);
  19. printf("%s\\n", err);
  20. exit(code);
  21. }}
  22. }}
  23. // globals
  24. #define CUBIN_NAME {kernel_name}_cubin
  25. CUmodule {kernel_name}_mod = NULL;
  26. CUfunction {kernel_name}_func = NULL;
  27. unsigned char CUBIN_NAME[{bin_size}] = {{ {bin_data} }};
  28. void unload_{kernel_name}(void) {{
  29. CUDA_CHECK(cuModuleUnload({kernel_name}_mod));
  30. }}
  31. // TODO: some code duplication with `runtime/backend/cuda.c`
  32. void load_{kernel_name}() {{
  33. int dev = 0;
  34. void *bin = (void *)&CUBIN_NAME;
  35. int shared = {shared};
  36. CUDA_CHECK(cuModuleLoadData(&{kernel_name}_mod, bin));
  37. CUDA_CHECK(cuModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
  38. // set dynamic shared memory if necessary
  39. int shared_optin;
  40. CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev));
  41. if (shared > 49152 && shared_optin > 49152) {{
  42. CUDA_CHECK(cuFuncSetCacheConfig({kernel_name}_func, CU_FUNC_CACHE_PREFER_SHARED));
  43. CUDA_CHECK(cuFuncSetAttribute({kernel_name}_func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin))
  44. }}
  45. }}
  46. /*
  47. {kernel_docstring}
  48. */
  49. CUresult {kernel_name}(CUstream stream, {signature}) {{
  50. if ({kernel_name}_func == NULL)
  51. load_{kernel_name}();
  52. unsigned int gX = {gridX};
  53. unsigned int gY = {gridY};
  54. unsigned int gZ = {gridZ};
  55. CUdeviceptr global_scratch = 0;
  56. CUdeviceptr profile_scratch = 0;
  57. void *args[{num_args}] = {{ {arg_pointers} }};
  58. // TODO: shared memory
  59. if(gX * gY * gZ > 0)
  60. return cuLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * {warp_size}, 1, 1, {shared}, stream, args, NULL);
  61. return (CUresult)NULL;
  62. }}