| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558 |
- #define _CRT_SECURE_NO_WARNINGS
- #include "cuda.h"
- #ifndef _WIN32
- #include <dlfcn.h>
- #else
- #define WIN32_LEAN_AND_MEAN
- #include <windows.h>
- #endif
- #include <stdbool.h>
- #include <stdio.h>
- #include <stdlib.h>
- #define PY_SSIZE_T_CLEAN
- #include <Python.h>
- typedef struct {
- PyObject_HEAD _Alignas(128) CUtensorMap tensorMap;
- } PyCUtensorMapObject;
- // Raises a Python exception and returns false if code is not CUDA_SUCCESS.
- static bool gpuAssert(CUresult code, const char *file, int line) {
- if (code == CUDA_SUCCESS)
- return true;
- const char *prefix = "Triton Error [CUDA]: ";
- const char *str;
- cuGetErrorString(code, &str);
- char err[1024] = {0};
- strcat(err, prefix);
- strcat(err, str);
- PyGILState_STATE gil_state;
- gil_state = PyGILState_Ensure();
- PyErr_SetString(PyExc_RuntimeError, err);
- PyGILState_Release(gil_state);
- return false;
- }
// Checks a CUDA driver call made while the GIL is held, i.e. *outside* a
// Py_{BEGIN,END}_ALLOW_THREADS block. On failure gpuAssert has already set the
// Python exception and control jumps to the `cleanup:` label, which the
// enclosing function must provide.
#define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
  do {                                                                         \
    const bool triton_cuda_ok_ = gpuAssert((ans), __FILE__, __LINE__);         \
    if (!triton_cuda_ok_)                                                      \
      goto cleanup;                                                            \
  } while (0)
// Checks a CUDA driver call made *inside* a Py_{BEGIN,END}_ALLOW_THREADS
// block. On failure gpuAssert has set the Python exception; the saved thread
// state (`_save`, declared by Py_BEGIN_ALLOW_THREADS) is restored before the
// enclosing function returns NULL.
#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans)                          \
  do {                                                                         \
    if (!gpuAssert((ans), __FILE__, __LINE__)) {                               \
      Py_BLOCK_THREADS                                                         \
      return NULL;                                                             \
    }                                                                          \
  } while (0)
// Lazily fills in `funcPointer` by calling `initializerFunction()` the first
// time it is found to be NULL. If the initializer also yields NULL (for
// example because the symbol is missing from an old CUDA driver), control
// jumps to the `cleanup:` label of the enclosing function.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
  do {                                                                         \
    if ((funcPointer) == NULL &&                                               \
        ((funcPointer) = (initializerFunction)()) == NULL)                     \
      goto cleanup;                                                            \
  } while (0)
- static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
- int device_id;
- if (!PyArg_ParseTuple(args, "i", &device_id))
- return NULL;
- // Get device handle
- CUdevice device;
- cuDeviceGet(&device, device_id);
- // create a struct to hold device properties
- int max_shared_mem;
- int max_num_regs;
- int multiprocessor_count;
- int warp_size;
- int sm_clock_rate;
- int mem_clock_rate;
- int mem_bus_width;
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
- device));
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
- CUDA_CHECK_AND_RETURN_NULL(
- cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
- CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
- &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));
- return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
- max_shared_mem, "max_num_regs", max_num_regs,
- "multiprocessor_count", multiprocessor_count, "warpSize",
- warp_size, "sm_clock_rate", sm_clock_rate,
- "mem_clock_rate", mem_clock_rate, "mem_bus_width",
- mem_bus_width);
- cleanup:
- return NULL;
- }
- static PyObject *loadBinary(PyObject *self, PyObject *args) {
- const char *name;
- const char *data;
- Py_ssize_t data_size;
- int shared;
- int device;
- if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
- &device)) {
- return NULL;
- }
- CUfunction fun;
- CUmodule mod;
- int32_t n_regs = 0;
- int32_t n_spills = 0;
- int32_t n_max_threads = 0;
- // create driver handles
- CUcontext pctx = 0;
- Py_BEGIN_ALLOW_THREADS;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
- if (!pctx) {
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuDevicePrimaryCtxRetain(&pctx, device));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
- }
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuModuleGetFunction(&fun, mod, name));
- // get allocated registers and spilled registers from the function
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
- n_spills /= 4;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
- &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
- // set dynamic shared memory if necessary
- int shared_optin;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
- &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
- device));
- if (shared > 49152 && shared_optin > 49152) {
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
- int shared_total, shared_static;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
- &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
- device));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
- &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
- shared_optin - shared_static));
- }
- Py_END_ALLOW_THREADS;
- if (PyErr_Occurred()) {
- return NULL;
- }
- return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
- n_spills, n_max_threads);
- }
- typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
- int *numClusters, CUfunction func, const CUlaunchConfig *config);
- typedef CUresult (*cuTensorMapEncodeTiled_t)(
- CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
- cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
- const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
- const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
- CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
- CUtensorMapFloatOOBfill oobFill);
- #ifndef _WIN32
- #define defineGetFunctionHandle(name, symbolName) \
- static symbolName##_t name() { \
- /* Open the shared library */ \
- void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY); \
- if (!libHandle) { \
- PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1"); \
- return NULL; \
- } \
- /* Clear any existing error */ \
- dlerror(); \
- symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
- /* Check for errors */ \
- const char *err = dlerror(); \
- if (err) { \
- PyErr_SetString(PyExc_RuntimeError, \
- "Failed to retrieve " #symbolName " from libcuda.so.1"); \
- dlclose(libHandle); \
- return NULL; \
- } \
- return funcHandle; \
- }
- #else
- #define defineGetFunctionHandle(name, symbolName) \
- static symbolName##_t name() { \
- /* Open the shared library */ \
- HMODULE handle = LoadLibraryA("nvcuda.dll"); \
- if (!handle) { \
- PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll"); \
- return NULL; \
- } \
- symbolName##_t funcHandle = \
- (symbolName##_t)GetProcAddress((HMODULE)handle, #symbolName); \
- /* Check for errors */ \
- long err = GetLastError(); \
- if (err) { \
- PyErr_SetString(PyExc_RuntimeError, \
- "Failed to retrieve " #symbolName " from nvcuda.dll"); \
- return NULL; \
- } \
- return funcHandle; \
- }
- #endif
- defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
- cuOccupancyMaxActiveClusters);
- defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
- cuTensorMapEncodeTiled);
- static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
- int clusterDim = -1, maxActiveClusters = -1;
- int shared = 0;
- CUfunction func;
- if (!PyArg_ParseTuple(args, "Kii", &func, &shared, &clusterDim)) {
- return NULL;
- }
- // Let each SM have one block
- int maxActiveBlocks = 1;
- Py_BEGIN_ALLOW_THREADS;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
- func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
- Py_END_ALLOW_THREADS;
- CUlaunchAttribute launchAttr[1];
- launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
- launchAttr[0].value.clusterDim.x = clusterDim;
- launchAttr[0].value.clusterDim.y = 1;
- launchAttr[0].value.clusterDim.z = 1;
- CUlaunchConfig config;
- config.gridDimX = clusterDim * maxActiveBlocks;
- config.gridDimY = 1;
- config.gridDimZ = 1;
- config.blockDimX = 128;
- config.blockDimY = 1;
- config.blockDimZ = 1;
- config.sharedMemBytes = shared;
- config.hStream = 0;
- config.numAttrs = 1;
- config.attrs = launchAttr;
- static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
- INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
- getCuOccupancyMaxActiveClustersHandle);
- Py_BEGIN_ALLOW_THREADS;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
- func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
- Py_END_ALLOW_THREADS;
- return PyLong_FromLong(maxActiveClusters);
- cleanup:
- return NULL;
- }
- static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
- long size;
- if (!PyArg_ParseTuple(args, "l", &size)) {
- return NULL;
- }
- if (size < 0) {
- PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
- return NULL;
- }
- Py_BEGIN_ALLOW_THREADS;
- // Ensure we have an active context.
- CUcontext ctx = NULL;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
- if (!ctx) {
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
- }
- // We can't set the fifo size after running a kernel that calls printf. This
- // is true even if the set() call is a nop and the new size is the same as the
- // old size.
- //
- // This is unfriendly, so check if the old size matches the new size, and skip
- // the set() call if so.
- size_t oldSize = 0;
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
- if (oldSize != size) {
- CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
- cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
- }
- Py_END_ALLOW_THREADS;
- Py_RETURN_NONE;
- }
- static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
- PyCUtensorMapObject *self = NULL;
- void *mem = NULL;
- size_t size = type->tp_basicsize;
- #ifdef _WIN32
- mem = _aligned_malloc(size, 128);
- if (mem == NULL) {
- #else
- if (posix_memalign(&mem, 128, size) != 0) {
- #endif
- PyErr_NoMemory();
- return NULL;
- }
- self = (PyCUtensorMapObject *)mem;
- PyObject_INIT(self, type);
- return (PyObject *)self;
- }
- static void PyCUtensorMap_dealloc(PyObject *self) {
- Py_TYPE(self)->tp_free(self);
- }
// tp_free for PyCUtensorMapType: releases memory obtained from
// PyCUtensorMap_alloc with the deallocator matching the platform's aligned
// allocator.
static void PyCUtensorMap_free(void *ptr) {
#ifndef _WIN32
  free(ptr);
#else
  _aligned_free(ptr);
#endif
}
- // clang-format off
- static PyTypeObject PyCUtensorMapType = {
- PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "triton.backends.nvidia.PyCUtensorMap",
- .tp_basicsize = sizeof(PyCUtensorMapObject),
- .tp_itemsize = 0,
- .tp_flags = Py_TPFLAGS_DEFAULT,
- .tp_doc = "<PyCUtensorMap object>",
- .tp_new = PyType_GenericNew,
- .tp_alloc = PyCUtensorMap_alloc,
- .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
- .tp_free = PyCUtensorMap_free,
- };
- // clang-format on
- static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
- unsigned long long global_address;
- int swizzle;
- int elemSize;
- int elemType;
- PyObject *blockSize;
- PyObject *shape;
- PyObject *strides;
- int padding;
- if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
- &elemType, &blockSize, &shape, &strides, &padding)) {
- return NULL;
- }
- PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
- (PyObject *)&PyCUtensorMapType, NULL);
- if (!desc) {
- return NULL;
- }
- PyObject *blockSizeFast = NULL;
- PyObject *shapeFast = NULL;
- PyObject *stridesFast = NULL;
- uint32_t blockSizeInt[5];
- uint64_t shapeInt[5];
- uint64_t stridesLL[5];
- blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
- if (!blockSizeFast)
- goto cleanup;
- int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
- for (int i = 0; i < rank; ++i) {
- PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
- if (!PyLong_Check(item)) {
- PyErr_SetString(PyExc_TypeError, "block size must be an int");
- goto cleanup;
- }
- blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
- }
- shapeFast = PySequence_Fast(shape, "shape must be a sequence");
- if (!shapeFast)
- goto cleanup;
- if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
- PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
- goto cleanup;
- }
- for (int i = 0; i < rank; ++i) {
- PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
- if (!PyLong_Check(item)) {
- PyErr_SetString(PyExc_TypeError, "shape must be an int");
- goto cleanup;
- }
- shapeInt[rank - i - 1] = PyLong_AsLong(item);
- }
- stridesFast = PySequence_Fast(strides, "strides must be a sequence");
- if (!stridesFast)
- goto cleanup;
- if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
- PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
- goto cleanup;
- }
- for (int i = 0; i + 1 < rank; ++i) {
- PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
- if (!PyLong_Check(item)) {
- PyErr_SetString(PyExc_TypeError, "shape must be an int");
- goto cleanup;
- }
- stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
- }
- stridesLL[rank - 1] =
- shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);
- Py_DECREF(blockSizeFast);
- blockSizeFast = NULL;
- Py_DECREF(shapeFast);
- shapeFast = NULL;
- Py_DECREF(stridesFast);
- stridesFast = NULL;
- CUtensorMapFloatOOBfill fill =
- (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
- : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
- uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
- static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
- INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
- getCuTensorMapEncodeTiledHandle);
- CUresult res = cuTensorMapEncodeTiled(
- &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
- stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
- swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
- if (res != CUDA_SUCCESS) {
- const char *str;
- cuGetErrorString(res, &str);
- char err[4096] = {0};
- size_t off = 0;
- off += snprintf(
- err + off, sizeof(err) - off,
- "Triton Error [CUDA]: Failed to create tensor map descriptor: %s\n",
- str ? str : "Unknown error");
- off += snprintf(err + off, sizeof(err) - off,
- "elemType=%d rank=%d global_address=0x%llx elemSize=%d "
- "swizzle=%d padding=%d\n",
- elemType, rank, (unsigned long long)global_address,
- elemSize, swizzle, padding);
- off += snprintf(err + off, sizeof(err) - off, "shape=[");
- for (int i = 0; i < rank; ++i) {
- off +=
- snprintf(err + off, sizeof(err) - off, "%llu%s",
- (unsigned long long)shapeInt[i], (i + 1 < rank) ? ", " : "");
- }
- off += snprintf(err + off, sizeof(err) - off, "]\n");
- off += snprintf(err + off, sizeof(err) - off, "strides=[");
- for (int i = 0; i < rank; ++i) {
- off += snprintf(err + off, sizeof(err) - off, "%llu%s",
- (unsigned long long)stridesLL[i],
- (i + 1 < rank) ? ", " : "");
- }
- off += snprintf(err + off, sizeof(err) - off, "]\n");
- off += snprintf(err + off, sizeof(err) - off, "blockSize=[");
- for (int i = 0; i < rank; ++i) {
- off += snprintf(err + off, sizeof(err) - off, "%u%s",
- (unsigned)blockSizeInt[i], (i + 1 < rank) ? ", " : "");
- }
- off += snprintf(err + off, sizeof(err) - off, "] elementStrides=[");
- for (int i = 0; i < rank; ++i) {
- off += snprintf(err + off, sizeof(err) - off, "%u%s",
- (unsigned)elementStrides[i], (i + 1 < rank) ? ", " : "");
- }
- off += snprintf(err + off, sizeof(err) - off, "]\n");
- PyErr_SetString(PyExc_RuntimeError, err);
- goto cleanup;
- }
- return (PyObject *)desc;
- cleanup:
- Py_XDECREF(blockSizeFast);
- Py_XDECREF(shapeFast);
- Py_XDECREF(stridesFast);
- Py_XDECREF(desc);
- return NULL;
- }
- static PyMethodDef ModuleMethods[] = {
- {"load_binary", loadBinary, METH_VARARGS,
- "Load provided cubin into CUDA driver"},
- {"get_device_properties", getDeviceProperties, METH_VARARGS,
- "Get the properties for a given device"},
- {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
- "Python interface for cuOccupancyMaxActiveClusters function"},
- {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
- "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
- "controls how many bytes can be streamed from kernels before data starts "
- "being dropped. This inherits all the limitations of this call; in "
- "particular it's an error to change this value after launching any kernel "
- "that calls printf()."},
- {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS, "doc"},
- {NULL, NULL, 0, NULL} // sentinel
- };
- static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
- NULL, // documentation
- -1, // size
- ModuleMethods};
- PyMODINIT_FUNC PyInit_cuda_utils(void) {
- if (PyType_Ready(&PyCUtensorMapType) < 0) {
- return NULL;
- }
- PyObject *m = PyModule_Create(&ModuleDef);
- if (m == NULL) {
- return NULL;
- }
- PyModule_AddFunctions(m, ModuleMethods);
- Py_INCREF(&PyCUtensorMapType);
- PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);
- return m;
- }
|