以下面这个例子作为教程,实现的功能是 element-wise add(带广播的逐元素加法:把长度为 y 的向量 b 加到 x×y 矩阵 a 的每一行上);
(在 PyTorch 中想调用 CUDA 模块,还需要另外用 C 编写一个接口脚本)
第一步:cuda编程的源文件和头文件
// mathutil_cuda_kernel.cu
// Headers; curand.h is the CUDA-specific one.
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>  // exit()
#include <math.h>
#include <float.h>
#include "mathutil_cuda_kernel.h"

// Computes the launch grid for `n` work items at BLOCK threads per block.
// A 1-D grid is used while the block count fits in 65535; beyond that the
// blocks are folded into a roughly square 2-D grid (x * y >= block count).
dim3 cuda_gridsize(int n)
{
    int k = (n - 1) / BLOCK + 1;  // ceil(n / BLOCK) for n >= 1
    int x = k;
    int y = 1;
    if (x > 65535) {
        x = (int)ceil(sqrt((double)k));
        y = (n - 1) / (x * BLOCK) + 1;
    }
    dim3 d(x, y, 1);
    return d;
}

// Device kernel: one thread per element of `a`.
//
// Treats `a` as an x-by-y row-major array and `b` as a vector of length y,
// and performs a[row][col] += b[col] for every (row, col).
//
// Launch layout: grid from cuda_gridsize(size), BLOCK threads per block,
// no dynamic shared memory. `size` must equal x * y; threads whose flat
// index is >= size exit immediately.
__global__ void broadcast_sum_kernel(float *a, float *b, int x, int y, int size)
{
    // Flatten the (possibly 2-D) grid into a single global thread index.
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (i >= size) return;

    // Split the flat index row-major so that neighbouring threads access
    // neighbouring elements of `a`. Every (row, col) pair is still visited
    // exactly once, so the result matches a column-first decomposition.
    int row = i / y;
    int col = i % y;
    a[IDX2D(row, col, y)] += b[col];
}

// Host-side entry point with C linkage (see mathutil_cuda_kernel.h).
//
// Enqueues broadcast_sum_kernel on `stream` and checks for launch errors.
// The launch is asynchronous; this function does not synchronize the stream.
// `a` and `b` are passed straight to the kernel, so they must be device
// pointers. x * y is computed in `int` and must not overflow.
// On a launch error the message is printed to stderr and the process exits.
void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream)
{
    int size = x * y;
    cudaError_t err;

    // Nothing to add for an empty array; skip the launch entirely.
    if (size == 0) return;

    // Launch the kernel defined above.
    broadcast_sum_kernel<<<cuda_gridsize(size), BLOCK, 0, stream>>>(a, b, x, y, size);

    // Kernel launches do not return a status; launch-configuration errors
    // are reported through cudaGetLastError().
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
#ifndef _MATHUTIL_CUDA_KERNEL#define _MATHUTIL_CUDA_KERNEL#define IDX2D(i, j, dj) (dj * i + j)#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))#define BLOCK 512#define MAX_STREAMS 512#ifdef __cplusplusextern "C" {#endifvoid broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream);#ifdef __cplusplus}#endif#endif
第二步:C编程的源文件和头文件(接口函数)
// mathutil_cuda.c// THC是pytorch底层GPU库#include <THC/THC.h>#include "mathutil_cuda_kernel.h"extern THCState *state;int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y){ float *a = THCudaTensor_data(state, a_tensor); float *b = THCudaTensor_data(state, b_tensor); cudaStream_t stream = THCState_getCurrentStream(state); // 这里调用之前在cuda中编写的接口函数 broadcast_sum_cuda(a, b, x, y, stream); return 1;}
// Prototype of the C interface function (presumably the content of
// mathutil_cuda.h, the header listed in the build script -- TODO confirm).
// Takes two CUDA float tensors plus the dimensions x and y, and returns an int
// status; the implementation shown in this tutorial always returns 1.
int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y);
第三步:编译,先编译cuda模块,再编译接口函数模块(不能放在一起同时编译)
# Compile (without linking, -c) the CUDA source into mathutil_cuda_kernel.cu.o.
# -Xcompiler -fPIC forwards -fPIC to the host compiler so the object can be
# linked into the shared extension module; -x cu forces CUDA-source handling.
# NOTE(review): presumably run from inside src/, since the build script looks
# for src/mathutil_cuda_kernel.cu.o; adjust -arch=sm_52 to match your GPU.
nvcc -c -o mathutil_cuda_kernel.cu.o mathutil_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52
import os
import torch
from torch.utils.ffi import create_extension

# Directory that contains this build script (symlinks resolved).
this_file = os.path.dirname(os.path.realpath(__file__))

sources, headers, defines = [], [], []

# The C wrapper and its header are only added when a CUDA device is usable.
with_cuda = torch.cuda.is_available()
if with_cuda:
    print('Including CUDA code.')
    sources.append('src/mathutil_cuda.c')
    headers.append('src/mathutil_cuda.h')
    defines.append(('WITH_CUDA', None))

# Absolute location of the pre-compiled .o file produced by nvcc.
extra_objects = [
    os.path.join(this_file, 'src/mathutil_cuda_kernel.cu.o'),
]

ffi = create_extension(
    '_ext.cuda_util',
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects,
)

if __name__ == '__main__':
    ffi.build()
新闻热点
疑难解答