#include "cuda_runtime.h"
#include <highgui.hpp>

#include <cstdio>

using namespace cv;

#define DIM 600  // image width and height, in pixels
#define PI 3.1415926535897932f
// Edge length of the kernel's static shared-memory tile; blocks may not be larger than this.
#define TILE_DIM 16

/*
 * Fills an interleaved 3-bytes-per-pixel image with a 2D sine pattern in channel 1
 * (channels 0 and 2 are set to 0).
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The row stride is taken
 * to be blockDim.x * gridDim.x, so the grid must cover the image exactly.
 * Shared memory: TILE_DIM * TILE_DIM floats (static). Blocks larger than
 * TILE_DIM x TILE_DIM are rejected (the kernel writes nothing).
 *
 * Each thread stores its own pixel value in the shared tile and, after the block
 * barrier, outputs the value computed by the thread mirrored through the block
 * centre. That cross-thread read is what makes the __syncthreads() call mandatory.
 *
 * ptr: device buffer of at least 3 * (blockDim.x * gridDim.x) * (blockDim.y * gridDim.y) bytes.
 */
__global__ void kernel(unsigned char *ptr)
{
    // Uniform across the block: either every thread returns here or none does,
    // so the barrier below is never reached by only part of a block.
    if (blockDim.x > TILE_DIM || blockDim.y > TILE_DIM)
        return;

    // map from blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    __shared__ float sharedMem[TILE_DIM][TILE_DIM];
    const float period = 128.0f;

    sharedMem[threadIdx.x][threadIdx.y] =
        255.0f * (sinf(x * 2.0f * PI / period) + 1.0f) *
                 (sinf(y * 2.0f * PI / period) + 1.0f) / 4.0f;

    // All writes to the tile must be complete before any thread reads another
    // thread's entry.
    __syncthreads();

    // Mirror inside the actual block extent. The previous hard-coded "15 - threadIdx"
    // was only valid for 16x16 blocks and read uninitialized shared memory for the
    // 10x10 blocks launched by main().
    const unsigned int mirrorX = blockDim.x - 1 - threadIdx.x;
    const unsigned int mirrorY = blockDim.y - 1 - threadIdx.y;

    ptr[offset * 3 + 0] = 0;
    ptr[offset * 3 + 1] = static_cast<unsigned char>(sharedMem[mirrorX][mirrorY]);
    ptr[offset * 3 + 2] = 0;
}

// globals needed by the update routine
struct DataBlock
{
    unsigned char *dev_bitmap;
};

// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Renders the pattern on the GPU, copies it into an OpenCV image and displays it.
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main(void)
{
    DataBlock data;
    Mat image = Mat(DIM, DIM, CV_8UC3, Scalar::all(0));
    const size_t imageBytes = 3 * static_cast<size_t>(image.cols) * image.rows;

    unsigned char *dev_bitmap = NULL;
    if (!cudaOk(cudaMalloc((void**)&dev_bitmap, imageBytes), "cudaMalloc"))
        return 1;
    data.dev_bitmap = dev_bitmap;

    // (DIM / 10) x (DIM / 10) blocks of 10 x 10 threads: one thread per pixel.
    dim3 grid(DIM / 10, DIM / 10);
    dim3 block(10, 10);
    kernel<<<grid, block>>>(dev_bitmap);

    // A launch does not return a status itself; query it explicitly.
    bool ok = cudaOk(cudaGetLastError(), "kernel launch");
    // The blocking copy also waits for the kernel and surfaces execution errors.
    ok = ok && cudaOk(cudaMemcpy(image.data, dev_bitmap, imageBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy");
    // Release the device buffer on both the success and the failure path.
    ok = cudaOk(cudaFree(dev_bitmap), "cudaFree") && ok;
    if (!ok)
        return 1;

    imshow("__share__ and __syncthreads()", image);
    waitKey();
    return 0;
}

// Without the __syncthreads() barrier, threads of the same block may read sharedMem
// before the other threads have written it, so the values they obtain can differ;
// the generated image then shows scattered noise pixels.
// With the __syncthreads() barrier, every thread of a block must finish the code that
// precedes the barrier before any of them continues past it, so all threads read
// consistent values from sharedMem and the image is a regular, noise-free pattern.
新闻热点
疑难解答