Julia集中的元素都是经过简单的迭代计算得到的,很适合用CUDA进行加速。对一个600*600的图像,需要进行360000次迭代计算,所以在CUDA中创建了600*600个线程块(block),每个线程块包含1个线程,并行执行360000次运行,图像的创建和显示通过OpenCV实现:
#include "cuda_runtime.h"#include <highgui.hpp>using namespace cv;#define DIM 600 //图像长宽struct cuComplex{ float r; float i; __device__ cuComplex(float a, float b) : r(a), i(b) {} __device__ float magnitude2(void) { return r * r + i * i; } __device__ cuComplex Operator*(const cuComplex& a) { return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i); } __device__ cuComplex operator+(const cuComplex& a) { return cuComplex(r + a.r, i + a.i); }};__device__ int julia(int x, int y){ const float scale = 1.5; float jx = scale * (float)(DIM / 2 - x) / (DIM / 2); float jy = scale * (float)(DIM / 2 - y) / (DIM / 2); cuComplex c(0.25, 0.010); cuComplex a(jx, jy); int i = 0; for (i = 0; i < 200; i++) { a = a * a + c; if (a.magnitude2() > 1000) return 0; } return 1;}__global__ void kernel(unsigned char *ptr){ // map from blockIdx to pixel position int x = blockIdx.x; int y = blockIdx.y; int offset = x + y * gridDim.x; // now calculate the value at that position int juliaValue = julia(x, y); ptr[offset * 3 + 0] = 0; ptr[offset * 3 + 1] = 0; ptr[offset * 3 + 2] = 255 * juliaValue;}// globals needed by the update routinestruct DataBlock{ unsigned char *dev_bitmap;};int main(void){ DataBlock data; cudaError_t error; Mat image = Mat(DIM, DIM, CV_8UC3, Scalar::all(0)); data.dev_bitmap = image.data; unsigned char *dev_bitmap; error = cudaMalloc((void**)&dev_bitmap, 3 * image.cols*image.rows); data.dev_bitmap = dev_bitmap; dim3 grid(DIM, DIM); kernel << <grid, 1 >> > (dev_bitmap); error = cudaMemcpy(image.data, dev_bitmap, 3 * image.cols*image.rows, cudaMemcpyDeviceToHost); error = cudaFree(dev_bitmap); imshow("CUDA For Julia | c(0.25, 0.010)", image); waitKey();}c(-0.8,0.156):
c(-0.85,0.06):
c(-0.305,0.60):
c(0.25,0.010):
新闻热点
疑难解答