#include #include #include #include #define OCL_RET_ERR(retval, msg) \ do { \ if (retval != CL_SUCCESS) \ { \ fprintf(stderr, "%s\n", msg); \ exit(retval); \ } \ } while (0) \ // OpenCL kernel. Each work item takes care of one element of c const char *kernelSource = "\n" \ "__kernel void vecAdd( __global float *a, \n" \ " __global float *b, \n" \ " __global float *c, \n" \ " const unsigned int n) \n" \ "{ \n" \ " //Get our global thread ID \n" \ " int id = get_global_id(0); \n" \ " \n" \ " //Make sure we do not go out of bounds \n" \ " if (id < n) \n" \ " c[id] = a[id] + b[id]; \n" \ "} \n" \ "\n" ; const char *kernelSource_atom = "\n" \ "inline void atomicAdd_g_f(volatile __global float *addr, float val) \n" \ "{ \n" \ " union{ \n" \ " unsigned int u32; \n" \ " float f32; \n" \ " } next, expected, current; \n" \ " current.f32 = *addr; \n" \ " do{ \n" \ " expected.f32 = current.f32; \n" \ " next.f32 = expected.f32 + val; \n" \ " current.u32 = atomic_cmpxchg( (volatile __global unsigned int *)addr, expected.u32, next.u32); \n" \ " } while( current.u32 != expected.u32 ); \n" \ "} \n" \ " \n" \ "__kernel void vecAdd( __global float *a, \n" \ " __global float *b, \n" \ " __global float *c, \n" \ " const unsigned int n) \n" \ "{ \n" \ " //Get our global thread ID \n" \ " int id = get_global_id(0); \n" \ " \n" \ " //Make sure we do not go out of bounds \n" \ " if (id < n) \n" \ " { \n" \ " c[id] = 0.0f; \n" \ " atomicAdd_g_f(&c[id], a[id] + b[id]); \n" \ " } \n"\ "} \n" \ "\n" ; #define NREPEAT 10000 int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 1000000; // Host input vectors float *h_a; float *h_b; // Host output vector float *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(float); // Allocate memory for each vector on host h_a = (float*)malloc(bytes); h_b = (float*)malloc(bytes); h_c = (float*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); OCL_RET_ERR(err, "clCreateProgramWithSource error"); // Build the program executable //err = clBuildProgram(program, 0, NULL, "-cl-nv-verbose", NULL, NULL); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //OCL_RET_ERR(err, "clBuildProgram error"); size_t log_size = 0; char *build_log = (char*)malloc(log_size); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); if (log_size > 2) { clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); fprintf(stderr, "Build log: %s\n", build_log); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); OCL_RET_ERR(err, "clCreateKernel error"); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); OCL_RET_ERR(err, "clEnqueueWriteBuffer error"); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); OCL_RET_ERR(err, "clSetKernelArg error"); // Execute the kernel over the entire range of the data set for (i = 0; i < NREPEAT; i++) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); } OCL_RET_ERR(err, "clEnqueueNDRangeKernel error"); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error float sum = 0; for(i=0; i