// // File: hello.c // // Abstract: A simple "Hello World" compute example showing basic usage of OpenCL which // calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of // floating point values. // // // Version: <1.0> // // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") // in consideration of your agreement to the following terms, and your use, // installation, modification or redistribution of this Apple software // constitutes acceptance of these terms. If you do not agree with these // terms, please do not use, install, modify or redistribute this Apple // software. // // In consideration of your agreement to abide by the following terms, and // subject to these terms, Apple grants you a personal, non - exclusive // license, under Apple's copyrights in this original Apple software ( the // "Apple Software" ), to use, reproduce, modify and redistribute the Apple // Software, with or without modifications, in source and / or binary forms; // provided that if you redistribute the Apple Software in its entirety and // without modifications, you must retain this notice and the following text // and disclaimers in all such redistributions of the Apple Software. Neither // the name, trademarks, service marks or logos of Apple Inc. may be used to // endorse or promote products derived from the Apple Software without specific // prior written permission from Apple. Except as expressly stated in this // notice, no other rights or licenses, express or implied, are granted by // Apple herein, including but not limited to any patent rights that may be // infringed by your derivative works or by other works in which the Apple // Software may be incorporated. // // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION // ALONE OR IN COMBINATION WITH YOUR PRODUCTS. // // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright ( C ) 2008 Apple Inc. All Rights Reserved. // //////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include /* time_t, struct tm, difftime, time, mktime */ //////////////////////////////////////////////////////////////////////////////// // Use a static data size for simplicity // #define DATA_SIZE (65536) #ifndef INFINITY #define INFINITY 1.0/0 #endif #ifndef M_PI #define M_PI 3.141592653589793 #endif //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the square of an input array // const char *KernelSource = "\n" \ "#ifndef INFINITY \n" \ "#define INFINITY 1.0/0 \n" \ "#endif \n" \ "#ifndef M_PI \n" \ "#define M_PI 3.14159265358979323846 \n" \ "#endif \n" \ "void dct_ii(float *x, float *X) { \n" \ " float sum = 0.; \n" \ " for (int n = 0; n < 128; ++n) { \n" \ " sum += x[n]; \n" \ " } \n" \ " X[0] = sum; \n" \ " for (uint k = 1; k < 128; ++k) { \n" \ " sum = 0.; \n" \ " for (int n = 0; n < 128; ++n) { \n" \ " sum += x[n] * cos((float)(M_PI * (n + .5) * k / 128)); \n" \ " } \n" \ " X[k] = sum; \n" \ " } \n" \ "} \n" \ " \n" \ "__kernel void test_dct( __global float *gdata, __global float *gres){ \n" \ " uint gid = get_global_id(0); \n" \ " uint idx = gid*128 ; \n" \ " float data[128]; \n" \ " float res[128]; \n" \ " for(uint i=0; i<128; i++){ \n" \ " data[i] = gdata[idx+i]; \n" \ " } \n" \ " //for(uint i=5; i<=128; i++){ \n" \ " dct_ii(data, res); \n" \ " //} \n" \ " for(uint i=0; i<128; i++){ \n" \ " gres[idx+i] = res[i]; \n" \ " } \n" \ " \n" \ "} \n" \ "\n"; //////////////////////////////////////////////////////////////////////////////// void dct_ii(int N, float *x, float *X) { float sum = 0.; for (int n = 0; n < N; ++n) { sum += x[n]; } X[0] = sum; for (uint k = 1; k < N; ++k) { sum = 0.; for (int n = 0; n < N; ++n) { sum += x[n] * cos((float)(M_PI * (n + .5) * k / N)); } X[k] = sum; } } unsigned long int get_ms(){ struct timeval tv; gettimeofday(&tv, NULL); return (unsigned long int)(tv.tv_sec) * 1000 + (unsigned long int)(tv.tv_usec) / 1000; } int main( int argc, char *argv[] ) { int err; // error code returned from api calls unsigned long int timer_1st; unsigned long int timer_2nd; unsigned long int seconds; float *data; // original data set given to device float *results; // results returned from device float *hresults; // results returned from host size_t count; unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation size_t k; size_t i; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array cl_platform_id platforms[8]; //an array to hold the IDs of all the platforms, hopefuly there won't be more than 8 cl_uint num_platforms; //this number will hold the number of platforms on this machine char vendor[1024]; //this strirng will hold a platforms vendor cl_device_id devices[8]; //this variable holds the number of devices for each platform, hopefully it won't be more than 8 per platform cl_uint num_devices; //this number will hold the number of devices on this machine char deviceName[1024]; //this string will hold the devices name cl_uint numberOfCores; //this variable holds the number of cores of on a device cl_long amountOfMemory; //this variable holds the amount of memory on a device cl_uint clockFreq; //this variable holds the clock frequency of a device cl_ulong maxAlocatableMem; //this variable holds the maximum allocatable memory cl_ulong localMem; //this variable holds local memory for a device cl_bool available; //this variable holds if the device is available int pid; //Platform id // Init data arrays and count data = malloc(sizeof(float)*DATA_SIZE*128); // original data set given to device results = malloc(sizeof(float)*DATA_SIZE*128); // results returned from device hresults = malloc(sizeof(float)*DATA_SIZE*128); // host results count = DATA_SIZE; // Fill our data set with random float values // for(i = 0; i < DATA_SIZE*128; i++) data[i] = rand() / (float)RAND_MAX; // Compute host version of dct timer_1st = get_ms(); for(i = 0; i < DATA_SIZE; i++) dct_ii(128, &data[i*128], &hresults[i*128]); printf("%f seconds to calculate host values\n", (get_ms() - timer_1st)/1000.0); // Connect to a compute device // clGetPlatformIDs(3, platforms, &num_platforms); if (argc<2){ printf("Platform id is required. Choose from 0-2"); return 1; } if (sscanf (argv[1], "%i", &pid)!=1) { printf ("error - not an integer"); return 1; } err = clGetDeviceIDs(platforms[pid], CL_DEVICE_TYPE_ALL, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { printf("Succeeded to create a device group!\n"); } //scan in device information clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL); clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numberOfCores), &numberOfCores, NULL); clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(amountOfMemory), &amountOfMemory, NULL); clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clockFreq), &clockFreq, NULL); clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlocatableMem), &maxAlocatableMem, NULL); clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMem), &localMem, NULL); clGetDeviceInfo(device_id, CL_DEVICE_AVAILABLE, sizeof(available), &available, NULL); //print out device information printf("\tDevice: %u\n", pid); printf("\t\tName:\t\t\t\t%s\n", deviceName); printf("\t\tVendor:\t\t\t\t%s\n", vendor); printf("\t\tAvailable:\t\t\t%s\n", available ? "Yes" : "No"); printf("\t\tCompute Units:\t\t\t%u\n", numberOfCores); printf("\t\tClock Frequency:\t\t%u mHz\n", clockFreq); printf("\t\tGlobal Memory:\t\t\t%0.00f mb\n", (double)amountOfMemory/1048576); printf("\t\tMax Allocateable Memory:\t%0.00f mb\n", (double)maxAlocatableMem/1048576); printf("\t\tLocal Memory:\t\t\t%u kb\n\n", (unsigned int)localMem); // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } else { printf("Succeeded to create a compute context!\n"); } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } else { printf("Succeeded to create a command commands!\n"); } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } else { printf("Succeeded to create compute program!\n"); } // Build the program executable // err = clBuildProgram(program, 0, NULL, "-cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math", NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } else { printf("Succeeded to create program executable!\n"); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "test_dct", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } else { printf("Succeeded to create compute kernel!\n"); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE*128, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE*128, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * DATA_SIZE*128, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } seconds = 0; for(k = 0; k<9; k++){ // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; timer_1st = get_ms(); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &count, NULL, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); seconds += get_ms() - timer_1st; // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE*128, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < DATA_SIZE*128; i++) { if(fabs(results[i] - hresults[i]) < (float) 0.0001) correct++; //if(i<10) printf("%f vs %f\n", results[i], hresults[i]); } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, i); } printf("%f average seconds to calculate gpu values\n", seconds/9000.0); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }