The code below illustrates cube calculation using OpenCL. This OpenCL example is structured as follows:
1.) Initially the OpenCL kernel is written in const char *KernelSource.
2.) At the start of main we define all the requisite OpenCL related and other normal variables.
3.) Next in this OpenCL example, we set up the environment required for running the kernel using functions like clGetDeviceIDs(), clCreateContext(), clCreateCommandQueue().
4.) Then we create the program from the source held in const char *KernelSource using the function clCreateProgramWithSource(), build it with clBuildProgram(), and create our kernel object using clCreateKernel().
5.) Then we allocate memory for the input and output on the selected OpenCL device using the function clCreateBuffer().
6.) Then we write our input data into the allocated memory using the function clEnqueueWriteBuffer() and set the arguments for the compute kernel with clSetKernelArg() as shown in the OpenCL example below.
7.) We get the maximum work group size for executing the kernel on the device through clGetKernelWorkGroupInfo() followed by executing the kernel over the entire range of our 1d input data set using the maximum number of work group items for this device through clEnqueueNDRangeKernel().
8.) Finally we wait for the commands to get serviced before reading back the results using clFinish() and thereby print the results after reading the output buffer from the opencl device using clEnqueueReadBuffer().
#include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <OpenCL/opencl.h> #define INP_SIZE (1024) // Simple compute kernel which computes the cube of an input array const char *KernelSource = "\n" \ "__kernel void square( __global float* input, __global float* output, \n" \ " const unsigned int count) { \n" \ " int i = get_global_id(0); \n" \ " if(i < count) \n" \ " output[i] = input[i] * input[i] * input[i]; \n" \ "} \n" ; int main(int argc, char** argv) { int err; // error code float data[INP_SIZE]; // original input data set to device float results[INP_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random values int i = 0; unsigned int count = INP_SIZE; for(i = 0; i < count; i++) data[i] = rand() / 50.00; // Connect to a compute device // If want to run your kernel on CPU then replace the parameter CL_DEVICE_TYPE_GPU // with CL_DEVICE_TYPE_CPU err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context //Contexts are responsible for managing objects such as command-queues, memory, program and kernel objects and for executing kernels on one or more devices specified in the context. 
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to 
set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results clFinish(commands); // Read back the results from the device to verify the output err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Print obtained results from OpenCL kernel for(i=0; i<count); i++ ) { printf("result[%d] = %f", i, result[i]) ; } // Cleaning up clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
It's a really helpful article for me.
ReplyDeleteThank u..