The code below illustrates cube calculation using
OpenCL. This
OpenCL example is structured as follows:
1.) First, the
OpenCL kernel source is written as a string in const char *KernelSource.
2.) At the start of main we define all the requisite
OpenCL related and other normal variables.
3.) Next in this
OpenCL example, we set up the environment OpenCL requires for running the kernel, using functions like
clGetDeviceIDs(),
clCreateContext() and
clCreateCommandQueue().
4.) We then create a program from the source held in
const char *KernelSource using the function
clCreateProgramWithSource(), followed by building it with
clBuildProgram() and creating our kernel object using
clCreateKernel().
5.) Then we allocate memory for the input and output on the selected OpenCL device using the function
clCreateBuffer().
6.) Next we write our input data into the allocated memory using the function
clEnqueueWriteBuffer() and
set the arguments for the compute kernel with clSetKernelArg(), as shown in the OpenCL example below.
7.) We get the maximum work group size for executing the kernel on the device through
clGetKernelWorkGroupInfo() followed by executing the kernel over the entire range of our 1d input data set using the maximum number of work group items for this device through
clEnqueueNDRangeKernel().
8.) Finally we wait for the commands to be serviced using
clFinish(), read the output buffer back from the OpenCL device using
clEnqueueReadBuffer(), and print the results.
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#define INP_SIZE (1024)
/*
 * OpenCL C source of the compute kernel, kept as one string literal.
 *
 * The kernel is named "square" but writes the cube of each input element:
 * every work-item takes its global id in dimension 0 as the index i and,
 * when i is below count, stores input[i]^3 into output[i].
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are needed between them.
 */
const char *KernelSource =
    "\n"
    "__kernel void square( __global float* input, __global float* output, \n"
    " const unsigned int count) { \n"
    " int i = get_global_id(0); \n"
    " if(i < count) \n"
    " output[i] = input[i] * input[i] * input[i]; \n"
    "} \n";
/*
 * Print the build log of `program` for `device` to stdout.
 *
 * The log length is queried first so the buffer is always large enough;
 * nothing is printed if either query or the allocation fails.
 */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t log_size = 0;
    char *log;

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size) != CL_SUCCESS || log_size == 0)
    {
        return;
    }

    log = malloc(log_size + 1);
    if (!log)
    {
        return;
    }

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              log_size, log, NULL) == CL_SUCCESS)
    {
        log[log_size] = '\0'; // guarantee termination before printing with %s
        printf("%s\n", log);
    }
    free(log);
}

/*
 * Cube INP_SIZE random floats on an OpenCL GPU device and print the results.
 *
 * Steps: pick a device, create a context and command queue, build the
 * program from KernelSource, create the "square" kernel, upload the input,
 * run the kernel, read the output back and print it.
 *
 * Returns 0 on success and EXIT_FAILURE if any OpenCL call fails. Every
 * OpenCL object created so far is released on all exit paths.
 */
int main(int argc, char** argv)
{
    int err;                            // error code returned by OpenCL calls
    int status = EXIT_FAILURE;          // process exit status; set to 0 on success
    float data[INP_SIZE];               // original input data set sent to device
    float results[INP_SIZE];            // results returned from device
    size_t global;                      // global domain size
    size_t local;                       // local domain size
    cl_device_id device_id;             // compute device id
    cl_context context = NULL;          // compute context
    cl_command_queue commands = NULL;   // compute command queue
    cl_program program = NULL;          // compute program
    cl_kernel kernel = NULL;            // compute kernel
    cl_mem input = NULL;                // device memory used for the input array
    cl_mem output = NULL;               // device memory used for the output array
    unsigned int count = INP_SIZE;      // number of elements to process
    unsigned int i;

    (void)argc;
    (void)argv;

    // Fill our data set with random values
    for (i = 0; i < count; i++)
    {
        data[i] = (float)(rand() / 50.00);
    }

    // Connect to a compute device.
    // To run the kernel on the CPU, replace CL_DEVICE_TYPE_GPU with
    // CL_DEVICE_TYPE_CPU.
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        goto cleanup;
    }

    // Create a compute context. Contexts manage command queues, memory,
    // program and kernel objects, and execute kernels on their devices.
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        goto cleanup;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        goto cleanup;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        goto cleanup;
    }

    // Build the program executable
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to build program executable!\n");
        print_build_log(program, device_id);
        goto cleanup;
    }

    // Create the compute kernel; the name must match the __kernel function
    // declared in KernelSource.
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        goto cleanup;
    }

    // Create the input and output arrays in device memory for our calculation
    input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        goto cleanup;
    }

    // Write our data set into the input array in device memory (blocking write)
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        goto cleanup;
    }

    // Set the arguments to our compute kernel. Stop at the first failure so
    // the reported code is a real OpenCL error code rather than an OR of several.
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    if (err == CL_SUCCESS)
    {
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    }
    if (err == CL_SUCCESS)
    {
        err = clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    }
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        goto cleanup;
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        goto cleanup;
    }

    // Execute the kernel over the entire range of our 1d input data set.
    // The global size is rounded up to a multiple of the local size, because
    // an uneven split is rejected with CL_INVALID_WORK_GROUP_SIZE; the extra
    // work-items are discarded by the kernel's `i < count` guard.
    global = ((count + local - 1) / local) * local;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel!\n");
        goto cleanup;
    }

    // Wait for the queued commands to get serviced before reading back results
    err = clFinish(commands);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel!\n");
        goto cleanup;
    }

    // Read back the results from the device (blocking read)
    err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        goto cleanup;
    }

    // Print obtained results from OpenCL kernel, one per line
    for (i = 0; i < count; i++)
    {
        printf("result[%u] = %f\n", i, results[i]);
    }

    status = 0;

cleanup:
    // Release only what was actually created; handles start out as NULL.
    if (input)
    {
        clReleaseMemObject(input);
    }
    if (output)
    {
        clReleaseMemObject(output);
    }
    if (kernel)
    {
        clReleaseKernel(kernel);
    }
    if (program)
    {
        clReleaseProgram(program);
    }
    if (commands)
    {
        clReleaseCommandQueue(commands);
    }
    if (context)
    {
        clReleaseContext(context);
    }
    return status;
}