Sunday, 16 June 2013

OpenCL example

Posted by Mahesh Doijade


what is opencl, opencl


     The code below illustrates cube calculation using OpenCL. This OpenCL example is structured as follows :
1.) Initially the OpenCL kernel is written in const char *KernelSource.

2.) At the start of main we define all the requisite OpenCL related and other normal variables.

3.) Next in this opencl example, we set OpenCL required environment for running the kernel using functions like clGetDeviceIDs(), clCreateContext(), clCreateCommandQueue().

4.) We then create the program from the source held in char *KernelSource using the function clCreateProgramWithSource(), build it with clBuildProgram(), and create our kernel object using clCreateKernel().

5.) Then we allocate memory for the input and output on the selected OpenCL device using the function clCreateBuffer().

6.) Next we write our input data into the allocated memory using the function clEnqueueWriteBuffer() and set the arguments for the compute kernel with clSetKernelArg(), as shown in the OpenCL example below.

7.) We get the maximum work group size for executing the kernel on the device through clGetKernelWorkGroupInfo() followed by executing the kernel over the entire range of our 1d input data set using the maximum number of work group items for this device through clEnqueueNDRangeKernel().

8.) Finally we wait for the commands to get serviced before reading back the results using clFinish() and thereby print the results after reading the output buffer from the opencl device using clEnqueueReadBuffer()    

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>

#define INP_SIZE (1024)

/*
 * OpenCL C source of the compute kernel, handed to the OpenCL runtime as a
 * string and compiled at run time.
 *
 * Note: the kernel is named "square", but it writes the CUBE of each input
 * element (input[i] * input[i] * input[i]) to output[i]. Work-items whose
 * global id is not below `count` write nothing.
 */
const char *KernelSource =
    "\n"
    "__kernel void square( __global float* input, __global float* output, \n"
    " const unsigned int count) {            \n"
    " int i = get_global_id(0);              \n"
    " if(i < count) \n"
    " output[i] = input[i] * input[i] * input[i]; \n"
    "}                     \n";

int main(int argc, char** argv)
{

 int err; // error code
 float data[INP_SIZE]; // original input data set to device
 float results[INP_SIZE]; // results returned from device
 unsigned int correct; // number of correct results returned

 size_t global; // global domain size 
 size_t local; // local domain size 

 cl_device_id device_id; // compute device id 
 cl_context context; // compute context
 cl_command_queue commands; // compute command queue
 cl_program program; // compute program
 cl_kernel kernel; // compute kernel
 cl_mem input; // device memory used for the input array
 cl_mem output; // device memory used for the output array

 // Fill our data set with random values
 int i = 0;
 unsigned int count = INP_SIZE;

 for(i = 0; i < count; i++)
 data[i] = rand() / 50.00;

 
 // Connect to a compute device
 // If want to run your kernel on CPU then replace the parameter CL_DEVICE_TYPE_GPU 
 // with CL_DEVICE_TYPE_CPU

 err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

 if (err != CL_SUCCESS)
 {
     printf("Error: Failed to create a device group!\n");
     return EXIT_FAILURE;
 }


 // Create a compute context
 //Contexts are responsible for managing objects such as command-queues, memory, program and kernel objects and for executing kernels on one or more devices specified in the context.

 context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

 if (!context)
 {
     printf("Error: Failed to create a compute context!\n");
     return EXIT_FAILURE;
 }

 // Create a command commands
 commands = clCreateCommandQueue(context, device_id, 0, &err);
 if (!commands)
 {
     printf("Error: Failed to create a command commands!\n");
     return EXIT_FAILURE;
 }

 // Create the compute program from the source buffer
 program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
 if (!program)
 {
     printf("Error: Failed to create compute program!\n");
     return EXIT_FAILURE;
 }

 // Build the program executable
 err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 if (err != CL_SUCCESS)
 {
    size_t len;
    char buffer[2048];
    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    exit(1);
 }

 // Create the compute kernel in the program we wish to run
 kernel = clCreateKernel(program, "square", &err);
 if (!kernel || err != CL_SUCCESS)
 {
    printf("Error: Failed to create compute kernel!\n");
    exit(1);
 }

 // Create the input and output arrays in device memory for our calculation
 input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
 output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);

 if (!input || !output)
 {
    printf("Error: Failed to allocate device memory!\n");
    exit(1);
 } 

 // Write our data set into the input array in device memory 
 err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
 if (err != CL_SUCCESS)
 {
    printf("Error: Failed to write to source array!\n");
    exit(1);
 }

 // Set the arguments to our compute kernel
 err = 0;
 err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
 err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
 err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);

 if (err != CL_SUCCESS)
 {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    exit(1);
 }

 // Get the maximum work group size for executing the kernel on the device
 err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
 if (err != CL_SUCCESS)
 {
    printf("Error: Failed to retrieve kernel work group info! %d\n", err);
    exit(1);
 }

 // Execute the kernel over the entire range of our 1d input data set
 // using the maximum number of work group items for this device
 global = count;
 err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 if (err)
 {
    printf("Error: Failed to execute kernel!\n");
    return EXIT_FAILURE;
 }

 // Wait for the command commands to get serviced before reading back results
 clFinish(commands);

 // Read back the results from the device to verify the output
 err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); 
 if (err != CL_SUCCESS)
 {
    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
 }

  // Print obtained results from OpenCL kernel
 for(i=0; i<count); i++ )
 {
    printf("result[%d] = %f", i, result[i]) ;
 }

 // Cleaning up
 clReleaseMemObject(input);
 clReleaseMemObject(output);
 clReleaseProgram(program);
 clReleaseKernel(kernel);
 clReleaseCommandQueue(commands);
 clReleaseContext(context);

 return 0;
}


1 comment:

  1. Its really helpful article for me....
    Thank u..

    ReplyDelete