Tuesday, 11 March 2014

Matrix Multiplication in CUDA using Shared memory

Posted by Mahesh Doijade
Global memory access latency greatly hampers the performance of CUDA codes. In a naive matrix multiplication implementation there are plenty of redundant global memory accesses, since each input element is reused in the computation of several output elements. Shared memory can be leveraged to eliminate this redundancy: with a tiled approach, each thread block stages tiles of the input matrices in shared memory and reuses them, so global memory accesses are reduced by a factor of the tile size. Go through the code to understand the algorithm involved; any questions, I will be pleased to answer.
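
For a concrete sense of the saving: with the 512x512 matrices and 32x32 tiles used below, a naive kernel issues 512 global loads from A and 512 from B for every element of C, whereas in the tiled kernel each thread loads just one element of A and one of B per tile, i.e. 512/32 = 16 loads from each matrix per output element, a 32x reduction in global memory traffic.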

Matrix Multiplication in CUDA using Shared memory

#include <stdio.h>
#include <stdlib.h>
#include <math.h>   // for fabsf() used in the result verification
#include <cuda.h>

// This code assumes that your device supports a block size of 1024 threads (a 32x32 tile)
#define MAX_RANGE 9999

#define funcCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            printf("Failed to run stmt %d\n", __LINE__);                      \
            printf("Got CUDA error ... %s\n", cudaGetErrorString(err));       \
            return -1;                                                        \
        }                                                                     \
    } while(0)

// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                    int numARows, int numAColumns,
                                    int numBRows, int numBColumns,
                                    int numCRows, int numCColumns) 
{
    __shared__ float sA[32][32];   // Tile size of 32x32 
    __shared__ float sB[32][32];

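    // Row and Col index the output element of C that this thread computes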
    int Row = blockDim.y*blockIdx.y + threadIdx.y;
    int Col = blockDim.x*blockIdx.x + threadIdx.x;
    float Cvalue = 0.0;
    sA[threadIdx.y][threadIdx.x] = 0.0;
    sB[threadIdx.y][threadIdx.x] = 0.0;

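    // Walk over the tiles along the shared dimension; (numAColumns - 1)/32 + 1 is the tile count rounded up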
    for (int k = 0; k < (((numAColumns - 1)/ 32) + 1); k++)
    {
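        // Each thread loads one element of the current 32x32 tile of A and of B,
        // writing 0.0 when its index falls outside the matrix (zero padding)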
        if ( (Row < numARows) && (threadIdx.x + (k*32)) < numAColumns)
        {
            sA[threadIdx.y][threadIdx.x] = A[(Row*numAColumns) + threadIdx.x + (k*32)];
        }
        else
        {
            sA[threadIdx.y][threadIdx.x] = 0.0;
        }            
        if ( Col < numBColumns && (threadIdx.y + k*32) < numBRows)
        {
            sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k*32)*numBColumns + Col];
        }
        else
        {
            sB[threadIdx.y][threadIdx.x] = 0.0;
        }            
        __syncthreads();

        // Multiply the two tiles together; each thread accumulates one output value
        for (int j = 0; j < 32; ++j)
        {
            Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x];
        }
        // Synchronize again so no thread overwrites the shared tiles while others are still reading them
        __syncthreads();
    }
    if (Row < numCRows && Col < numCColumns)
    {
        C[Row*numCColumns + Col] = Cvalue;
    }
}

void matMultiplyOnHost(float * A, float * B, float * C, int numARows,
                        int numAColumns, int numBRows, int numBColumns,
                        int numCRows, int numCColumns)
{
    for (int i=0; i < numARows; i ++)
    {
        for (int j = 0; j < numBColumns; j++)
        {
            C[i*numCColumns + j] = 0.0;
            // k runs over the shared dimension (numAColumns == numBRows)
            for (int k = 0; k < numAColumns; k++)
            {
                C[i*numCColumns + j] += A[i*numAColumns + k] * B[k*numBColumns + j];
            }
        }
    }
    return;
}

int main(int argc, char ** argv) {
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * hostComputedC;
    float * deviceA;
    float * deviceB;
    float * deviceC;

    // Please adjust rows and columns according to your needs.
    int numARows = 512; // number of rows in the matrix A
    int numAColumns = 512; // number of columns in the matrix A
    int numBRows = 512; // number of rows in the matrix B
    int numBColumns = 512; // number of columns in the matrix B

    int numCRows; // number of rows in the matrix C (set below)
    int numCColumns; // number of columns in the matrix C (set below)

    hostA = (float *) malloc(sizeof(float)*numARows*numAColumns);
    hostB = (float *) malloc(sizeof(float)*numBRows*numBColumns);

    for (int i = 0; i < numARows*numAColumns; i++)
    {
        hostA[i] = (rand() % MAX_RANGE) / 2.0;
    }
    for (int i = 0; i < numBRows*numBColumns; i++)
    {
        hostB[i] = (rand() % MAX_RANGE) / 2.0;
    }

    // Setting numCRows and numCColumns
    numCRows = numARows;
    numCColumns = numBColumns;

    hostC = (float *) malloc(sizeof(float)*numCRows*numCColumns);    
    hostComputedC = (float *) malloc(sizeof(float)*numCRows*numCColumns);    

    // Allocating GPU memory
    funcCheck(cudaMalloc((void **)&deviceA, sizeof(float)*numARows*numAColumns));
    funcCheck(cudaMalloc((void **)&deviceB, sizeof(float)*numBRows*numBColumns));
    funcCheck(cudaMalloc((void **)&deviceC, sizeof(float)*numCRows*numCColumns));

    // Copy memory to the GPU 
    funcCheck(cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice));
    funcCheck(cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice));

    // Initialize the grid and block dimensions 
    dim3 dimBlock(32, 32, 1);    
    dim3 dimGrid((numCColumns - 1)/32 + 1, (numCRows - 1)/32 + 1, 1);  // ceiling division, matching the tile loop in the kernel

    //@@ Launch the GPU Kernel here
    matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);    

    cudaError_t err1 = cudaPeekAtLastError();
    cudaDeviceSynchronize();
    printf( "Got CUDA error ... %s \n", cudaGetErrorString(err1));

    // Copy the results in GPU memory back to the CPU    
    funcCheck(cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns, cudaMemcpyDeviceToHost));

    matMultiplyOnHost(hostA, hostB, hostComputedC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

    // Compare with a small relative tolerance: the tiled kernel and the host
    // loop accumulate the partial products in different orders, so the two
    // floating-point results can legitimately differ in the last few bits
    for (int i=0; i < numCColumns*numCRows; i++)
    {
        if (fabsf(hostComputedC[i] - hostC[i]) > 1e-4f * fabsf(hostComputedC[i]))
        {
            printf("Mismatch at Row = %d Col = %d hostComputed[] = %f --device[] %f\n", i / numCColumns, i % numCColumns, hostComputedC[i], hostC[i]);
            break;
        }
    }
    // Free the GPU memory
    funcCheck(cudaFree(deviceA));
    funcCheck(cudaFree(deviceB));        
    funcCheck(cudaFree(deviceC));    

    free(hostA);
    free(hostB);
    free(hostC);
    free(hostComputedC);

    return 0;
}
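
To compile and run the code (assuming it is saved as matmul.cu and the CUDA toolkit's nvcc is on your PATH):

nvcc -o matmul matmul.cu
./matmul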

7 comments:

  1. Hello, when I tried to run this program I got this error:
    "Got CUDA error ... invalid configuration argument
    Mismatch at Row = 0 Col = 0 hostComputed[] = 3429991680.000000 --device[] 2529568768.000000"
    How can I fix this?

    Replies
    1. This is most probably due to your card not supporting 1024 threads in a thread block; check your card's compute capability specs. The workaround is to replace 32, everywhere it is used, with a tile size your card supports; so if its maximum is 256 threads per block, then your tile size should be 16x16. Two places to change (see the sketch below):
      1.) everywhere '32' appears in the kernel should be made '16'
      2.) the block and grid declarations, dim3 dimBlock(32, 32, 1); dim3 dimGrid((numCColumns - 1)/32 + 1, (numCRows - 1)/32 + 1, 1);
      should become --> dim3 dimBlock(16, 16, 1); dim3 dimGrid((numCColumns - 1)/16 + 1, (numCRows - 1)/16 + 1, 1);
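
      One way to avoid touching so many places (a small refactoring sketch, not part of the original post) is to define the tile size once as a compile-time constant and use that constant everywhere the code currently says 32:

      #define TILE_WIDTH 16   // hypothetical constant; set to 32 on cards that allow 1024 threads per block

      // inside matrixMultiplyShared:
      __shared__ float sA[TILE_WIDTH][TILE_WIDTH];
      __shared__ float sB[TILE_WIDTH][TILE_WIDTH];

      // in main():
      dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
      dim3 dimGrid((numCColumns - 1)/TILE_WIDTH + 1, (numCRows - 1)/TILE_WIDTH + 1, 1);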

  2. Thank you so much for your reply.
    I am getting the same kind of error, but with the modified values, such as
    "Got CUDA error ... no error
    Mismatch at Row = 0 Col = 3 hostComputed[] = 3197444864.000000 --device[] 3197444608.000000"
    when I applied all the specifications you mentioned above.

  3. In the kernel call, the argument "deviceC" is a matrix, while in the kernel function, from "C[Row*numCColumns + Col] = Cvalue;", it appears to be a vector.

    Replies
    1. Mohit, deviceC is indeed a matrix; we just allocate it in a 1D fashion. This is better both allocation-wise and memory-access-wise: global memory coalescing is easier to achieve when the matrix is laid out contiguously in one dimension.
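
      As an illustration (a minimal sketch, not from the original post), in this row-major 1D layout element (Row, Col) of C lives at offset Row*numCColumns + Col:

      // Hypothetical helper showing the row-major 2D -> 1D index mapping
      __host__ __device__ float getC(const float *C, int Row, int Col, int numCColumns)
      {
          return C[Row * numCColumns + Col];
      }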

  4. 1) Are you using shared memory of size 32x32 instead of global memory?
    2) The input matrix is 512x512 but you are using shared memory of size 32x32; won't it overflow? Where is the division of the matrix into sub-matrices, and where are they loaded into shared memory?

    Replies
    1. Hi,
      Below are the answers to your questions.
      1) Are you using shared memory of size 32x32 instead of global memory?
      --> Yes, the code uses 32x32 shared memory tiles: each thread block reads its chunk of matrix A and matrix B into its respective 32x32 shared memory arrays. One shouldn't carry the multiplication out of global memory, because i) global memory is very slow compared to shared memory, and ii) whenever an operation reuses data, you should leverage the fast shared memory.

      2) The input matrix is 512x512 but you are using shared memory of size 32x32; won't it overflow? Where is the division of the matrix into sub-matrices and their loading into shared memory?
      --> The 512x512 input matrices are divided among the 2D thread blocks "dimBlock(32, 32, 1)"; this all happens in the "matrixMultiplyShared" kernel's for-loop. There is no overflow, because each 512x512 matrix is processed as 32x32 sub-matrices, and only the current pair of sub-matrices is resident in shared memory at a time.
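
      To put numbers on it: 512/32 = 16, so the grid is 16x16 = 256 thread blocks and each block's k-loop runs 16 iterations. At any moment a block holds only two 32x32 float tiles in shared memory, i.e. 2 * 32 * 32 * 4 bytes = 8 KB, comfortably within the shared memory available per block (16 KB to 48 KB depending on compute capability).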
