Global memory access penalties greatly hamper the performance of CUDA codes because of the latency involved. In a naive matrix-multiplication implementation it is apparent that there are plenty of redundant global memory accesses, since many of the accessed elements can be reused when computing several result elements. To eliminate this redundancy, one can leverage shared memory to overcome the unfavorable global memory access pattern. With the tile-based approach, global memory accesses are reduced by a factor of the tile size — that is, the shared-memory size used for the part of the matrix involved. Go through the code to understand the algorithm involved; I will be pleased to answer any questions.

#### Matrix Multiplication in CUDA using Shared memory

```
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
// This code assumes that your device supports a block size of 1024 threads
// (32 x 32); see the comment thread below for how to shrink the tile size
// on older hardware.
#define MAX_RANGE 9999 // matrix entries are drawn from [0, MAX_RANGE)
// Checks the cudaError_t returned by a CUDA runtime call; on failure it
// reports the failing source line and the CUDA error string, then bails out.
// NOTE: it does `return -1;`, so it is only usable inside a function that
// returns int (here, main).
#define funcCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
printf( "Failed to run stmt %d ", __LINE__); \
printf( "Got CUDA error ... %s ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)
// Compute C = A * B using shared-memory tiling.
// Expected launch: 2D grid of 32x32 thread blocks covering C, one thread per
// output element (blockDim must be (32, 32, 1) to match the tile size).
// Preconditions: numAColumns == numBRows, numCRows == numARows,
// numCColumns == numBColumns; all matrices are row-major flat arrays.
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    const int TILE = 32;             // tile edge; must equal blockDim.x and blockDim.y
    __shared__ float sA[TILE][TILE]; // tile of A staged in shared memory
    __shared__ float sB[TILE][TILE]; // tile of B staged in shared memory

    int Row = blockDim.y * blockIdx.y + threadIdx.y; // output row of this thread
    int Col = blockDim.x * blockIdx.x + threadIdx.x; // output column of this thread
    float Cvalue = 0.0f;

    // Walk the tiles along the shared dimension (A's columns == B's rows).
    int numTiles = (numAColumns + TILE - 1) / TILE;
    for (int k = 0; k < numTiles; k++)
    {
        // Stage one tile of A and one tile of B. Out-of-range elements are
        // zero-padded so the inner product below needs no bounds checks.
        if (Row < numARows && (k * TILE + threadIdx.x) < numAColumns)
            sA[threadIdx.y][threadIdx.x] = A[Row * numAColumns + k * TILE + threadIdx.x];
        else
            sA[threadIdx.y][threadIdx.x] = 0.0f;

        if (Col < numBColumns && (k * TILE + threadIdx.y) < numBRows)
            sB[threadIdx.y][threadIdx.x] = B[(k * TILE + threadIdx.y) * numBColumns + Col];
        else
            sB[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads(); // all tile loads must finish before anyone reads them

        for (int j = 0; j < TILE; ++j)
            Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x];

        // BUG FIX: without this barrier, the next iteration may overwrite the
        // shared tiles while slower threads are still reading them in the
        // product loop above — a data race producing nondeterministic results.
        __syncthreads();
    }

    if (Row < numCRows && Col < numCColumns)
        C[Row * numCColumns + Col] = Cvalue;
}
// Reference CPU implementation of C = A * B, used to validate the GPU result.
// All matrices are row-major flat arrays.
// Preconditions: numAColumns == numBRows, numCRows == numARows,
// numCColumns == numBColumns.
void matMultiplyOnHost(float * A, float * B, float * C, int numARows,
                       int numAColumns, int numBRows, int numBColumns,
                       int numCRows, int numCColumns)
{
    for (int i = 0; i < numCRows; i++)
    {
        // BUG FIX: the column loop must cover C's columns and the inner
        // product must run over the shared dimension (numAColumns ==
        // numBRows). The original used numAColumns and numCColumns
        // respectively, which only happened to work for square matrices and
        // writes out of bounds otherwise.
        for (int j = 0; j < numCColumns; j++)
        {
            float sum = 0.0f;
            for (int k = 0; k < numAColumns; k++)
            {
                sum += A[i * numAColumns + k] * B[k * numBColumns + j];
            }
            C[i * numCColumns + j] = sum;
        }
    }
    return;
}
// Driver: builds two random matrices, multiplies them on the GPU with the
// tiled kernel, recomputes the product on the CPU, and reports any mismatch.
int main(int argc, char ** argv) {
    float * hostA;          // input matrix A (host)
    float * hostB;          // input matrix B (host)
    float * hostC;          // product computed by the GPU
    float * hostComputedC;  // reference product computed on the CPU
    float * deviceA;
    float * deviceB;
    float * deviceC;

    // Please adjust rows and columns according to your need.
    int numARows = 512;     // number of rows in matrix A
    int numAColumns = 512;  // number of columns in matrix A
    int numBRows = 512;     // number of rows in matrix B (must equal numAColumns)
    int numBColumns = 512;  // number of columns in matrix B
    int numCRows;           // number of rows in matrix C (set below)
    int numCColumns;        // number of columns in matrix C (set below)

    hostA = (float *) malloc(sizeof(float)*numARows*numAColumns);
    hostB = (float *) malloc(sizeof(float)*numBRows*numBColumns);

    // Fill the inputs with pseudo-random values.
    for (int i = 0; i < numARows*numAColumns; i++)
    {
        hostA[i] = (rand() % MAX_RANGE) / 2.0f;
    }
    for (int i = 0; i < numBRows*numBColumns; i++)
    {
        hostB[i] = (rand() % MAX_RANGE) / 2.0f;
    }

    // C = A * B is (numARows x numBColumns).
    numCRows = numARows;
    numCColumns = numBColumns;
    hostC = (float *) malloc(sizeof(float)*numCRows*numCColumns);
    hostComputedC = (float *) malloc(sizeof(float)*numCRows*numCColumns);

    // Allocating GPU memory
    funcCheck(cudaMalloc((void **)&deviceA, sizeof(float)*numARows*numAColumns));
    funcCheck(cudaMalloc((void **)&deviceB, sizeof(float)*numBRows*numBColumns));
    funcCheck(cudaMalloc((void **)&deviceC, sizeof(float)*numCRows*numCColumns));

    // Copy memory to the GPU
    funcCheck(cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice));
    funcCheck(cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice));

    // One thread per element of C. Ceil-divide so partial tiles are covered
    // without launching a redundant extra block when the size divides evenly
    // (the original N/32 + 1 always over-allocated in that case).
    dim3 dimBlock(32, 32, 1);
    dim3 dimGrid((numCColumns + dimBlock.x - 1) / dimBlock.x,
                 (numCRows + dimBlock.y - 1) / dimBlock.y, 1);

    //@@ Launch the GPU Kernel here
    matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
        numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);
    // A kernel launch returns no status directly: check for launch-config
    // errors immediately, then for asynchronous execution errors after sync.
    funcCheck(cudaGetLastError());
    funcCheck(cudaDeviceSynchronize());

    // Copy the results in GPU memory back to the CPU
    funcCheck(cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns, cudaMemcpyDeviceToHost));

    matMultiplyOnHost(hostA, hostB, hostComputedC, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

    // GPU and CPU accumulate in different orders, so the results may differ
    // in the low bits. Compare with a relative tolerance instead of exact
    // equality (the original used !=, which reports benign rounding
    // differences as mismatches).
    for (int i = 0; i < numCColumns*numCRows; i++)
    {
        float expected = hostComputedC[i];
        float actual = hostC[i];
        float tolerance = 1e-4f * fabsf(expected) + 1e-4f;
        if (fabsf(expected - actual) > tolerance)
        {
            printf("Mismatch at Row = %d Col = %d hostComputed[] = %f --device[] %f\n",
                   i / numCColumns, i % numCColumns, expected, actual);
            break;
        }
    }

    // Free the GPU memory
    funcCheck(cudaFree(deviceA));
    funcCheck(cudaFree(deviceB));
    funcCheck(cudaFree(deviceC));
    free(hostA);
    free(hostB);
    free(hostC);
    free(hostComputedC);
    return 0;
}
```

Hello, when i tried to run this program i got this error as:

"Got CUDA error ... invalid configuration argument

Mismatch at Row = 0 Col = 0 hostComputed[] = 3429991680.000000 --device[] 2529568768.000000" how can i fix this ?

This is most probably due to your card not supporting 1024 threads in a thread block; check your card's compute-capability specs. The workaround for this is to change every place where 32 is used to the tile size your card supports — so, for example, if its maximum is 256 threads per block, then your tile size should be 16x16. So, two places to change --

1.) every place in the kernel where '32' appears should be changed to '16'

2.) when declaring the launch dimensions: dim3 dimBlock(32, 32, 1); dim3 dimGrid((numCColumns/32) + 1, (numCRows/32) + 1, 1);

this should be --> dim3 dimBlock(16, 16, 1); dim3 dimGrid((numCColumns/16) + 1, (numCRows/16) + 1, 1);

Thank you so much for your reply

ReplyDeleteI am getting the same error but with the modified values such as

"Got CUDA error ... no error

Mismatch at Row = 0 Col = 3 hostComputed[] = 3197444864.000000 --device[] 3197444608.000000"

when i applied all the specifications you mentioned above.

In kernel call , the argument "deviceC" is a matrix, while in the kernel function from "C[Row*numCColumns + Col] = Cvalue;"it appears it is a vector.

Mohit, deviceC is indeed a matrix; we simply allocate it in a 1D fashion, which is better both allocation-wise and memory-access-performance-wise. Basically, we do this because global memory coalescing can be easily achieved when the matrix is allocated as a 1-dimensional array.

Delete1)are you using shared memory of size 32x32 instead of globalmemory ?

ReplyDelete2)the input matrix is 512x512 but you are using shared memory of size 32 x32 wont it overflow?where is the division of matrix into submatices and allocating them to the shared memory ?

Hi,

DeleteBelow are the answers for your questions,

1)are you using shared memory of size 32x32 instead of globalmemory ?

--> Yes, the code uses 32x32 shared memory; each thread block reads its chunk of matrices A and B into their respective 32x32 shared-memory tiles. One shouldn't use global memory for carrying out the matrix multiplication because i) global memory is very slow compared to shared memory, and ii) whenever an operation involves data reuse, you should always leverage the fast shared memory.

2)the input matrix is 512x512 but you are using shared memory of size 32 x32 wont it overflow? where is the division of matrix into submatices and allocating them to the shared memory?

--> The input matrix of 512x512 is divided amongst the 2D thread blocks "dimBlock(32, 32, 1)", this is all happening in the "matrixMultiplyShared" kernel's for-loop. There is no overflow here as the matrix 512x512 is divided into sub-matrices 32x32 and that is loaded into the shared memory.