Hello, I am doing the matrix multiply in TVM, that is , P=M*N,where M(N0, K), N(K, M0), and I wish TVM can generate some similar code as I write:
__global__ void cut3_on_gpu(float* M, float* N, float* P, int N0, int K, int M0)
{
// define shared memory used in the block
__shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by * blockDim.y + ty;
int Col = bx * blockDim.x + tx;
float Pvalue = 0;
// Loop over the M and N tiles required to compute the P element
for (int p = 0; p < (K-1)/TILE_WIDTH+1; ++p) {
// Collaborative loading of M and N tiles into shared memory
// boundary check when loading data
if(Row<N0 && p*TILE_WIDTH+tx < K)
ds_M[ty][tx] = M[Row * K + p*TILE_WIDTH+tx];
else
ds_M[ty][tx] = 0.0;
if(p*TILE_WIDTH+ty<K && Col < M0)
ds_N[ty][tx] = N[(p*TILE_WIDTH+ty)*M0 + Col];
else
ds_N[ty][tx] = 0.0;
__syncthreads();
if(Row < N0 && Col < M0)
{
for (int i = 0; i < TILE_WIDTH; ++i)Pvalue += ds_M[ty][i] * ds_N[i][tx];
}
__syncthreads();
}
if(Row<N0 && Col < M0)
P[Row*M0+Col] = Pvalue;
}
But I am not sure how I can do Collaborative loading into shared memory and do calculation multiple times.Anyone can tell me some solutions or suggestions about the problem? Thanks!