Topi dense() operator crashes on GPU

Hi,
I'm currently writing a program to test TVM's performance on GPU, and here is my code.

import tvm
import topi
import numpy as np
import time

tgt_host = "llvm"
tgt = tvm.target.cuda("llvm device=0")


Input = 16
Batch_size = 128 * 1000 * 1000
Output = 1
X = tvm.placeholder((Batch_size, Input), name="X")
W = tvm.placeholder((Output, Input), name="W")
B = tvm.placeholder((Output,), name="B")

C = topi.nn.dense(X, W, B)

with tvm.target.cuda():
    s = topi.generic.schedule_dense(C)

fadd = tvm.build(s, [X, W, B, C], tgt, target_host="llvm", name="percp")

x_np = np.random.uniform(size=(Batch_size, Input)).astype(X.dtype)
w_np = np.random.uniform(size=(Output, Input)).astype(W.dtype)
b_np = np.random.uniform(size=(Output,)).astype(B.dtype)

startTime = time.time()

ctx = tvm.gpu(0)

a_nd = tvm.nd.array(x_np, ctx)
w_nd = tvm.nd.array(w_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
c_nd = tvm.nd.array(np.zeros((Batch_size, 1), dtype=B.dtype), ctx)

load_time = time.time() - startTime
print("Loading time: {} secs".format(load_time))
fadd(a_nd, w_nd, b_nd, c_nd)
print(c_nd.asnumpy())
total_time = time.time() - startTime
print("Total time: {} secs".format(total_time))

However, it crashes once Batch_size grows beyond a certain threshold (somewhere above 10000).

Traceback (most recent call last):
  File "perceptron.py", line 58, in <module>
    fadd(a_nd, w_nd, b_nd, c_nd)
  File "/usr/tvm/python/tvm/_ffi/function.py", line 127, in __call__
    return f(*args)
  File "/usr/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (3) /usr/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7ff260ef2321]
  [bt] (2) /usr/tvm/build/libtvm.so(+0x95608c) [0x7ff260f6208c]
  [bt] (1) /usr/tvm/build/libtvm.so(+0x955b12) [0x7ff260f61b12]
  [bt] (0) /usr/tvm/build/libtvm.so(+0x1443b2) [0x7ff2607503b2]
  File "/usr/tvm/src/runtime/cuda/cuda_module.cc", line 196
  File "/usr/tvm/src/runtime/module_util.cc", line 54
TVMError: Check failed: ret == 0 (-1 vs. 0) : CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
 grid=(1,128000000,1),  block=(64,1,1)
// func_name=percp_kernel0
// CUDA Source
// -----------
extern "C" __global__ void percp_kernel0( float* __restrict__ X,  float* __restrict__ W,  float* __restrict__ compute,  float* __restrict__ B) {
   float compute_rf[1];
  __shared__ float red_buf0[64];
  __shared__ float compute1[1];
  compute_rf[0] = 0.000000e+00f;
  if (((int)threadIdx.x) < 16) {
    compute_rf[0] = (compute_rf[0] + (X[((((int)blockIdx.y) * 16) + ((int)threadIdx.x))] * W[((int)threadIdx.x)]));
  }
  __syncthreads();
  ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = compute_rf[0];
  __syncthreads();
  if (((int)threadIdx.x) < 32) {
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 32)]);
  }
  __syncthreads();
  if (((int)threadIdx.x) < 16) {
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 16)]);
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 8)]);
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 4)]);
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 2)]);
    ((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 1)]);
  }
  __syncthreads();
  if (((int)threadIdx.x) == 0) {
    compute1[0] = ((volatile __shared__ float*)red_buf0)[0];
  }
  if (((int)threadIdx.x) == 0) {
    compute[((int)blockIdx.y)] = (compute1[0] + B[0]);
  }
}

The program runs inside an NVIDIA Docker container with TVM 0.6, and my CUDA version is 10.0.

Thanks

It seems like the launch configuration generated by the runtime exceeds the maximum allowed: the schedule maps each batch row to blockIdx.y, so the error reports grid=(1,128000000,1).
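
For reference, CUDA caps gridDim.y (and gridDim.z) at 65535, while gridDim.x can go up to 2**31 - 1 on compute capability 3.0+ devices, so a grid.y of 128,000,000 is rejected with CUDA_ERROR_INVALID_VALUE. A quick sanity check of the reported launch configuration (the 65535 cap is the standard CUDA limit, assumed here rather than queried from the device):

# Sanity check of the launch configuration from the error message.
# Assumption: gridDim.y is capped at 65535 (the standard CUDA limit).
MAX_GRID_DIM_Y = 65535

Batch_size = 128 * 1000 * 1000
grid = (1, Batch_size, 1)   # grid reported in the error
block = (64, 1, 1)          # block reported in the error

print("grid.y = {}, limit = {}".format(grid[1], MAX_GRID_DIM_Y))
if grid[1] > MAX_GRID_DIM_Y:
    print("grid.y exceeds the limit -> CUDA_ERROR_INVALID_VALUE")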

Do you see any warnings about a fallback config being used? It could be that the fallback schedule uses too many threads. You can try tuning to fix this. https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html
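
If tuning produces a log file (the linked tutorial walks through extracting the tasks and running the tuner), wrapping the build in autotvm.apply_history_best is what makes TVM pick the tuned schedule instead of the fallback. A minimal sketch, assuming a log file named dense_tuning.log written by the tuner (for example via autotvm.callback.log_to_file):

import tvm
import topi
from tvm import autotvm

Batch_size, Input, Output = 128 * 1000 * 1000, 16, 1
X = tvm.placeholder((Batch_size, Input), name="X")
W = tvm.placeholder((Output, Input), name="W")
B = tvm.placeholder((Output,), name="B")

# "dense_tuning.log" is a placeholder name for the tuning log.
with autotvm.apply_history_best("dense_tuning.log"):
    with tvm.target.cuda():
        # Compute and schedule are created inside the dispatch context
        # so the tuned config is used instead of the fallback.
        C = topi.nn.dense(X, W, B)
        s = topi.generic.schedule_dense(C)
    fadd = tvm.build(s, [X, W, B, C], tvm.target.cuda(), target_host="llvm", name="percp")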

Actually, I've seen that kind of warning before when I was compiling nnvm's predefined model (vgg11) with a batch size bigger than 1.

But no warnings appeared when running the code I posted above.

I will try tuning and see if it can fix the problem.

Also, topi.generic.schedule_reduce doesn't seem to have this issue.

The following code works well:

import tvm
import topi
import numpy as np
import time

use_gpu = True

tgt_host = "llvm"

if use_gpu:
    tgt = tvm.target.cuda("llvm device=0")
else:
    tgt = "llvm"

Dimension = 16
Batch_size = 128 * 1000 * 1000
X = tvm.placeholder((Batch_size, Dimension), name="X", dtype="float32")
Y = tvm.placeholder((Batch_size, Dimension), name="Y", dtype="float32")

Result = topi.sum(topi.power((X - Y), 2), 1) / Dimension
if use_gpu:
    with tvm.target.cuda():
        s = topi.generic.schedule_reduce(Result)
else:
    s = tvm.create_schedule(Result.op)
f = tvm.build(s, [X, Y, Result], tgt, target_host='llvm', name="my_func")

x_np = np.random.uniform(size=(Batch_size, Dimension)).astype(X.dtype)
y_np = np.random.uniform(size=(Batch_size, Dimension)).astype(Y.dtype)

startTime = time.time()

if use_gpu:
    ctx = tvm.gpu(0)
else:
    ctx = tvm.cpu(0)


x_nd = tvm.nd.array(x_np, ctx)
y_nd = tvm.nd.array(y_np, ctx)
c_nd = tvm.nd.array(np.zeros((Batch_size,), dtype=Result.dtype), ctx)

f(x_nd, y_nd, c_nd)

c_nd.asnumpy()

total_time = time.time() - startTime
print("Total time: {} secs".format(total_time))