Hi
I’m currently writing a program to test TVM’s performance on GPU and here is my code.
import tvm
import topi
import numpy as np
import time
tgt_host = "llvm"
tgt = tvm.target.cuda("llvm device=0")
Input = 16
Batch_size = 128 * 1000 * 1000
Output = 1
X = tvm.placeholder((Batch_size, Input), name="X")
W = tvm.placeholder((Output, Input), name="W")
B = tvm.placeholder((Output,), name="B")
C = topi.nn.dense(X, W, B)
with tvm.target.cuda():
s = topi.generic.schedule_dense(C)
fadd = tvm.build(s, [X, W, B, C], tgt, target_host = "llvm", name="percp")
x_np = np.random.uniform(size=(Batch_size, Input)).astype(X.dtype)
w_np = np.random.uniform(size=(Output, Input)).astype(W.dtype)
b_np = np.random.uniform(size=(Output,)).astype(B.dtype)
startTime = time.time()
ctx = tvm.gpu(0)
a_nd = tvm.nd.array(
x_np,
ctx
)
w_nd = tvm.nd.array(
w_np,
ctx
)
b_nd = tvm.nd.array(
b_np,
ctx
)
c_nd = tvm.nd.array(
np.zeros((Batch_size, 1), dtype=B.dtype),
ctx
)
load_time = time.time() - startTime
print("Loading time: {} secs".format(load_time))
fadd(a_nd, w_nd, b_nd, c_nd)
print(c_nd.asnumpy())
total_time = time.time() - startTime
print("Total time: {} secs".format(total_time))
However, it crashed when batch_size became bigger than 10000 (or a certain number).
Traceback (most recent call last):
File "perceptron.py", line 58, in <module>
fadd(a_nd, w_nd, b_nd, c_nd)
File "/usr/tvm/python/tvm/_ffi/function.py", line 127, in __call__
return f(*args)
File "/usr/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__
raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (3) /usr/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7ff260ef2321]
[bt] (2) /usr/tvm/build/libtvm.so(+0x95608c) [0x7ff260f6208c]
[bt] (1) /usr/tvm/build/libtvm.so(+0x955b12) [0x7ff260f61b12]
[bt] (0) /usr/tvm/build/libtvm.so(+0x1443b2) [0x7ff2607503b2]
File "/usr/tvm/src/runtime/cuda/cuda_module.cc", line 196
File "/usr/tvm/src/runtime/module_util.cc", line 54
TVMError: Check failed: ret == 0 (-1 vs. 0) : CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
grid=(1,128000000,1), block=(64,1,1)
// func_name=percp_kernel0
// CUDA Source
// -----------
extern "C" __global__ void percp_kernel0( float* __restrict__ X, float* __restrict__ W, float* __restrict__ compute, float* __restrict__ B) {
float compute_rf[1];
__shared__ float red_buf0[64];
__shared__ float compute1[1];
compute_rf[0] = 0.000000e+00f;
if (((int)threadIdx.x) < 16) {
compute_rf[0] = (compute_rf[0] + (X[((((int)blockIdx.y) * 16) + ((int)threadIdx.x))] * W[((int)threadIdx.x)]));
}
__syncthreads();
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = compute_rf[0];
__syncthreads();
if (((int)threadIdx.x) < 32) {
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 32)]);
}
__syncthreads();
if (((int)threadIdx.x) < 16) {
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 16)]);
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 8)]);
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 4)]);
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 2)]);
((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] = (((volatile __shared__ float*)red_buf0)[((int)threadIdx.x)] + ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.x) + 1)]);
}
__syncthreads();
if (((int)threadIdx.x) == 0) {
compute1[0] = ((volatile __shared__ float*)red_buf0)[0];
}
if (((int)threadIdx.x) == 0) {
compute[((int)blockIdx.y)] = (compute1[0] + B[0]);
}
}
The program is running on a Nvidia-docker with tvm 0.6.
My cuda version is 10.0
Thanks