[CUDA] How to compile 50000 cuda kernels?

TVM seems to fail when the number of CUDA kernels to build in a single module is very large.

Minimal reproducible example:

import tvm
from tvm.contrib import cc, tar
import numpy as _np
import topi


m = tvm.var('m')
n = tvm.var('n')
dtype = 'float32'
ctx = tvm.context('cuda', 0)
num_thread = 64


def add_gpu():
    a = tvm.placeholder((m, n), name='a', dtype=dtype)
    b = tvm.placeholder((m, n), name='b', dtype=dtype)
    c = tvm.compute((m, n), lambda i, j: a[i, j] + b[i, j], name='c')
    s = tvm.create_schedule(c.op)
    c_list = [c]
    for t in c_list:
        block_x = tvm.thread_axis("blockIdx.x")
        thread_x = tvm.thread_axis("threadIdx.x")
        axes = [axis for axis in t.op.axis]
        fused = s[t].fuse(*axes)
        bx, tx = s[t].split(fused, factor=num_thread)
        s[t].bind(bx, block_x)
        s[t].bind(tx, thread_x)
    return s, [a, b, c]


s, args = add_gpu()
stmt = tvm.lower(s, args, simple_mode=True, name='add_cpu')
print(stmt)

print('build 1 time...')
func_list_cuda = [tvm.lower(s, args, name='add_gpu')]
lowered_funcs = {"cuda": func_list_cuda}
func_binary_cuda = tvm.build(lowered_funcs, name='lib')
# test
a = tvm.nd.array(_np.array([[1, 2], [3, 4]], dtype=dtype), ctx=ctx)
b = tvm.nd.array(_np.array([[5, 6], [7, 8]], dtype=dtype), ctx=ctx)
c = tvm.nd.array(_np.zeros((2, 2), dtype=dtype), ctx=ctx)
func_binary_cuda['add_gpu'](a, b, c)
print(c)

print('build 50000 times...')
print('lower...')
func_list_cuda = [tvm.lower(s, args, name='add_cpu_{}'.format(i)) for i in range(50000)]
lowered_funcs = {"cuda": func_list_cuda}
print('build...')
func_binary_cuda = tvm.build(lowered_funcs, name='lib')

The output is:

produce c {
  // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = floordiv(((m*n) + 63), 64)
  // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 64
  if (likely((floordiv(((blockIdx.x*64) + threadIdx.x), n) < m))) {
    if (likely((floormod(((blockIdx.x*64) + threadIdx.x), n) < n))) {
      if (likely((((blockIdx.x*64) + threadIdx.x) < (m*n)))) {
        if (likely((0 <= floordiv(((blockIdx.x*64) + threadIdx.x), n)))) {
          if (likely((floordiv(((blockIdx.x*64) + threadIdx.x), n) < m))) {
            if (likely((0 <= floormod(((blockIdx.x*64) + threadIdx.x), n)))) {
              if (likely((floormod(((blockIdx.x*64) + threadIdx.x), n) < n))) {
                c[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))] = (a[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))] + b[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))])
              }
            }
          }
        }
      }
    }
  }
}

build 1 time...
[[ 6.  8.]
 [10. 12.]]
build 50000 times...
lower...
build...
Traceback (most recent call last):

  File "test.py", line 57, in <module>
    func_binary_cuda = tvm.build(lowered_funcs, name='lib')

  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/build_module.py", line 637, in build
    fhost, mdev = _build_for_device(flist, tar, target_host)

  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/build_module.py", line 503, in _build_for_device
    mdev = codegen.build_module(fdevice, str(target)) if fdevice else None

  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/codegen.py", line 36, in build_module
    return _Build(lowered_func, target)

  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 207, in __call__
    raise get_last_ffi_error()

tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (5) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(TVMFuncCall+0x61) [0x7f0107e0c071]
  [bt] (4) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(+0xa81b0e) [0x7f0107650b0e]
  [bt] (3) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::codegen::Build(tvm::Array<tvm::LoweredFunc, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1e3) [0x7f0107769683]
  [bt] (2) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::Array<tvm::LoweredFunc, void>)>::AssignTypedLambda<tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>)>(tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x4e) [0x7f010779630e]
  [bt] (1) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::codegen::BuildCUDA(tvm::Array<tvm::LoweredFunc, void>)+0x402) [0x7f0107dad622]
  [bt] (0) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(+0x12387bb) [0x7f0107e077bb]
  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 72, in cfun
    rv = local_pyfunc(*pyargs)
  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/autotvm/measure/measure_methods.py", line 585, in tvm_callback_cuda_compile
    ptx = nvcc.compile_cuda(code, target="ptx", arch=AutotvmGlobalScope.current.cuda_target_arch)
  File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/contrib/nvcc.py", line 98, in compile_cuda
    raise RuntimeError(msg)
RuntimeError: Compilation error:
<unnamed>: parse Invalid record

Can we compile the kernels one at a time (kernel by kernel) instead of in one giant module?