Dynamic reduce sizes in CUDA code

I want to use dynamic reduce sizes in a tensor expression. The code seems to build and lower fine, but when I try to generate CUDA code I get the following error:

Not all Vars are passed in api_args: 'a_lu' 'a_lb' 'b_lu' 'b_lb' does not appeared in api_args

I have no idea what I am doing wrong here; I continue to find TVM's error messages beyond perplexing.

import tvm

def dia():
    batch = tvm.var("batch")
    size = 100
    #offsets = tvm.placeholder((4,), dtype="int32", name="offsets")
    # The band widths are runtime Vars; these are the sizes I want to be dynamic.
    a_lu, a_lb, b_lu, b_lb = tvm.var("a_lu"), tvm.var("a_lb"), tvm.var("b_lu"), tvm.var("b_lb")
    #a_lu, a_lb, b_lu, b_lb = 1, 1, 1, 1
    result_lu = a_lu + b_lu
    result_lb = a_lb + b_lb
    self_width = a_lu + a_lb + 1
    b_width = b_lu + b_lb + 1
    out_width = result_lu + result_lb + 1
    # Reduction axis whose extent depends on the dynamic band widths.
    k = tvm.reduce_axis((0, self_width), name="k")
    val = 0.0
    A = tvm.placeholder((batch, size, self_width), name="A")
    B = tvm.placeholder((batch, size, b_width), name="B")

    # Scatter the banded B into a dense (size, size) matrix, padded with val.
    B_pad = tvm.compute(
        (batch, size, size),
        lambda batch, k, j: tvm.if_then_else(
            tvm.any(k - j < b_lb, j - k > b_lu),
            val, B[batch, k, k - j - b_lb]),
        name="B_pad")
    C = tvm.compute(
        (batch, size, out_width),
        lambda batch, i, j: tvm.sum(
            A[batch, i, k] * B_pad[batch, k, j], axis=[k]),
        name="Out")

    s = tvm.create_schedule(C.op)
    ...
s, args = dia()
tvm.lower(s, args, simple_mode=True)
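
Reading the error literally, my best guess is that the four width Vars have to be passed as scalar arguments themselves, since they only ever appear inside compound shape expressions like a_lu + a_lb + 1 and presumably cannot be bound from the buffer shapes alone. A sketch of what I mean (the extra scalar arguments are an assumption on my part, not something I found documented):

def dia():
    ...
    # expose the free width Vars as explicit scalar arguments (my guess)
    return s, [A, B, C, a_lu, a_lb, b_lu, b_lb]

s, args = dia()
fcuda = tvm.build(s, args, target="cuda")  # the call that triggers the error without them

Is that the intended way to handle free Vars, or is there a way to recover them from the shapes?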

Lowered code

produce Out {
  // attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = batch
  // attr [A.shared] storage_scope = "shared"
  allocate A.shared[float32 * 256]
  // attr [B_pad] storage_scope = "local"
  allocate B_pad[float32 * 1]
  // attr [B_pad.shared] storage_scope = "shared"
  allocate B_pad.shared[float32 * 256]
  // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = (((((a_lu + b_lu) + a_lb) + b_lb)/16) + 1)
  // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 7
  // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 16
  // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
  if (likely(((blockIdx.x*16) < (100 - threadIdx.x)))) {
    if (likely(((blockIdx.y*16) < (((((a_lu + b_lu) + a_lb) + b_lb) - threadIdx.y) + 1)))) {
      Out[(((blockIdx.y*16) + ((((blockIdx.z*100) + (blockIdx.x*16)) + threadIdx.x)*((((a_lu + b_lu) + a_lb) + b_lb) + 1))) + threadIdx.y)] = 0.000000f
    }
  }
  for (k.outer, 0, (((a_lu + a_lb)/16) + 1)) {
    produce A.shared {
      if (likely(((blockIdx.x*16) < (100 - threadIdx.x)))) {
        if (likely(((k.outer*16) < (((a_lu + a_lb) - threadIdx.y) + 1)))) {
          A.shared[((threadIdx.x*16) + threadIdx.y)] = A[(((k.outer*16) + ((((blockIdx.z*100) + (blockIdx.x*16)) + threadIdx.x)*((a_lu + a_lb) + 1))) + threadIdx.y)]
        }
      }
    }
    produce B_pad {
      if (likely(((k.outer*16) < (100 - threadIdx.x)))) {
        if (likely(((blockIdx.y*16) < (100 - threadIdx.y)))) {
          B_pad[0] = tvm_if_then_else((((k.outer*16) < (((b_lb + (blockIdx.y*16)) + threadIdx.y) - threadIdx.x)) || (b_lu < ((((blockIdx.y*16) + threadIdx.y) - threadIdx.x) - (k.outer*16)))), 0.000000f, B[((((((k.outer*16) + ((((blockIdx.z*100) + (k.outer*16)) + threadIdx.x)*((b_lu + b_lb) + 1))) + threadIdx.x) - threadIdx.y) - b_lb) - (blockIdx.y*16))])
        }
      }
    }
    produce B_pad.shared {
      if (likely(((k.outer*16) < (100 - threadIdx.x)))) {
        if (likely(((blockIdx.y*16) < (100 - threadIdx.y)))) {
          B_pad.shared[((threadIdx.x*16) + threadIdx.y)] = B_pad[0]
        }
      }
    }
    for (k.inner, 0, 16) {
      if (likely(((blockIdx.x*16) < (100 - threadIdx.x)))) {
        if (likely(((blockIdx.y*16) < (((((a_lu + b_lu) + a_lb) + b_lb) - threadIdx.y) + 1)))) {
          if (likely(((k.outer*16) < (((a_lu + a_lb) - k.inner) + 1)))) {
            Out[(((blockIdx.y*16) + ((((blockIdx.z*100) + (blockIdx.x*16)) + threadIdx.x)*((((a_lu + b_lu) + a_lb) + b_lb) + 1))) + threadIdx.y)] = (Out[(((blockIdx.y*16) + ((((blockIdx.z*100) + (blockIdx.x*16)) + threadIdx.x)*((((a_lu + b_lu) + a_lb) + b_lb) + 1))) + threadIdx.y)] + (A.shared[((threadIdx.x*16) + k.inner)]*B_pad.shared[((k.inner*16) + threadIdx.y)]))
          }
        }
      }
    }
  }
}

Error

TVMError: Traceback (most recent call last):
  [bt] (3) /tvm/build/libtvm.so(TVMFuncCall+0x65) [0x7f7a1e3b9525]
  [bt] (2) /tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), void tvm::runtime::TypedPackedFunc<tvm::LoweredFunc (HalideIR::Internal::Stmt, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::Array<tvm::NodeRef, void>, int, bool)>::AssignTypedLambda<tvm::LoweredFunc (*)(HalideIR::Internal::Stmt, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::Array<tvm::NodeRef, void>, int, bool)>(tvm::LoweredFunc (*)(HalideIR::Internal::Stmt, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::Array<tvm::NodeRef, void>, int, bool))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x117) [0x7f7a1dbe7a87]
  [bt] (1) /tvm/build/libtvm.so(tvm::ir::MakeAPI(HalideIR::Internal::Stmt, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::Array<tvm::NodeRef, void>, int, bool)+0x33a6) [0x7f7a1de4aee6]
  [bt] (0) /tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x43) [0x7f7a1dba4493]
  File "/content/gdrive/My Drive/tvm/src/pass/make_api.cc", line 188

TVMError: Not all Vars are passed in api_args:  'a_lu'  'a_lb'  'b_lu'  'b_lb'  does not appeared in api_args
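
If it helps narrow things down, I would expect even this tiny program (no reduction, no CUDA target) to hit the same check, since n and m only occur inside the compound shape n + m:

import tvm

n, m = tvm.var("n"), tvm.var("m")
# n and m only appear inside the compound shape expression n + m, so
# (I assume) they cannot be recovered from the buffer shape by MakeAPI.
A = tvm.placeholder((n + m,), name="A")
B = tvm.compute((n + m,), lambda i: A[i] + 1.0, name="B")
s = tvm.create_schedule(B.op)

tvm.lower(s, [A, B])        # should raise the same api_args error
tvm.lower(s, [A, B, n, m])  # guess: listing the Vars explicitly avoids it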