Thanks for raising this. The current solution for Concat is not ideal due to its recursive nature, and it may result in a stack overflow if the number of inputs is large. I saw a repeating stack trace pattern like the following, roughly once per input:
#166 0x000000000c8f077e in std::function<HalideIR::Internal::Stmt (HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const (this=0x7ffff211a420, __args#0=0x7fff3700f020, __args#1=..., __args#2=0x7fff745ea540) at ../libgcc/include/c++/7.3.0/bits/std_function.h:706
#167 0x000000000c8ec4f7 in tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::LetStmt>(std::function<HalideIR::Internal::Stmt (HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::{lambda(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)#1}::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const (this=0x7ffff211a420, n=..., args#0=..., args#1=0x7fff745ea540) at tvm/tvm/3rdparty/HalideIR/src/tvm/node/ir_functor.h:108
#168 0x000000000c8f9ef3 in std::_Function_handler<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*), tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::LetStmt>(std::function<HalideIR::Internal::Stmt (HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::{lambda(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)#1}>::_M_invoke(std::_Any_data const&, tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*&&) (__functor=..., __args#0=..., __args#1=..., __args#2=@0x7fff74502f18: 0x7fff745ea540) at ../libgcc/include/c++/7.3.0/bits/std_function.h:302
#169 0x000000000c636b74 in std::function<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const (this=0x7ffff2031ba0, __args#0=..., __args#1=..., __args#2=0x7fff745ea540) at ../libgcc/include/c++/7.3.0/bits/std_function.h:706
#170 0x000000000c63661d in tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const (this=0x2a97f0b0 <tvm::ir::IRMutator::vtable_stmt()::inst>, n=..., args#0=..., args#1=0x7fff745ea540) at tvm/tvm/3rdparty/HalideIR/src/tvm/node/ir_functor.h:76
#171 0x000000000c634745 in tvm::ir::IRMutator::Mutate (this=0x7fff745ea540, stmt=...) at tvm/tvm/include/tvm/ir_mutator.h:44
#172 0x000000000ca52fa8 in tvm::ir::IRUseDefAnalysis::Mutate_ (this=0x7fff745ea540, op=0x7fff3700f050, s=...) at tvm/tvm/src/pass/split_host_device.cc:53
#173 0x0000000012e296ee in tvm::ir::<lambda(const HalideIR::Internal::LetStmt*, const HalideIR::Internal::Stmt&, tvm::ir::IRMutator*)>::operator()(const HalideIR::Internal::LetStmt *, const HalideIR::Internal::Stmt &, tvm::ir::IRMutator *) const (__closure=0x7ffff211a420, op=0x7fff3700f050, s=..., m=0x7fff745ea540) at tvm/tvm/src/pass/ir_mutator.cc:310
#174 0x0000000012e2eb47 in std::_Function_handler<HalideIR::Internal::Stmt(const HalideIR::Internal::LetStmt*, const HalideIR::Internal::Stmt&, tvm::ir::IRMutator*), tvm::ir::<lambda(const HalideIR::Internal::LetStmt*, const HalideIR::Internal::Stmt&, tvm::ir::IRMutator*)> >::_M_invoke(const std::_Any_data &, const HalideIR::Internal::LetStmt *&&, const HalideIR::Internal::Stmt &, tvm::ir::IRMutator *&&) (__functor=..., __args#0=@0x7fff74503158: 0x7fff3700f050, __args#1=..., __args#2=@0x7fff74503148: 0x7fff745ea540) at ../libgcc/include/c++/7.3.0/bits/std_function.h:302
#175 0x000000000c8f077e in std::function<HalideIR::Internal::Stmt (HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(HalideIR::Internal::LetStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const (this=0x7ffff211a420, __args#0=0x7fff3700f050, __args#1=..., __args#2=0x7fff745ea540) at ../libgcc/include/c++/7.3.0/bits/std_function.h:706
Mark concat as opaque and directly generate code that copies into the target region
That seems like a good candidate solution to me.