[TVM] Failed to tensorize a 2-D compute after fuse & split schedule

Hi all,
I am trying to tensorize the inner iteration of a 2-D relu compute, but it fails the dimension check.
Here is my code:

import tvm

def test_relu():
    n = 128
    factor = 64
    A = tvm.placeholder((n, n), name='A')
    C = tvm.compute(A.shape, lambda *i: tvm.max(A(*i), tvm.const(0, A.dtype)), name='C')
    s = tvm.create_schedule(C.op)

    A_cache = s.cache_read(A, 'shared', readers=[C])
    C_cache = s.cache_write(C, 'shared')
    print("1st source code")
    print(tvm.lower(s, [A, C], simple_mode = True))

    # fuse all axes to handle the n-dim input.
    fused_axis = s[C_cache].fuse(*list(s[C_cache].op.axis))

    print("2nd source code")
    print(tvm.lower(s, [A, C], simple_mode = True))

    C_tmp_iter, Ci_iter = s[C_cache].split(fused_axis, factor = factor)

    print("3rd source code")
    print(tvm.lower(s, [A, C], simple_mode = True))

    def intrin_relu(m):
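        # 1-D relu compute of length m; this is the pattern tensorize will match against.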
        a = tvm.placeholder((m, ), name='a')
        b = tvm.compute(a.shape, lambda *i: tvm.max(a(*i), tvm.const(0, a.dtype)), name='b')

        a_buffer = tvm.decl_buffer(a.shape, a.dtype, scope = 'shared')
        b_buffer = tvm.decl_buffer(b.shape, b.dtype, scope = 'shared')

        def intrin_func(ins, outs):
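            # Lower the matched region to an extern call: relu(dst, src, length).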
            ib = tvm.ir_builder.create()
            aa = ins[0]
            bb = outs[0]
            ib.emit(tvm.call_extern(a.dtype, "relu",
                                    bb.access_ptr('w'),
                                    aa.access_ptr('r'),
                                    m))
            return ib.get()

        with tvm.build_config(offset_factor=1):
            return tvm.decl_tensor_intrin(b.op, intrin_func,
                                          binds={a: a_buffer, b: b_buffer})

    vrelu = intrin_relu(factor)
    s[C_cache].tensorize(Ci_iter, vrelu)

    print("4th source code")
    print(tvm.lower(s, [A, C], simple_mode = True))

And the console output is:

1st source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
  for (ax0, 0, 128) {
    for (ax1, 0, 128) {
      A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
    }
  }
}
produce C.shared {
  for (i0.c, 0, 128) {
    for (i1.c, 0, 128) {
      A.shared[((i0.c*128) + i1.c)] = max(A.shared[((i0.c*128) + i1.c)], 0.000000f)
    }
  }
}
produce C {
  for (i0, 0, 128) {
    for (i1, 0, 128) {
      C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
    }
  }
}

2nd source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
  for (ax0, 0, 128) {
    for (ax1, 0, 128) {
      A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
    }
  }
}
produce C.shared {
  for (i0.c.i1.c.fused, 0, 16384) {
    A.shared[i0.c.i1.c.fused] = max(A.shared[i0.c.i1.c.fused], 0.000000f)
  }
}
produce C {
  for (i0, 0, 128) {
    for (i1, 0, 128) {
      C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
    }
  }
}

3rd source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
  for (ax0, 0, 128) {
    for (ax1, 0, 128) {
      A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
    }
  }
}
produce C.shared {
  for (i0.c.i1.c.fused.outer, 0, 256) {
    for (i0.c.i1.c.fused.inner, 0, 64) {
      A.shared[((i0.c.i1.c.fused.outer*64) + i0.c.i1.c.fused.inner)] = max(A.shared[((i0.c.i1.c.fused.outer*64) + i0.c.i1.c.fused.inner)], 0.000000f)
    }
  }
}
produce C {
  for (i0, 0, 128) {
    for (i1, 0, 128) {
      C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
    }
  }
}

4th source code
[18:41:09] /home/***/github/tvm/src/schedule/message_passing.cc:257: use fallback inference rule in fuse
Traceback (most recent call last):
  File "split.py", line 111, in <module>
    test_relu()
  File "split.py", line 53, in test_relu
    print(tvm.lower(s, [A, C], simple_mode = True))
  File "/home/***/github/tvm/python/tvm/build_module.py", line 359, in lower
    stmt = form_body(sch)
  File "/home/***/github/tvm/python/tvm/build_module.py", line 309, in form_body
    stmt = schedule.ScheduleOps(sch, bounds)
  File "/home/***/github/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (8) /home/***/github/tvm/build/libtvm.so(+0x161ea2) [0x7f53dc83dea2]
  [bt] (7) /home/***/github/tvm/build/libtvm.so(+0x432b6e) [0x7f53dcb0eb6e]
  [bt] (6) /home/***/github/tvm/build/libtvm.so(+0x430176) [0x7f53dcb0c176]
  [bt] (5) /home/***/github/tvm/build/libtvm.so(tvm::ComputeOpNode::BuildProvide(tvm::Stage const&, std::unordered_map<tvm::IterVar, tvm::Range, std::hash<tvm::IterVar>, std::equal_to<tvm::IterVar>, std::allocator<std::pair<tvm::IterVar const, tvm::Range> > > const&, bool) const+0x15d) [0x7f53dcacd1bd]
  [bt] (4) /home/***/github/tvm/build/libtvm.so(+0x3de16c) [0x7f53dcaba16c]
  [bt] (3) /home/***/github/tvm/build/libtvm.so(+0x3db4c9) [0x7f53dcab74c9]
  [bt] (2) /home/***/github/tvm/build/libtvm.so(+0x3daf16) [0x7f53dcab6f16]
  [bt] (1) /home/***/github/tvm/build/libtvm.so(+0x3e67e4) [0x7f53dcac27e4]
  [bt] (0) /home/***/github/tvm/build/libtvm.so(+0x134212) [0x7f53dc810212]
  File "/home/***/github/tvm/src/op/tensorize.cc", line 204
TVMError: Check failed: is_one(e.region[i]->extent): Tensorize tensor_intrin: Input dimension mismatch with tensor intrin  expected shape=[64], given region=[range(min=0, ext=128), range(min=0, ext=128)]

The error occurred as:

Check failed: is_one(e.region[i]->extent): Tensorize tensor_intrin: Input dimension mismatch with tensor intrin  expected shape=[64], given region=[range(min=0, ext=128), range(min=0, ext=128)]

I have fused and split C_cache’s axes into [256, 64], so why does tensorize still consider its axes to be [128, 128]?
This has confused me all day. Thanks for any help!

:sob: Can anyone help me? I appreciate your time.

Maybe you can try splitting s[C_cache].op.axis[1] first, then tensorize on the inner iteration, like this:

co_iter, ci_iter = s[C_cache].split(s[C_cache].op.axis[1], factor = factor)
......
s[C_cache].tensorize(ci_iter, vrelu)

After that you can try fusing the two outer iterations; I’m not sure whether it works:

s[C_cache].fuse(s[C_cache].op.axis[0], co_iter)
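
Putting both suggestions together, a minimal untested sketch (reusing your vrelu and factor):

co_iter, ci_iter = s[C_cache].split(s[C_cache].op.axis[1], factor = factor)
# tensorize only the innermost, contiguous factor-wide iteration
s[C_cache].tensorize(ci_iter, vrelu)
# then, optionally, fuse the two outer loops; I'm not sure the pattern matcher still accepts this
s[C_cache].fuse(s[C_cache].op.axis[0], co_iter)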

The problem itself may be due to the tensorize pattern matcher; I’m looking forward to an answer as well.

Thanks for your reply.
What I actually want to do is use a 1-D function

relu(dst, src, length)

to handle an n-D input tensor. But the length is limited by the RAM size, so I want to fuse the input tensor's axes, split them into a [nparts, factor] shape, and then tensorize the inner iteration, like below:

produce C.shared {
  for (i0.c, 0, nparts) {
    for (i1.c, 0, factor) {
      A.shared[((i0.c*factor) + i1.c)] = max(A.shared[((i0.c*factor) + i1.c)], 0.000000f)
    }
  }
}

After tensorize, it should become:

produce C.shared {
  for (i0.c, 0, nparts) {
    relu(A.shared, A.shared, factor)
  }
}
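
In schedule terms, this is the order I’m attempting (condensed from my code above, and exactly the part that currently fails):

fused_axis = s[C_cache].fuse(*s[C_cache].op.axis)                    # 128 x 128 -> 16384
C_tmp_iter, Ci_iter = s[C_cache].split(fused_axis, factor = factor)  # -> [nparts = 256, factor = 64]
s[C_cache].tensorize(Ci_iter, vrelu)                                 # hoped for: one relu call per 64-element tile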