Hi, all.
I am trying to tensorize the inner iteration of a 2-dimensional ReLU compute, but it fails the dimension check.
Here is my code:
def test_relu():
    """Reproduce a tensorize dimension-check failure on a fused-and-split 2-D ReLU.

    Builds C = max(A, 0) over a (128, 128) input, routes it through shared-memory
    caches, fuses then splits the cache-write axes, and tries to tensorize the
    inner split axis with a 1-D extern "relu" intrinsic of length `factor`.
    """
    n = 128
    factor = 64

    # 2-D elementwise ReLU: C = max(A, 0).
    A = tvm.placeholder((n, n), name='A')
    C = tvm.compute(A.shape, lambda *i: tvm.max(A(*i), tvm.const(0, A.dtype)), name='C')

    s = tvm.create_schedule(C.op)
    A_cache = s.cache_read(A, 'shared', readers=[C])
    C_cache = s.cache_write(C, 'shared')

    print("1st source code")
    print(tvm.lower(s, [A, C], simple_mode=True))

    # Fuse all axes of the cache-write stage so the same schedule can
    # handle n-dimensional input uniformly.
    fused_axis = s[C_cache].fuse(*list(s[C_cache].op.axis))
    print("2nd source code")
    print(tvm.lower(s, [A, C], simple_mode=True))

    # Split the fused axis; the inner part (extent = factor) is the
    # candidate loop for tensorization.
    C_tmp_iter, Ci_iter = s[C_cache].split(fused_axis, factor=factor)
    print("3rd source code")
    print(tvm.lower(s, [A, C], simple_mode=True))

    def intrin_relu(m):
        # Declare a 1-D ReLU tensor intrinsic over a length-m vector,
        # lowered to a single extern call: relu(dst, src, m).
        a = tvm.placeholder((m,), name='a')
        b = tvm.compute(a.shape, lambda *i: tvm.max(a(*i), tvm.const(0, a.dtype)), name='b')
        a_buffer = tvm.decl_buffer(a.shape, a.dtype, scope='shared')
        b_buffer = tvm.decl_buffer(b.shape, b.dtype, scope='shared')

        def intrin_func(ins, outs):
            ib = tvm.ir_builder.create()
            aa = ins[0]
            bb = outs[0]
            ib.emit(tvm.call_extern(a.dtype, "relu",
                                    bb.access_ptr('w'),
                                    aa.access_ptr('r'),
                                    m))
            return ib.get()

        with tvm.build_config(offset_factor=1):
            return tvm.decl_tensor_intrin(b.op, intrin_func,
                                          binds={a: a_buffer, b: b_buffer})

    vrelu = intrin_relu(factor)
    # This is the step the post reports as failing: tensorize expects the
    # intrinsic's input region to have shape [factor], but bound inference
    # (see the "fallback inference rule in fuse" warning) reports the full
    # [128, 128] region for A.shared.
    s[C_cache].tensorize(Ci_iter, vrelu)
    print("4th source code")
    print(tvm.lower(s, [A, C], simple_mode=True))
And the console output is:
1st source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
for (ax0, 0, 128) {
for (ax1, 0, 128) {
A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
}
}
}
produce C.shared {
for (i0.c, 0, 128) {
for (i1.c, 0, 128) {
A.shared[((i0.c*128) + i1.c)] = max(A.shared[((i0.c*128) + i1.c)], 0.000000f)
}
}
}
produce C {
for (i0, 0, 128) {
for (i1, 0, 128) {
C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
}
}
}
2nd source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
for (ax0, 0, 128) {
for (ax1, 0, 128) {
A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
}
}
}
produce C.shared {
for (i0.c.i1.c.fused, 0, 16384) {
A.shared[i0.c.i1.c.fused] = max(A.shared[i0.c.i1.c.fused], 0.000000f)
}
}
produce C {
for (i0, 0, 128) {
for (i1, 0, 128) {
C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
}
}
}
3rd source code
// attr [A.shared] storage_scope = "shared"
allocate A.shared[float32 * 16384]
produce A.shared {
for (ax0, 0, 128) {
for (ax1, 0, 128) {
A.shared[((ax0*128) + ax1)] = A[((ax0*128) + ax1)]
}
}
}
produce C.shared {
for (i0.c.i1.c.fused.outer, 0, 256) {
for (i0.c.i1.c.fused.inner, 0, 64) {
A.shared[((i0.c.i1.c.fused.outer*64) + i0.c.i1.c.fused.inner)] = max(A.shared[((i0.c.i1.c.fused.outer*64) + i0.c.i1.c.fused.inner)], 0.000000f)
}
}
}
produce C {
for (i0, 0, 128) {
for (i1, 0, 128) {
C[((i0*128) + i1)] = A.shared[((i0*128) + i1)]
}
}
}
4th source code
[18:41:09] /home/***/github/tvm/src/schedule/message_passing.cc:257: use fallback inference rule in fuse
Traceback (most recent call last):
File "split.py", line 111, in <module>
test_relu()
File "split.py", line 53, in test_relu
print(tvm.lower(s, [A, C], simple_mode = True))
File "/home/***/github/tvm/python/tvm/build_module.py", line 359, in lower
stmt = form_body(sch)
File "/home/***/github/tvm/python/tvm/build_module.py", line 309, in form_body
stmt = schedule.ScheduleOps(sch, bounds)
File "/home/***/github/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__
raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (8) /home/***/github/tvm/build/libtvm.so(+0x161ea2) [0x7f53dc83dea2]
[bt] (7) /home/***/github/tvm/build/libtvm.so(+0x432b6e) [0x7f53dcb0eb6e]
[bt] (6) /home/***/github/tvm/build/libtvm.so(+0x430176) [0x7f53dcb0c176]
[bt] (5) /home/***/github/tvm/build/libtvm.so(tvm::ComputeOpNode::BuildProvide(tvm::Stage const&, std::unordered_map<tvm::IterVar, tvm::Range, std::hash<tvm::IterVar>, std::equal_to<tvm::IterVar>, std::allocator<std::pair<tvm::IterVar const, tvm::Range> > > const&, bool) const+0x15d) [0x7f53dcacd1bd]
[bt] (4) /home/***/github/tvm/build/libtvm.so(+0x3de16c) [0x7f53dcaba16c]
[bt] (3) /home/***/github/tvm/build/libtvm.so(+0x3db4c9) [0x7f53dcab74c9]
[bt] (2) /home/***/github/tvm/build/libtvm.so(+0x3daf16) [0x7f53dcab6f16]
[bt] (1) /home/***/github/tvm/build/libtvm.so(+0x3e67e4) [0x7f53dcac27e4]
[bt] (0) /home/***/github/tvm/build/libtvm.so(+0x134212) [0x7f53dc810212]
File "/home/***/github/tvm/src/op/tensorize.cc", line 204
TVMError: Check failed: is_one(e.region[i]->extent): Tensorize tensor_intrin: Input dimension mismatch with tensor intrin expected shape=[64], given region=[range(min=0, ext=128), range(min=0, ext=128)]
The error occurred as:
Check failed: is_one(e.region[i]->extent): Tensorize tensor_intrin: Input dimension mismatch with tensor intrin expected shape=[64], given region=[range(min=0, ext=128), range(min=0, ext=128)]
I have fused and split C_cache's axes into [256, 64], so why does tensorize consider its axes to be [128, 128]?
This has confused me all day — thanks for any help!