# Want to write larger intermediate tensors for SIMD purposes

#1

Hello, I have a tensor with non-power-of-two sizes, say (7, 7), which is awkward to vectorize with SIMD. I first serialize it to a tensor of size 49, then compute an intermediate tensor of size 56, which is SIMD-friendly, and finally move the first 49 elements to the destination tensor. I do all of this in a new TOPI operator. However, I still see `if (likely(...))` guards in the inner loop, and the generated code is not in SIMD form.
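As a plain-Python model of what I am trying to do (the doubling stands in for the real compute body; names are illustrative, not TVM API):

```python
VEC = 8                                  # assumed SIMD vector width
src = [float(i) for i in range(49)]      # serialized (7, 7) tensor

# Pad up to the next multiple of the vector width: 49 -> 56.
padded = src + [0.0] * (VEC - len(src) % VEC)
assert len(padded) % VEC == 0            # 56 elements: 7 full 8-wide vectors

# The body can now run vector-by-vector with no tail guard.
acc = [x * 2.0 for x in padded]

# Copy only the first 49 elements back to the destination tensor.
dst = acc[:49]
```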

The lowered function is:

``````
// attr [compute] storage_scope = "global"
allocate compute[float32 * 47040]
// attr [compute] storage_scope = "global"
allocate compute[float32 * 49]
produce compute {
  for (c, 0, 960) {
    for (idx, 0, 49) {
      compute[((c*49) + idx)] = placeholder[((c*49) + idx)]
    }
  }
}
produce compute {
  for (n.c.fused, 0, 1280) {
    produce compute {
      for (idx.outer.init, 0, 7) {
        for (idx.inner.init.s, 0, 8) {
          if (likely((((idx.outer.init*8) + idx.inner.init.s) < 49))) {
            compute[((idx.outer.init*8) + idx.inner.init.s)] = 0f
          }
        }
      }
      for (elem_idx, 0, (placeholder[(n.c.fused + 1)] - placeholder[n.c.fused])) {
        for (idx.outer, 0, 7) {
          for (idx.inner.s, 0, 8) {
            if (likely((((idx.outer*8) + idx.inner.s) < 49))) {
              compute[((idx.outer*8) + idx.inner.s)] = (compute[((idx.outer*8) + idx.inner.s)] + (compute[(((placeholder[(placeholder[n.c.fused] + elem_idx)]*49) + (idx.outer*8)) + idx.inner.s)]*placeholder[(placeholder[n.c.fused] + elem_idx)]))
            }
          }
        }
      }
    }
    for (h, 0, 7) {
      for (w, 0, 7) {
        compute[(((n.c.fused*49) + (h*7)) + w)] = compute[((h*7) + w)]
      }
    }
  }
}
``````
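In plain Python, the guarded init loop above behaves like the following sketch; the per-lane bounds check is exactly what prevents the 8-wide inner loop from lowering to SIMD:

```python
# Scalar model of the lowered init loop: 7 outer steps x 8 lanes,
# but only 49 of the 56 lane slots exist in the shrunken buffer,
# so every lane carries the `if (likely(idx < 49))` guard.
compute = [None] * 49
for outer in range(7):
    for inner in range(8):
        idx = outer * 8 + inner
        if idx < 49:          # the likely(...) guard from the IR
            compute[idx] = 0.0
```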

Even though I specified the intermediate tensor size as 56, only 49 elements are allocated. I guess TVM is smart enough to figure out that the tail elements are never accessed and shrinks the buffer accordingly? Unfortunately, that tanks the performance, since the tail guard blocks vectorization.

Is there a way to force `tvm.compute` to honor the specified size?

Thanks.

P.S. Here is an extreme case where I use only one element of the intermediate tensor:

``````
// attr [compute] storage_scope = "global"
allocate compute[float32 * 960]
// attr [compute] storage_scope = "global"
allocate compute[float32 * 1]
produce compute {
  for (c, 0, 960) {
    compute[c] = placeholder[(c*49)]
  }
}
produce compute {
  for (n.c.fused, 0, 1280) {
    produce compute {
      for (idx.inner.init.s, 0, 8) {
        if (likely((idx.inner.init.s < 1))) {
          compute[idx.inner.init.s] = 0f
        }
      }
      for (elem_idx, 0, (placeholder[(n.c.fused + 1)] - placeholder[n.c.fused])) {
        for (idx.inner.s, 0, 8) {
          if (likely((idx.inner.s < 1))) {
            compute[idx.inner.s] = (compute[idx.inner.s] + (compute[(placeholder[(placeholder[n.c.fused] + elem_idx)] + idx.inner.s)]*placeholder[(placeholder[n.c.fused] + elem_idx)]))
          }
        }
      }
    }
    for (h, 0, 7) {
      for (w, 0, 7) {
        compute[(((n.c.fused*49) + (h*7)) + w)] = compute[0]
      }
    }
  }
}
``````