Want to write larger intermediate tensors for SIMD purposes

Hello, I have a tensor with non-power-of-two dimensions, say (7, 7), which is awkward to implement with SIMD. I first flatten it into a tensor of size 49, then compute an intermediate tensor of size 56, which is SIMD-friendly, and finally copy the first 49 elements to the destination tensor. I do all of this in a new topi operator. However, I still see if (likely(... in the inner loop, and the generated code is not vectorized.
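For reference, here is roughly the shape of what I am doing, as a minimal sketch rather than my actual operator (the names A, flat, padded, out and the 8-lane vector width are made up for illustration, using the old tvm.compute API):

import tvm

H, W = 7, 7
N = H * W        # 49 elements after flattening
NP = 56          # rounded up to a multiple of the 8-lane vector width

A = tvm.placeholder((H, W), name="A")
# step 1: flatten (7, 7) -> (49,)
flat = tvm.compute((N,), lambda i: A[i // W, i % W], name="flat")
# step 2: do the work on a padded (56,) buffer; the tail is dead padding
padded = tvm.compute(
    (NP,),
    lambda i: tvm.if_then_else(i < N, flat[i], tvm.const(0.0, "float32")),
    name="padded")
# step 3: copy only the first 49 elements back to the (7, 7) destination
out = tvm.compute((H, W), lambda h, w: padded[h * W + w], name="out")

s = tvm.create_schedule(out.op)
io, ii = s[padded].split(padded.op.axis[0], factor=8)
s[padded].vectorize(ii)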

The lowered function is:

// attr [compute] storage_scope = "global"
allocate compute[float32 * 47040]
// attr [compute] storage_scope = "global"
allocate compute[float32 * 49]
produce compute {
  for (c, 0, 960) {
    for (idx, 0, 49) {
      compute[((c*49) + idx)] = placeholder[((c*49) + idx)]
    }
  }
}
produce compute {
  for (n.c.fused, 0, 1280) {
    produce compute {
      for (idx.outer.init, 0, 7) {
        for (idx.inner.init.s, 0, 8) {
          if (likely((((idx.outer.init*8) + idx.inner.init.s) < 49))) {
            compute[((idx.outer.init*8) + idx.inner.init.s)] = 0f
          }
        }
      }
      for (elem_idx, 0, (placeholder[(n.c.fused + 1)] - placeholder[n.c.fused])) {
        for (idx.outer, 0, 7) {
          for (idx.inner.s, 0, 8) {
            if (likely((((idx.outer*8) + idx.inner.s) < 49))) {
              compute[((idx.outer*8) + idx.inner.s)] = (compute[((idx.outer*8) + idx.inner.s)] + (compute[(((placeholder[(placeholder[n.c.fused] + elem_idx)]*49) + (idx.outer*8)) + idx.inner.s)]*placeholder[(placeholder[n.c.fused] + elem_idx)]))
            }
          }
        }
      }
    }
    for (h, 0, 7) {
      for (w, 0, 7) {
        compute[(((n.c.fused*49) + (h*7)) + w)] = compute[((h*7) + w)]
      }
    }
  }
}

Even though I specified the intermediate tensor size as 56, only 49 elements are allocated. I guess TVM's bound inference is smart enough to figure out that the last 7 elements are never read, and shrinks the buffer accordingly? But that tanks the performance.
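For what it's worth, the shrinking reproduces on a tiny standalone case (again a sketch with made-up names A, B, C, assuming the old tvm.compute API):

import tvm

A = tvm.placeholder((49,), name="A")
# B is declared with extent 56, but its only consumer reads B[0:49]
B = tvm.compute(
    (56,),
    lambda i: tvm.if_then_else(i < 49, A[i], tvm.const(0.0, "float32")),
    name="B")
C = tvm.compute((49,), lambda i: B[i], name="C")

s = tvm.create_schedule(C.op)
io, ii = s[B].split(B.op.axis[0], factor=8)
s[B].vectorize(ii)
print(tvm.lower(s, [A, C], simple_mode=True))
# bound inference clamps B's allocation to the 49 elements C actually reads,
# so splitting by 8 produces an if (likely(...)) guard instead of clean vectors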

Is there a way to force tvm.compute to honor the declared size?

Thanks.

P.S. Here is an extreme case where I use only one element of the intermediate tensor:

// attr [compute] storage_scope = "global"
allocate compute[float32 * 960]
// attr [compute] storage_scope = "global"
allocate compute[float32 * 1]
produce compute {
  for (c, 0, 960) {
    compute[c] = placeholder[(c*49)]
  }
}
produce compute {
  for (n.c.fused, 0, 1280) {
    produce compute {
      for (idx.inner.init.s, 0, 8) {
        if (likely((idx.inner.init.s < 1))) {
          compute[idx.inner.init.s] = 0f
        }
      }
      for (elem_idx, 0, (placeholder[(n.c.fused + 1)] - placeholder[n.c.fused])) {
        for (idx.inner.s, 0, 8) {
          if (likely((idx.inner.s < 1))) {
            compute[idx.inner.s] = (compute[idx.inner.s] + (compute[(placeholder[(placeholder[n.c.fused] + elem_idx)] + idx.inner.s)]*placeholder[(placeholder[n.c.fused] + elem_idx)]))
          }
        }
      }
    }
    for (h, 0, 7) {
      for (w, 0, 7) {
        compute[(((n.c.fused*49) + (h*7)) + w)] = compute[0]
      }
    }
  }
}