Hi,
I am working on an INT8 convolution implementation for Intel Skylake. In the process, I need to generate a broadcast instruction, i.e., take a scalar value and copy it multiple times into a vector.
I am new to ir_builder and intrinsics, and I am having difficulty generating that instruction. I might be using something incorrectly.
Can somebody please help me with this? The isolated issue is below.
def test_broadcast():
    """Reproduce the attempt to broadcast one int16 scalar into 32 lanes.

    Creates a one-element int16 placeholder, wraps one of the two extern
    bodies below in ``tvm.extern``, prints the lowered IR, and then builds
    the schedule for a Skylake-AVX512 LLVM target, saving the result to
    ``temp.ll``.
    """
    A = tvm.placeholder((1,), name='A', dtype='int16')

    def impl1(ins, outs):
        """Implementation 1: produce the vector through an LLVM intrinsic call."""
        builder = tvm.ir_builder.create()
        scalar = ins[0].vload([0], "int16")
        vector = tvm.call_llvm_intrin(
            'int16x32',
            'llvm.x86.avx512.broadcastmw.512',
            tvm.const(0, 'uint32'),
            scalar,
        )
        builder.emit(outs[0].vstore(0, vector))
        return builder.get()

    def impl2(ins, outs):
        """Implementation 2: fill an ir_builder pointer in a loop (tried after impl1 failed)."""
        builder = tvm.ir_builder.create()
        scalar = ins[0].vload([0], "int16")
        lanes = builder.pointer("int16", name="B")
        with builder.for_range(0, 32) as lane:
            lanes[lane] = scalar
        builder.emit(outs[0].vstore(0, lanes))
        return builder.get()

    # Swap impl1 for impl2 in the call below to exercise the other implementation.
    D = tvm.extern((32,), [A], impl1, dtype='int16', name='D')
    sched = tvm.create_schedule(D.op)
    print(tvm.lower(sched, [A], simple_mode=True))

    def check_target(target):
        """Build the schedule for ``target`` and save it; do nothing if the target is not enabled."""
        if tvm.module.enabled(target):
            module = tvm.build(sched, [A], target)
            module.save('temp.ll')
            # module.save('temp.s')

    check_target("llvm -mcpu=skylake-avx512")
Error with Implementation 1 - python: /home/ubuntu/workspace/tvm/llvm/llvm/lib/IR/Instructions.cpp:1202: void llvm::StoreInst::AssertOK(): Assertion `getOperand(0)->getType() == cast<PointerType>(getOperand(1)->getType())->getElementType() && "Ptr must be a pointer to Val type!"' failed.
Error with Implementation 2 - tvm._ffi.base.TVMError: [01:28:58] /home/ubuntu/workspace/tvm/src/lang/buffer.cc:281: Check failed: dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0 Cannot load handle64 from buffer of int16