Segmentation fault (core dumped) for ARM intrinsic after tvm.build

Hi, I am using LLVM ARM intrinsics. I use “llvm.aarch64.neon.fmulx.v4f32” to multiply 4 float32 elements, but some problems occurs. Here is my code

import tvm

def fmull( dtype='float32'):

    num_f32_elements=4

    A=tvm.placeholder((num_f32_elements,), dtype =dtype, name='A'  )
    B=tvm.placeholder((num_f32_elements,), dtype =dtype, name='B'  )
    C=tvm.compute((num_f32_elements,),
            lambda i: A[i].astype('float32')*
                B[i].astype('float32'),
                name='C')
    a_buffer=tvm.decl_buffer(A.shape, dtype=dtype,name ='a_buffer',offset_factor=1)
    b_buffer=tvm.decl_buffer(B.shape, dtype=dtype,name ='b_buffer',offset_factor=1)
    c_buffer=tvm.decl_buffer(C.shape, dtype=C.dtype,name ='c_buffer',offset_factor=1)

    def _intrin_func(ins,outs):
        def _instr(index):
            xx,yy=ins
            zz=outs[0]
            ib=tvm.ir_builder.create()
            if index==1:
                ib.emit(outs[0].vstore(0,tvm.const(0,'float32x4')))
                return ib.get()
            
            vec_x=xx.vload([0],dtype='float32x4')
            vec_y=yy.vload([0],dtype='float32x4')
            vec_z=zz.vload([0],dtype='float32x4')

            inst='llvm.aarch64.neon.fmulx.v4f32'

            fmulx=tvm.call_llvm_intrin('float32x4',
                    inst,
                    tvm.const(0,'uint32'),
                    vec_z,vec_x,vec_y)
            ib.emit(zz.vstore(0,fmulx))
            return ib.get()

        #body reset update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1,partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={A:a_buffer,
            B:b_buffer, C:c_buffer})


def cal():
    dtype='float32'
    num_f32_elements=64
    A=tvm.placeholder((num_f32_elements,), dtype =dtype, name='A'  )
    B=tvm.placeholder((num_f32_elements,), dtype =dtype, name='B'  )

    C=tvm.compute((num_f32_elements,),
            lambda i: A[i].astype('float32')*
                B[i].astype('float32'),
                name='C')

    s=tvm.create_schedule(C.op)
    x0,xi=s[C].split(C.op.axis[0],factor=4)
    intrin=fmull(dtype='float32')
    s[C].tensorize(xi,intrin)

    target = 'llvm  -device=arm_cpu  -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon'
    #target = 'llvm  -device=arm_cpu  -model=bcm2837 -target=arm64-none-linux-gnu -mattr=+neon'
    
    print(tvm.lower(s,[A,B,C],simple_mode=True))

    print("start to build module")
    func=tvm.build(s,[A,B,C],target=target,name='element-wise')
    assembly = func.get_source('asm')
    print(assembly)

cal()

Segmentation fault occurs when running tvm.build. I am confused and I wonder whether my description of _intrin_func is right.

Hi @varinic, Did you get anywhere with this? I am trying to do more or less the same, but struggling to make it work :slight_smile:

Thanks, Giuseppe