LLVM error when deploying a MobileNet-based model on Raspberry Pi 3B


#1

I am trying to test the performance of, and deploy, the MobileFaceNet model trained by insightface on a Raspberry Pi 3B, but an error occurs.

/Users/yujinke/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from float to np.floating is deprecated. In future, it will be treated as np.float64 == np.dtype(float).type.
from ._conv import register_converters as _register_converters
[19:08:43] src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version v1.2.0. Attempting to upgrade…
[19:08:43] src/nnvm/legacy_json_util.cc:217: Symbol successfully upgraded!
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 3, 112, 112, ‘float32’), (64, 3, 3, 3, ‘float32’), (2, 2), (1, 1), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 128, 28, 28, ‘float32’), (64, 128, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 64, 28, 28, ‘float32’), (128, 64, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 64, 28, 28, ‘float32’), (256, 64, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 128, 14, 14, ‘float32’), (256, 128, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used,
which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 128, 14, 14, ‘float32’), (512, 128, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used,
which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 128, 7, 7, ‘float32’), (256, 128, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 256, 7, 7, ‘float32’), (128, 256, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘conv2d’, (1, 128, 7, 7, ‘float32’), (512, 128, 1, 1, ‘float32’), (1, 1), (0, 0), (1, 1), ‘NCHW’, ‘float32’). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘depthwise_conv2d_nchw’, (1, 64, 56, 56, ‘float32’), (64, 1, 3, 3, ‘float32’), (1, 1), (1, 1), (1, 1), ‘float32’). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘depthwise_conv2d_nchw’, (1, 128, 28, 28, ‘float32’), (128, 1, 3, 3, ‘float32’), (1, 1), (1, 1), (1, 1), ‘float32’). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘depthwise_conv2d_nchw’, (1, 256, 14, 14, ‘float32’), (256, 1, 3, 3, ‘float32’), (1, 1), (1, 1), (1, 1), ‘float32’). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘depthwise_conv2d_nchw’, (1, 256, 7, 7, ‘float32’), (256, 1, 3, 3, ‘float32’), (1, 1), (1, 1), (1, 1), ‘float32’). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=(‘depthwise_conv2d_nchw’, (1, 512, 7, 7, ‘float32’), (512, 1, 7, 7, ‘float32’), (1, 1), (0, 0), (1, 1), ‘float32’). A fallback configuration is used, which may bring great performance regression.
LLVM ERROR: Cannot select: 0x7fdadfc7b220: ch = br_cc 0x7fdae10204d0, setgt:ch, 0x7fdadfbc3070, 0x7fdadfbc1508, BasicBlock:ch<if_end 0x7fdadeaca780>
0x7fdadfbc3070: v4f32 = fadd 0x7fdadfc7b4f8, 0x7fdadfc77700
0x7fdadfc7b4f8: v4f32,ch = CopyFromReg 0x7fdae40036c0, Register:v4f32 %37
0x7fdadfc927b8: v4f32 = Register %37
0x7fdadfc77700: v4f32,ch = CopyFromReg 0x7fdae40036c0, Register:v4f32 %11
0x7fdadfc9ba20: v4f32 = Register %11
0x7fdadfbc1508: v4f32 = bitcast 0x7fdae1021308
0x7fdae1021308: v4i32 = ARMISD::VMOVIMM TargetConstant:i32<0>
0x7fdadfc9b3a0: i32 = TargetConstant<0>
In function: __tvm_parallel_lambda.63

My Code:

import tvm
import nnvm.compiler
import nnvm.testing
from tvm import rpc
from tvm.contrib import util,graph_runtime as runtime
import numpy as np
import mxnet as mx
from mxnet import ndarray as nd
from tvm.contrib.util import tempdir
from util import get_network, print_progress
# Compile an MXNet checkpoint with NNVM for a Raspberry Pi 3B target, export
# the compiled artifacts, upload them to the device over RPC and time
# inference on an all-zeros input.

# Checkpoint prefix and epoch passed to mx.model.load_checkpoint.
prefix, epoch = "model_mfn", 0
#prefix,epoch = "mneti",0
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

image_size = (112, 112)
#image_size = (-1,-1)
# NOTE: the post reports the LLVM "Cannot select" error with this value and
# a successful (but slow) build with opt_level = 0.
opt_level = 3
shape_dict = {'data': (1, 3, *image_size)}

#target = tvm.target.create("llvm -mcpu=broadwell")
target = tvm.target.arm_cpu('rasp3b')

# Convert the MXNet symbol and parameters to NNVM, then build for the target.
nnvm_sym, nnvm_params = nnvm.frontend.from_mxnet(sym, arg_params, aux_params)
with nnvm.compiler.build_config(opt_level=opt_level):
    graph, lib, params = nnvm.compiler.build(
        nnvm_sym, target, shape_dict, params=nnvm_params)

# Save the compiled library, graph definition and parameters to disk.
lib.export_library("./deploy_lib.tar")
print('lib succeeded')
with open("deploy_graph.json", "w") as fo:
    fo.write(graph.json())
with open("deploy_param.params", "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))

local_demo = False
if local_demo:
    remote = rpc.LocalSession()
else:
    # The following is my environment, change this to the IP address of your target device
    host = '192.168.2.105'
    #host = '172.19.0.12'
    port = 9090
    remote = rpc.connect(host, port)

# upload the library to remote device and load it
lib_fname = 'deploy_lib.tar'
remote.upload(lib_fname)
rlib = remote.load_module(lib_fname)

# upload the parameter (this may take a while)
ctx = remote.cpu(0)
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

# create the remote runtime module
module = runtime.create(graph, rlib, ctx)

# set parameter
module.set_input(**rparams)

# set input data
network = "emore1"

print("load succeess. input size [1 3 112 112]")
print("%-20s %-19s (%s)" % ("name", "mean", "-+"))

module.set_input(
    'data',
    tvm.nd.array(np.zeros(shape=(1, 3, image_size[0], image_size[1]),
                          dtype=np.float32)))
repeat = 10
print_progress("%-20s evaluating…" % network)
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))

However, if I set opt_level to 0, it works, but the model runs extremely slowly on the ARM device.


#2

Neither the MobileFaceNet nor the MobileNet models from insightface can be compiled with TVM for ARM devices.
Need help!