[x86][relay] auto-tune mobilefacenet error using relay


#1

I followed the tutorial on tune_relay_x86. When I auto-tuned mobilefacenet, I encountered the following error.


#2

This is a known issue. You need to make this change https://github.com/dmlc/tvm/pull/2184/files#diff-7e201f96ab2ff5c019d70eeabde2dc87R210. It will be fixed in that PR.


#3

@kevinthesun I have pulled this PR; the auto-tune is OK now, but compilation fails.


#4

the auto-tune is fine.


#5

Looks like dense is loading a schedule, instead of using default schedule. Is the number of schedules in model_graph_opt.log the same as the number of conv2d in the model?


#6

The conv2d count is not the same; a fully-connected layer is missing. After the fully-connected layer comes the batchnorm layer.
I have tried auto-tuning resnet18_v1; compilation is OK except for a warning that the dense layer falls back to the default schedule.


Maybe the problem is the batchnorm layer?


#7

I removed the fully-connected layer and the following batchnorm layer, then compiled the network using the auto-tuned model_graph_opt.log.


#8

It called arm_cpu schedules. Is the target set correctly?


#9

I set target = “llvm -mcpu=broadwell” and ctx = tvm.cpu() according to the tune_relay_x86.py example.


#10

import os
import numpy as np

import tvm
from tvm import autotvm
from tvm import relay
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_runtime as runtime
import mxnet as mx

def get_network(name, batch_size):
    """Get the symbol definition and random weights of a network.

    Returns a tuple ``(net, params, input_shape, output_shape)``.

    NOTE(review): relies on the module-level ``dtype`` variable being
    defined before this function is called.
    """
    # Default ImageNet-style shapes; overridden below for inception_v3.
    # (output_shape here is informational only; it is not used by callers
    # visible in this script.)
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1024)

    if "resnet" in name:
        n_layer = int(name.split('-')[1])
        net, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif "vgg" in name:
        n_layer = int(name.split('-')[1])
        net, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif name == 'mobilenet':
        net, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == 'squeezenet_v1.1':
        net, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
    elif name == 'inception_v3':
        input_shape = (1, 3, 299, 299)
        net, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == 'mxnet':
        # an example for a pretrained mxnet model from the gluon model zoo
        from mxnet.gluon.model_zoo.vision import get_model
        block = get_model('resnet18_v1', pretrained=True)
        net, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
        # append a softmax so the network output is a probability distribution
        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
    elif name == 'model':
        # load a local mxnet checkpoint (prefix "model", epoch 115)
        shape_dict = {'data': input_shape}
        mx_sym, args, auxs = mx.model.load_checkpoint('model', 115)
        net, params = relay.frontend.from_mxnet(mx_sym, shape_dict, args, auxs)
    else:
        raise ValueError("Unsupported network: " + name)

    return net, params, input_shape, output_shape

# Target string for the x86 CPU; replace -mcpu with the one matching your
# own processor (e.g. haswell, skylake-avx512).
target = "llvm -mcpu=broadwell"

batch_size = 1
dtype = "float32"
model_name = "mxnet"
log_file = "%s.log" % model_name
graph_opt_sch_file = "%s_graph_opt.log" % model_name

# Pin TVM to one thread so per-config measurements are stable.
num_threads = 1
os.environ["TVM_NUM_THREADS"] = str(num_threads)

tuning_option = {
    'log_filename': log_file,
    'tuner': 'random',
    'early_stopping': None,

    # Build and run candidate kernels locally on this machine.
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=10, repeat=1,
                                   min_repeat_ms=1000),
    ),
}

def tune_kernels(tasks,
                 measure_option,
                 tuner='gridsearch',
                 early_stopping=None,
                 log_filename='tuning.log'):
    """Tune each extracted task and append the results to ``log_filename``.

    conv2d / depthwise_conv2d tasks are re-created against the x86 NCHWc
    templates so the x86-specific schedules are searched.
    """
    for i, tsk in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # converting conv2d tasks to the x86-specific NCHWc templates
        op_name = tsk.workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            raise ValueError("Tuning {} is not supported on x86".format(op_name))

        task = autotvm.task.create(func_create, args=tsk.args,
                                   target=target, template_key='direct')
        # keep the original workload so log entries match the graph ops
        task.workload = tsk.workload

        # create the requested tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # try every config in the space (no trial cap)
        n_trial = len(task.config_space)
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(log_filename)])

def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
    """Run graph-level tuning over the kernel tuning ``records`` and write
    the best graph-aware schedules to ``opt_sch_file``.

    ``use_DP`` selects the dynamic-programming tuner; otherwise PBQP is used.
    Uses the module-level ``target``.
    """
    target_op = [relay.nn.conv2d]
    Tuner = DPTuner if use_DP else PBQPTuner
    executor = Tuner(graph, {"data": dshape}, records, target_op, target)
    executor.benchmark_layout_transform(min_exec_num=2000)
    executor.run()
    executor.write_opt_sch2record_file(opt_sch_file)

def tune_and_evaluate(tuning_opt):
    """Extract tasks, tune kernels and graph, then compile and benchmark."""
    # extract conv2d workloads from the relay program
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_program(net, target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))

    # run kernel-level then graph-level tuning
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)
    tune_graph(net, data_shape, log_file, graph_opt_sch_file)

    # compile kernels with graph-level best records
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))


# Guard the entry point so importing this module does not start tuning.
if __name__ == "__main__":
    tune_and_evaluate(tuning_option)


#11

here is the mxnet model file
link: https://pan.baidu.com/s/14O76Xex6eX1pYPvzwxMYGg code: 4kgw

input_shape = (batch_size, 3, 112, 112)
output_shape = (batch_size, 512)


#12

could you try to auto-tune this model for help? thank you so much!


#13

Looks like setting target = "llvm -mcpu=broadwell" makes TVM use the arm_cpu schedule. Can you verify this with a smaller network? If that is the case, this issue is not related to autotvm.


#14

I have tried on other computers with llvm -mcpu=haswell, and used the original mobilefacenet (a smaller network, 3.92 MB); the error is the same.


If I don't auto-tune, I can compile it using both relay and nnvm; nnvm is faster.
I can also auto-tune using nnvm, and compilation is OK. The "tile_k" key-not-found error occurs only if I auto-tune using relay, so I think it is a problem with autotvm used together with relay.


#15

This sounds weird to me. Can you take a look at the log file after autotune and make sure x86 target is used?


#16

model_graph_opt.log is like this:
{“i”: [“llvm -mcpu=broadwell”, “topi_x86_conv2d_NCHWc”, [[“TENSOR”, [1, 3, 112, 112], “float32”], [“TENSOR”, [64, 3, 3, 3], “float32”], [2, 2], [1, 1], [1, 1], “NCHW”, “float32”], {}, [“conv2d”, [1, 3, 112, 112, “float32”], [64, 3, 3, 3, “float32”], [2, 2], [1, 1], [1, 1], “NCHW”, “float32”], {“i”: 24, “t”: “direct”, “c”: null, “e”: [[“tile_ic”, “sp”, [3, 1]], [“tile_oc”, “sp”, [2, 32]], [“tile_ow”, “sp”, [28, 2]], [“unroll_kw”, “ot”, true]]}], “r”: [[0.00022216637243020992], 0, 2.0850822925567627, 1557991675.785746], “v”: 0.1}
{“i”: [“llvm -mcpu=broadwell”, “topi_x86_conv2d_NCHWc”, [[“TENSOR”, [1, 64, 56, 56], “float32”], [“TENSOR”, [64, 64, 1, 1], “float32”], [1, 1], [0, 0], [1, 1], “NCHW”, “float32”], {}, [“conv2d”, [1, 64, 56, 56, “float32”], [64, 64, 1, 1, “float32”], [1, 1], [0, 0], [1, 1], “NCHW”, “float32”], {“i”: 432, “t”: “direct”, “c”: null, “e”: [[“tile_ic”, “sp”, [2, 32]], [“tile_oc”, “sp”, [2, 32]], [“tile_ow”, “sp”, [56, 1]], [“tile_oh”, “ot”, 2]]}], “r”: [[0.00047231593158072126], 0, 3.1119916439056396, 1557989850.5378458], “v”: 0.1}
{“i”: [“llvm -mcpu=broadwell”, “topi_x86_depthwise_conv2d_NCHWc_from_nchw”, [[“TENSOR”, [1, 64, 56, 56], “float32”], [“TENSOR”, [64, 1, 3, 3], “float32”], [1, 1], [1, 1], [1, 1], “float32”], {}, [“depthwise_conv2d_nchw”, [1, 64, 56, 56, “float32”], [64, 1, 3, 3, “float32”], [1, 1], [1, 1], [1, 1], “float32”], {“i”: 89, “t”: “direct”, “c”: null, “e”: [[“tile_ic”, “sp”, [2, 32]], [“tile_oc”, “sp”, [2, 32]], [“tile_ow”, “sp”, [28, 2]]]}], “r”: [[0.00027373629171905875], 0, 3.663689613342285, 1557991313.0582528], “v”: 0.1}
{“i”: [“llvm -mcpu=broadwell”, “topi_x86_conv2d_NCHWc”, [[“TENSOR”, [1, 64, 56, 56], “float32”], [“TENSOR”, [64, 64, 1, 1], “float32”], [1, 1], [0, 0], [1, 1], “NCHW”, “float32”], {}, [“conv2d”, [1, 64, 56, 56, “float32”], [64, 64, 1, 1, “float32”], [1, 1], [0, 0], [1, 1], “NCHW”, “float32”], {“i”: 432, “t”: “direct”, “c”: null, “e”: [[“tile_ic”, “sp”, [2, 32]], [“tile_oc”, “sp”, [2, 32]], [“tile_ow”, “sp”, [56, 1]], [“tile_oh”, “ot”, 2]]}], “r”: [[0.00047231593158072126], 0, 3.1119916439056396, 1557989850.5378458], “v”: 0.1}

“tile_k” is not in this log.


#17

Can you try to use autotvm.apply_history_best(log_file) as dispatch context and see what happens?