I am a first-time user of TVM and I am doing some inference benchmarks of quantized models on CPU. The model is imported from MXNet, quantized, and auto-tuned. I am running the test on Google Cloud, on a dual-core Skylake CPU. int8 is always slower than int16, both before and after auto-tuning. Should I expect this to happen in TVM?
Target: llvm -mcpu=skylake-avx512
bit | before tuning (ms) | after tuning (ms) |
---|---|---|
float32 | 197.82 | 62.44 |
8 | 128.55 | 59.57 |
16 | 120.42 | 46.44 |
Here are the relevant snippets of my code:
from __future__ import absolute_import, print_function

# --- standard library (duplicates of os/time/logging/namedtuple removed) ---
import argparse
import json
import logging
import os
import sys
import time
from collections import namedtuple
from io import BytesIO
from os.path import join, isfile

# --- third-party ---
import numpy as np
import requests
from matplotlib import pyplot as plt
from PIL import Image

# --- mxnet / tvm ---
import mxnet as mx
from mxnet import gluon
import tvm
from tvm import autotvm, relay
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib import download
from tvm.contrib.util import tempdir
from tvm.relay import quantize as qtz

logging.basicConfig(level=logging.INFO)

# One benchmark configuration: model name, quantization bit widths/dtypes for
# inputs and activations, the global activation scale, and the batch size.
Config = namedtuple('Config', ['model', 'nbit_input', 'dtype_input',
                               'nbit_output', 'dtype_output',
                               'global_scale', 'batch_size'])

# Set number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 2
os.environ["TVM_NUM_THREADS"] = str(num_threads)
def get_model(model_name, batch_size, qconfig, target=None, original=False, simulated=False):
    """Fetch a Gluon model-zoo network, convert it to Relay, and quantize it.

    Returns the pre-optimized (un-quantized) function alone when ``original``
    is True; otherwise returns ``(quantized_func, params, input_shape)``.
    ``target`` and ``simulated`` are currently accepted but unused here.
    """
    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
    # inceptionv3 expects 299x299 inputs; the other model-zoo CNNs use 224x224.
    side = 299 if model_name == 'inceptionv3' else 224
    input_shape = (batch_size, 3, side, side)

    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": input_shape})
    net = mod['main']

    t_start = time.time()
    with relay.build_config(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(net, params=params)
        logging.debug('original')
        logging.debug(qfunc.astext(show_meta_data=False))
        if original:
            return qfunc
        with qconfig:
            logging.debug('current quantize config')
            logging.debug(qtz.current_qconfig())
            qfunc = qtz.quantize(qfunc)
            logging.debug('after quantize')
            logging.debug(qfunc.astext(show_meta_data=False))
    elapsed = time.time() - t_start
    logging.info(model_name + " inference graph build in {0:.2f}s".format(elapsed))
    return qfunc, params, input_shape
###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here, we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in sequential order.
# We will introduce a more sophisticated tuning scheduler in the future.
# You can skip the implementation of this function for this tutorial.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    """Tune a list of AutoTVM tasks sequentially.

    Trials are appended to a temporary log; when all tasks finish, only the
    best record per workload is distilled into ``log_filename``.
    """
    # Create a fresh temporary log file for this tuning session.
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    task_count = len(tasks)
    for idx, task in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (idx + 1, task_count)

        # Instantiate the tuner implementation selected by name.
        if tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # Warm-start from records accumulated by earlier tasks, if any.
        if use_transfer_learning and os.path.isfile(tmp_log_file):
            tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # Never run more trials than the config space actually contains.
        trials = min(n_trial, len(task.config_space))
        tuner_obj.tune(n_trial=trials,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # Pick best records into the final cache file and drop the temp log.
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)
########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    """Quantize ``cfg.model``, autotune its conv2d workloads, compile with the
    best schedules, and log the measured end-to-end inference time.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded verbatim to ``tune_tasks``.
    cfg : Config
        Quantization/benchmark settings for this run.
    target : str
        TVM target string, e.g. 'llvm -mcpu=skylake-avx512'.
    ctx : TVMContext
        Device on which the compiled module is executed.
    log_file : str
        Path of the tuning log holding the best records.
    """
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    # extract workloads from relay program
    logging.info("Extract tasks...")
    mod, params, input_shape = get_model(cfg.model, cfg.batch_size, qconfig, target)
    tasks = autotvm.task.extract_from_program(mod, target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # Re-create each task against the x86 NCHWc templates so tuning covers the
    # schedules the x86 backend actually uses at compile time.
    for i in range(len(tasks)):
        op_name = tasks[i].workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            # BUG FIX: the original printed the literal "{}" because the
            # .format(op_name) call was missing.
            print("Tuning {} is not supported on x86".format(op_name))
            raise ValueError("Tuning {} is not supported on x86".format(op_name))
        print("[Create Task %2d/%2d (%s, %s) ] " % (i + 1, len(tasks),
                                                    tasks[i].name, tasks[i].workload[0]))
        tsk = autotvm.task.create(func_create, args=tasks[i].args,
                                  target=tasks[i].target, template_key='direct')
        tsk.workload = tasks[i].workload
        tasks[i] = tsk

    # run tuning tasks
    logging.info("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        logging.info("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters and a random input into the graph runtime
        module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        logging.info("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                     (np.mean(prof_res), np.std(prof_res)))
if __name__ == "__main__":
    target = 'llvm -mcpu=skylake-avx512'
    ctx = tvm.cpu()

    # Benchmark int8 vs int16 quantization of the same network, batch size 1.
    configs = [
        Config('resnet18_v1', nbit_input=8, dtype_input='int8',
               nbit_output=8, dtype_output='int8',
               global_scale=8.0, batch_size=1),
        Config('resnet18_v1', nbit_input=16, dtype_input='int16',
               nbit_output=16, dtype_output='int16',
               global_scale=8.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=8, dtype_input='int8', nbit_output=8, dtype_output='int8', global_scale=4.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=16, dtype_input='int16', nbit_output=16, dtype_output='int16', global_scale=4.0, batch_size=1),
    ]

    for cfg in configs:
        logging.info('Start testing for %s', cfg.model)
        log_file = "%s_%s.log" % (cfg.model, cfg.dtype_input)
        # Start each configuration from a clean tuning log.
        if os.path.exists(log_file):
            os.remove(log_file)

        #### TUNING OPTION ####
        tuning_option = {
            'log_filename': log_file,
            'tuner': 'random',
            'n_trial': 10,
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
                # runner=autotvm.RPCRunner(
                #     '1080ti',  # change the device key to your key
                #     '0.0.0.0', 9190,
                #     number=20, repeat=3, timeout=4, min_repeat_ms=150)
            ),
        }

        tune_and_evaluate(tuning_option, cfg, target, ctx, log_file)