My hardware is a Tesla V100.
My inference code is:
def run_mxnet():
    """Benchmark raw MXNet inference on the GPU.

    Binds the symbol for inference only (``grad_req='null'``), loads the
    pretrained parameters, warms up for 10 iterations, then times 1000
    forward passes and prints the elapsed wall-clock seconds.

    Relies on module-level globals: ``sym_mxnet``, ``batch_shape``,
    ``arg_params``, ``aux_params``, ``x``.
    """
    executor = sym_mxnet.simple_bind(
        ctx=mx.gpu(0), data=batch_shape, grad_req='null', force_rebind=True)
    executor.copy_params_from(arg_params, aux_params)
    print('Warming up MXNet')
    for _ in range(10):
        y_gen = executor.forward(is_train=False, data=x)
        # wait_to_read() blocks until the async GPU computation finishes,
        # so the warm-up actually executes rather than just being queued.
        y_gen[0].wait_to_read()
    # Timing
    print('Starting MXNet timed run')
    # Use a single wall-clock timer. The original mixed time.process_time()
    # with time.time(), and process_time() excludes time the process spends
    # blocked waiting on the GPU, which under-reports inference latency.
    start = time.time()
    for _ in range(1000):
        y_gen = executor.forward(is_train=False, data=x)
        y_gen[0].wait_to_read()
    print(time.time() - start)
My quantization code is:
def run_quantize():
    """Benchmark TVM inference after Relay quantization.

    Converts the MXNet symbol to Relay, quantizes it (quantizing every conv
    layer, ``skip_k_conv=0``), builds for CUDA at opt_level=3, warms up for
    10 iterations, then times 1000 runs and prints elapsed wall-clock
    seconds.

    Relies on module-level globals: ``sym_mxnet``, ``batch_shape``, ``ctx``,
    ``x``.
    """
    sym, _ = relay.frontend.from_mxnet(sym_mxnet, {'data': batch_shape})
    # NOTE(review): create_workload replaces the pretrained weights with
    # randomly initialized ones — fine for speed benchmarking, but the
    # outputs will not match the MXNet run. Confirm this is intended.
    sym, params = testing.create_workload(sym['main'])
    with relay.quantize.qconfig(skip_k_conv=0, round_for_shift=True):
        net = relay.quantize.quantize(sym['main'], params=params)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, 'cuda', 'llvm', params=params)
    m = graph_runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array(x.astype('float32'))
    m.set_input(**{k: tvm.nd.array(v, ctx) for k, v in params.items()})
    print('Warming up TVM')
    for _ in range(10):
        m.set_input("data", data_tvm)
        m.run()
        tvm_output = m.get_output(0)
    print('Starting TVM timed run')
    # Use a single wall-clock timer. The original mixed time.process_time()
    # with time.time(), and process_time() excludes time spent blocked on
    # the GPU, so it is not comparable with a wall-clock measurement.
    start = time.time()
    # Input is set once outside the loop; the data does not change between
    # iterations, so re-setting it would only add copy overhead.
    m.set_input("data", data_tvm)
    for _ in range(1000):
        m.run()
        # NOTE(review): get_output() may not force device synchronization
        # the way wait_to_read() does in the MXNet run — to compare like
        # for like, consider tvm_output.asnumpy() or an explicit ctx sync.
        tvm_output = m.get_output(0)
    print(time.time() - start)
As a result, the total time MXNet used is 292 s, while the quantized TVM model used 384 s.