Hi, I've been trying TVM post-training quantization recently and ran into a strange problem: after I apply TVM's relay.quantize.quantize
in KL divergence mode, the output tensor contains only 0.0.
Has anyone encountered this, or does anyone know what causes it?
The actual code and its shell output are below.
import tvm
import numpy as np
import mxnet as mx
from tvm.contrib import graph_runtime as runtime
from tvm import relay
from tvm.relay import testing
from tvm.contrib.download import download_testdata
# Download the record file used as calibration data; the returned value is
# later passed to the image iterator as its ``path_imgrec``.
calibration_rec = download_testdata(
    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',
    'val_256_q90.rec',
)
def get_val_data(num_workers=4):
    """Create the image iterator that feeds calibration.

    Reads the module-level ``calibration_rec`` (record file path) and
    ``batch_size``.

    Parameters
    ----------
    num_workers : int
        Forwarded to the iterator as ``preprocess_threads``.

    Returns
    -------
    tuple
        ``(val_data, batch_fn)`` where ``val_data`` is the
        ``mx.io.ImageRecordIter`` and ``batch_fn(batch)`` converts the first
        data/label entries of a batch to NumPy arrays.
    """
    img_size = 224
    # Per-channel mean / std handed to the iterator.
    mean_r, mean_g, mean_b = 123.68, 116.779, 103.939
    std_r, std_g, std_b = 58.393, 57.12, 57.375

    def batch_fn(batch):
        images = batch.data[0].asnumpy()
        labels = batch.label[0].asnumpy()
        return images, labels

    iter_kwargs = {
        "path_imgrec": calibration_rec,
        "preprocess_threads": num_workers,
        "shuffle": False,
        "batch_size": batch_size,
        "resize": 256,
        "data_shape": (3, img_size, img_size),
        "mean_r": mean_r,
        "mean_g": mean_g,
        "mean_b": mean_b,
        "std_r": std_r,
        "std_g": std_g,
        "std_b": std_b,
    }
    val_data = mx.io.ImageRecordIter(**iter_kwargs)
    return val_data, batch_fn
# Sample budget for calibration; iteration stops once
# ``batch index * batch_size`` reaches this value.
calibration_samples = 10


def calibrate_dataset():
    """Yield calibration inputs as ``{'data': ndarray}`` dictionaries.

    Obtains the iterator and batch converter from ``get_val_data()``, resets
    the iterator, and yields one dictionary per batch until the
    ``calibration_samples`` budget is reached. Reads the module-level
    ``batch_size``.
    """
    val_data, batch_fn = get_val_data()
    val_data.reset()
    for batch_index, batch in enumerate(val_data):
        if batch_index * batch_size < calibration_samples:
            images, _ = batch_fn(batch)
            yield {'data': images}
        else:
            break
# Shape settings shared by the network definition and by run().
oc = 16
batch_size = 1
data_aware = True
data_shape = (batch_size, 3, 224, 224)

# Free variables of the network: the input, the conv weight and the four
# batch-norm parameters.
data = relay.var("data", relay.TensorType(data_shape, "float32"))
weight = relay.var("weight")
bn_gamma, bn_beta, bn_mmean, bn_mvar = (
    relay.var(var_name)
    for var_name in ("bn_gamma", "bn_beta", "bn_mean", "bn_var")
)

# conv2d -> batch_norm (first tuple field) -> relu, wrapped into a Function
# over its free variables.
simple_net = relay.nn.conv2d(
    data=data,
    weight=weight,
    kernel_size=(3, 3),
    channels=oc,
    padding=(1, 1),
)
simple_net = relay.nn.batch_norm(
    simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar
)[0]
simple_net = relay.nn.relu(simple_net)
simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)

# Wrap the function into a module together with generated parameters.
mod, params = testing.create_workload(simple_net)
main = mod["main"]

import logging
logging.basicConfig(level=logging.DEBUG)  # to dump TVM IR after fusion
def run(mod, params, quantize=False, calibrate_dataset=None):
    """Optionally quantize ``mod``, build it for ``llvm`` and run it once.

    Parameters
    ----------
    mod
        Relay module whose ``"main"`` function is built.
    params
        Parameter dictionary handed to quantization and to ``relay.build``.
    quantize : bool
        When true, ``relay.quantize.quantize`` is applied before building.
    calibrate_dataset : callable or None
        When ``None``, quantization uses ``global_scale=8.0``. Otherwise it
        uses ``calibrate_mode='kl_divergence'`` with ``weight_scale='max'``,
        and the result of calling ``calibrate_dataset()`` is passed as the
        calibration dataset.

    Returns
    -------
    numpy.ndarray
        Output 0 of the graph runtime, fetched into a buffer of shape
        ``(batch_size, oc, 224, 224)``.

    Reads the module-level ``data_shape``, ``batch_size`` and ``oc``.
    """
    if quantize:
        use_calibration = calibrate_dataset is not None
        if use_calibration:
            qconfig_kwargs = dict(calibrate_mode='kl_divergence',
                                  weight_scale='max',
                                  skip_conv_layers=[])
        else:
            qconfig_kwargs = dict(global_scale=8.0,
                                  skip_conv_layers=[])
        with relay.build_config(opt_level=3):
            with relay.quantize.qconfig(**qconfig_kwargs):
                if use_calibration:
                    mod = relay.quantize.quantize(
                        mod, params, dataset=calibrate_dataset())
                else:
                    mod = relay.quantize.quantize(mod, params)

    # Same lookup in every path: the (possibly quantized) entry function.
    main = mod["main"]

    target = "llvm"
    target_host = "llvm"
    graph, lib, params = relay.build(
        main,
        target=target,
        params=params,
        target_host=target_host,
    )
    ctx = tvm.context(target, 0)

    # NOTE(review): inference input is uniform random noise in [-1, 1), not
    # the images used for calibration -- confirm this is intended.
    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")

    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()

    out_shape = (batch_size, oc, 224, 224)
    result = module.get_output(0, tvm.nd.empty(out_shape))
    return result.asnumpy()
# Quantize with the calibration dataset generator and print the output tensor.
out = run(mod, params, quantize=True, calibrate_dataset=calibrate_dataset)
print(out)
INFO:root:collecting statistics for calibration...
DEBUG:autotvm:Finish loading 35 records
WARNING:autotvm:Cannot find config for target=llvm, workload=('conv2d', (1, 3, 224, 224, 'float32'), (16, 3, 3, 3, 'float32'), (1, 1), (1, 1), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
[05:05:04] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: /root/.tvm_test_data/val_256_q90.rec, use 4 threads for decoding..
INFO:root:finding threshold with kl for calibration...
DEBUG:autotvm:Finish loading 35 records
WARNING:autotvm:Cannot find config for target=llvm, workload=('conv2d', (1, 3, 224, 224, 'int8'), (16, 3, 3, 3, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW', 'int32'). A fallback configuration is used, which may bring great performance regression.
[[[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]]]
I referred to the "Using External Libraries in Relay" and "Deploy a Quantized Model on Cuda" tutorials when writing this.