Zero output after quantization in KL divergence mode

Hi, I’ve been trying TVM post-training quantization recently, and I found a somewhat weird problem: the output tensor contains only 0.0 values after I apply TVM relay.quantize.quantize in KL-divergence mode.
Has anyone encountered this or does anyone know what might cause it?
I have put the actual code and the shell output below.

import tvm
import numpy as np
import mxnet as mx
from tvm.contrib import graph_runtime as runtime
from tvm import relay
from tvm.relay import testing
from tvm.contrib.download import download_testdata

# Download (and cache) the ImageNet validation record file used as the
# calibration dataset; download_testdata returns the local file path.
calibration_rec = download_testdata(
    'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',
    'val_256_q90.rec')

def get_val_data(num_workers=4):
    """Build the calibration data iterator and a batch-unpacking helper.

    Returns a tuple ``(val_data, batch_fn)`` where ``val_data`` is an MXNet
    ``ImageRecordIter`` over the calibration record file and ``batch_fn``
    converts one batch into ``(data, label)`` numpy arrays.

    NOTE(review): reads the module-level globals ``calibration_rec`` and
    ``batch_size``.
    """
    # Per-channel normalization constants (standard ImageNet mean/std).
    rgb_mean = (123.68, 116.779, 103.939)
    rgb_std = (58.393, 57.12, 57.375)
    side = 224

    def batch_fn(batch):
        # Pull the first data/label arrays out of the batch as numpy arrays.
        return batch.data[0].asnumpy(), batch.label[0].asnumpy()

    val_data = mx.io.ImageRecordIter(
        path_imgrec=calibration_rec,
        preprocess_threads=num_workers,
        shuffle=False,
        batch_size=batch_size,
        resize=256,
        data_shape=(3, side, side),
        mean_r=rgb_mean[0],
        mean_g=rgb_mean[1],
        mean_b=rgb_mean[2],
        std_r=rgb_std[0],
        std_g=rgb_std[1],
        std_b=rgb_std[2],
    )
    return val_data, batch_fn

calibration_samples = 10

def calibrate_dataset():
    """Yield calibration batches as ``{'data': ndarray}`` dicts.

    Stops once ``calibration_samples`` images have been consumed.
    NOTE(review): reads the module-level globals ``calibration_samples``
    and ``batch_size``.
    """
    val_data, batch_fn = get_val_data()
    val_data.reset()
    consumed = 0  # images yielded so far
    for batch in val_data:
        if consumed >= calibration_samples:
            break
        data, _ = batch_fn(batch)
        yield {'data': data}
        consumed += batch_size

oc = 16  # conv output channels
batch_size = 1
data_aware = True  # NOTE(review): defined but never used below

# Free variables of the toy network. The string "data" must match the
# input name passed to module.set_input() later.
data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
weight = relay.var("weight")
bn_gamma = relay.var("bn_gamma")
bn_beta = relay.var("bn_beta")
bn_mmean = relay.var("bn_mean")
bn_mvar = relay.var("bn_var")

# Toy network: conv2d -> batch_norm -> relu.
simple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3,3), channels=oc, padding=(1, 1))
# batch_norm returns a tuple; [0] selects the normalized output tensor.
simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
simple_net = relay.nn.relu(simple_net)
simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)

data_shape = (batch_size, 3, 224, 224)
# Wrap the function into a runnable workload (module + initialized params).
mod, params = testing.create_workload(simple_net)
main = mod["main"]

import logging
logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion

def run(mod, params, quantize=False, calibrate_dataset=None):
    """Compile ``mod`` for LLVM, optionally quantizing it first, and run once.

    Parameters
    ----------
    mod : tvm.IRModule
        Relay module to compile.
    params : dict
        Parameter tensors for the module.
    quantize : bool
        When True, apply Relay post-training quantization before building.
    calibrate_dataset : callable or None
        Generator factory yielding calibration batches. When None (and
        ``quantize`` is True), static global-scale quantization is used
        instead of KL-divergence calibration.

    Returns
    -------
    numpy.ndarray
        Output of the network for one random input, shape
        ``(batch_size, oc, H, W)``.

    NOTE(review): reads the module-level globals ``data_shape``,
    ``batch_size`` and ``oc``.
    """
    if quantize:
        with relay.build_config(opt_level=3):
            if calibrate_dataset is None:
                # Static quantization: one fixed scale for all activations.
                with relay.quantize.qconfig(global_scale=8.0,
                                            skip_conv_layers=[]):
                    mod = relay.quantize.quantize(mod, params)
            else:
                # Data-aware quantization: activation scales found by
                # minimizing KL divergence over the calibration set.
                with relay.quantize.qconfig(calibrate_mode='kl_divergence',
                                            weight_scale='max',
                                            skip_conv_layers=[]):
                    mod = relay.quantize.quantize(mod, params,
                                                  dataset=calibrate_dataset())
    # Single lookup instead of repeating this in every branch above.
    main = mod["main"]

    target = "llvm"
    target_host = "llvm"

    graph, lib, params = relay.build(
        main, target=target,
        params=params, target_host=target_host)

    ctx = tvm.context(target, 0)
    # Renamed from `data` to avoid shadowing the module-level relay var.
    input_data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", input_data)
    module.run()
    # Derive the spatial dims from data_shape instead of hard-coding 224
    # again (conv uses padding=(1,1) with a 3x3 kernel, so H/W are preserved).
    out_shape = (batch_size, oc) + tuple(data_shape[2:])
    out = module.get_output(0, tvm.nd.empty(out_shape))
    return out.asnumpy()

# Quantize with KL-divergence calibration, run on a random input, and print.
out = run(mod, params, True, calibrate_dataset)
print(out)
INFO:root:collecting statistics for calibration...
DEBUG:autotvm:Finish loading 35 records
WARNING:autotvm:Cannot find config for target=llvm, workload=('conv2d', (1, 3, 224, 224, 'float32'), (16, 3, 3, 3, 'float32'), (1, 1), (1, 1), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
[05:05:04] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: /root/.tvm_test_data/val_256_q90.rec, use 4 threads for decoding..
INFO:root:finding threshold with kl for calibration...
DEBUG:autotvm:Finish loading 35 records
WARNING:autotvm:Cannot find config for target=llvm, workload=('conv2d', (1, 3, 224, 224, 'int8'), (16, 3, 3, 3, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW', 'int32'). A fallback configuration is used, which may bring great performance regression.
[[[[0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   ...
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]
   [0. 0. 0. ... 0. 0. 0.]]]]

I referenced the "Using External Libraries in Relay" and "Deploy a Quantized Model on CUDA" tutorials when I wrote this.