Hi,
I noticed that after using kl-divergence to calibrate the quantized weight, it will cast the output to float32 and multiply by a float32 value, then round, clip and cast it back to int32. I am wondering why we are not using int32 multiplication directly here?
Here are the graphs after the Quantization Pass, without kl-divergence and with kl-divergence:
Quantization Pass
%14 = %13(%12) /* ty=Tensor[(1, 4, 56, 56, 16), uint8] */;
%21 = fn (%p05: Tensor[(1, 4, 56, 56, 16), uint8], %p11: Tensor[(4, 4, 3, 3, 4, 16, 4), int8], %p21: Tensor[(4, 1, 1, 16), int32], Primitive=1) -> Tensor[(1, 4, 56, 56, 16), uint8] {
%15 = nn.contrib_conv2d_NCHWc_int8(%p05, %p11, meta[relay.attrs.Conv2DAttrs][1]) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%16 = add(%15, %p21) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%17 = nn.relu(%16) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%18 = add(%17, 64 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%19 = right_shift(%18, 7 /* ty=int32 */) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%20 = clip(%19, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
cast(%20, dtype="uint8") /* ty=Tensor[(1, 4, 56, 56, 16), uint8] */
};
Quantization Pass with kl-divergence
%13 = %12(%10) /* ty=Tensor[(1, 4, 56, 56, 16), uint8] */;
%27 = fn (%p04: Tensor[(1, 4, 56, 56, 16), uint8], %p11: Tensor[(4, 4, 3, 3, 4, 16, 4), int8], %p21: Tensor[(4, 1, 1, 16), int32], %p3: Tensor[(4, 1, 1, 16), int32], Primitive=1) -> Tensor[(1, 4, 56, 56, 16), uint8] {
%14 = nn.contrib_conv2d_NCHWc_int8(%p04, %p11, meta[relay.attrs.Conv2DAttrs][1]) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%15 = cast(%14, dtype="float32") /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%16 = multiply(%15, 0.0018843f /* ty=float32 */) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%17 = round(%16) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%18 = clip(%17, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%19 = cast(%18, dtype="int32") /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%20 = multiply(%19, %p21) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%21 = add(%20, %p3) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%22 = nn.relu(%21) /* ty=Tensor[(1, 4, 56, 56, 16), int32] */;
%23 = cast(%22, dtype="float32") /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%24 = multiply(%23, 0.0340026f /* ty=float32 */) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%25 = round(%24) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
%26 = clip(%25, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 4, 56, 56, 16), float32] */;
cast(%26, dtype="uint8") /* ty=Tensor[(1, 4, 56, 56, 16), uint8] */
};
@ziheng @vinx13 @janimesh