Hi, I’m testing the annotation + partitioning added in #4570. Here is a simple network of conv + bn + relu layers.
import tvm
from tvm import relay

out_channels = 16
batch_size = 1

def get_layers(prefix, data):
    weight = relay.var(prefix + "weight")
    bn_gamma = relay.var(prefix + "bn_gamma")
    bn_beta = relay.var(prefix + "bn_beta")
    bn_mmean = relay.var(prefix + "bn_mean")
    bn_mvar = relay.var(prefix + "bn_var")
    layer = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3),
                            channels=out_channels, padding=(1, 1))
    layer = relay.nn.batch_norm(layer, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
    layer = relay.nn.relu(layer)
    return layer

data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
layer1 = get_layers("layer1_", data)
layer2 = get_layers("layer2_", layer1)
layer3 = get_layers("layer3_", layer2)
last = layer1  # single combo; use layer2 or layer3 here for the consecutive case
# layer4 = get_layers("layer1_2_", data)
# last = relay.concatenate((layer1, layer4), axis=1)  # parallel-branch case
net = relay.Function(relay.analysis.free_vars(last), last)
And here is my annotator that tries to group conv + bn + relu:
from tvm.relay.expr_functor import ExprMutator
from tvm.relay.op.annotation import compiler_begin, compiler_end

class ConvBNReluAnnotator(ExprMutator):
    """Annotate a conv2d + batch_norm + relu chain as one region for `backend`."""

    def __init__(self, backend):
        super(ConvBNReluAnnotator, self).__init__()
        self.in_compiler = 0
        self.backend = backend

    def annotate_call(self, call):
        # Wrap every Var argument in compiler_begin and rebuild the call.
        new_args = []
        for arg in call.args:
            new_arg = super().visit(arg)
            if isinstance(new_arg, relay.expr.Var):
                new_arg = compiler_begin(new_arg, self.backend)
            new_args.append(new_arg)
        return relay.Call(call.op, new_args, call.attrs, call.type_args)

    def visit_call(self, call):
        if call.op.name == "nn.conv2d":
            if self.in_compiler == 1:
                self.in_compiler = 2
                return self.annotate_call(call)
        elif call.op.name == "nn.batch_norm":
            if self.in_compiler == 1:
                return self.annotate_call(call)
        elif call.op.name == "nn.relu":
            # relu is the last op of the pattern, so open the region here and
            # close it with compiler_end after the whole chain is annotated.
            self.in_compiler = 1
            op = self.annotate_call(call)
            op = compiler_end(op, self.backend)
            self.in_compiler = 0
            return op
        return super().visit_call(call)
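To reproduce the results below, I apply the annotator and then run the partitioning pass roughly as follows (a minimal sketch; the IRModule construction and pass names follow current TVM main, so the exact invocation may differ slightly on other versions):

# Sketch: annotate the graph for the "dnnl" backend, then partition it.
mod = tvm.IRModule.from_expr(net)
mod["main"] = ConvBNReluAnnotator("dnnl").visit(mod["main"])
mod = relay.transform.InferType()(mod)
print(mod)  # annotated module (first dump below)
mod = relay.transform.PartitionGraph()(mod)
print(mod)  # partitioned module (second dump below)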
If I run annotation + partitioning on a single conv + bn + relu combo, the result looks correct.
fn (%data: Tensor[(1, 3, 224, 224), float32], %layer1_weight: Tensor[(16, 3, 3, 3), float32], %layer1_bn_gamma: Tensor[(16), float32], %layer1_bn_beta: Tensor[(16), float32], %layer1_bn_mean: Tensor[(16), float32], %layer1_bn_var: Tensor[(16), float32]) -> Tensor[(1, 16, 224, 224), float32] {
%0 = annotation.compiler_begin(%data, meta[relay.attrs.CompilerAttrs][0]) /* ty=Tensor[(1, 3, 224, 224), float32] */;
%1 = annotation.compiler_begin(%layer1_weight, meta[relay.attrs.CompilerAttrs][1]) /* ty=Tensor[(16, 3, 3, 3), float32] */;
%2 = nn.conv2d(%0, %1, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
%3 = annotation.compiler_begin(%layer1_bn_gamma, meta[relay.attrs.CompilerAttrs][2]) /* ty=Tensor[(16), float32] */;
%4 = annotation.compiler_begin(%layer1_bn_beta, meta[relay.attrs.CompilerAttrs][3]) /* ty=Tensor[(16), float32] */;
%5 = annotation.compiler_begin(%layer1_bn_mean, meta[relay.attrs.CompilerAttrs][4]) /* ty=Tensor[(16), float32] */;
%6 = annotation.compiler_begin(%layer1_bn_var, meta[relay.attrs.CompilerAttrs][5]) /* ty=Tensor[(16), float32] */;
%7 = nn.batch_norm(%2, %3, %4, %5, %6) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
%8 = %7.0;
%9 = nn.relu(%8) /* ty=Tensor[(1, 16, 224, 224), float32] */;
annotation.compiler_end(%9, meta[relay.attrs.CompilerAttrs][6]) /* ty=Tensor[(1, 16, 224, 224), float32] */
}
// meta data omitted. you can use show_meta_data=True to include meta data
Here is the partitioned result; the whole chain is pulled into a single external dnnl function as expected:
v0.0.4
fn (%data: Tensor[(1, 3, 224, 224), float32], %layer1_weight: Tensor[(16, 3, 3, 3), float32], %layer1_bn_gamma: Tensor[(16), float32], %layer1_bn_beta: Tensor[(16), float32], %layer1_bn_mean: Tensor[(16), float32], %layer1_bn_var: Tensor[(16), float32]) -> Tensor[(1, 16, 224, 224), float32] {
  %3 = fn (%dnnl_input0: Tensor[(1, 3, 224, 224), float32], %dnnl_input1: Tensor[(16, 3, 3, 3), float32], %dnnl_input2: Tensor[(16), float32], %dnnl_input3: Tensor[(16), float32], %dnnl_input4: Tensor[(16), float32], %dnnl_input5: Tensor[(16), float32], Compiler="dnnl", ExternalSymbol="dnnl_0", Primitive=1) -> Tensor[(1, 16, 224, 224), float32] {
    %0 = nn.conv2d(%dnnl_input0, %dnnl_input1, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
    %1 = nn.batch_norm(%0, %dnnl_input2, %dnnl_input3, %dnnl_input4, %dnnl_input5) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
    %2 = %1.0;
    nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
  };
  %3(%data, %layer1_weight, %layer1_bn_gamma, %layer1_bn_beta, %layer1_bn_mean, %layer1_bn_var) /* ty=Tensor[(1, 16, 224, 224), float32] */
}
But if I run annotation + partitioning on multiple consecutive conv + bn + relu combos, the annotation looks correct while the partitioning result does not: the earlier layers end up nested inside the functions generated for the later layers.
fn (%data: Tensor[(1, 3, 224, 224), float32], %layer1_weight: Tensor[(16, 3, 3, 3), float32], %layer1_bn_gamma: Tensor[(16), float32], %layer1_bn_beta: Tensor[(16), float32], %layer1_bn_mean: Tensor[(16), float32], %layer1_bn_var: Tensor[(16), float32], %layer2_weight: Tensor[(16, 16, 3, 3), float32], %layer2_bn_gamma: Tensor[(16), float32], %layer2_bn_beta: Tensor[(16), float32], %layer2_bn_mean: Tensor[(16), float32], %layer2_bn_var: Tensor[(16), float32]) -> Tensor[(1, 16, 224, 224), float32] {
%0 = annotation.compiler_begin(%data, meta[relay.attrs.CompilerAttrs][0]) /* ty=Tensor[(1, 3, 224, 224), float32] */;
%1 = annotation.compiler_begin(%layer1_weight, meta[relay.attrs.CompilerAttrs][1]) /* ty=Tensor[(16, 3, 3, 3), float32] */;
%2 = nn.conv2d(%0, %1, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
%3 = annotation.compiler_begin(%layer1_bn_gamma, meta[relay.attrs.CompilerAttrs][2]) /* ty=Tensor[(16), float32] */;
%4 = annotation.compiler_begin(%layer1_bn_beta, meta[relay.attrs.CompilerAttrs][3]) /* ty=Tensor[(16), float32] */;
%5 = annotation.compiler_begin(%layer1_bn_mean, meta[relay.attrs.CompilerAttrs][4]) /* ty=Tensor[(16), float32] */;
%6 = annotation.compiler_begin(%layer1_bn_var, meta[relay.attrs.CompilerAttrs][5]) /* ty=Tensor[(16), float32] */;
%7 = nn.batch_norm(%2, %3, %4, %5, %6) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
%8 = %7.0;
%9 = nn.relu(%8) /* ty=Tensor[(1, 16, 224, 224), float32] */;
%10 = annotation.compiler_end(%9, meta[relay.attrs.CompilerAttrs][6]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
%11 = annotation.compiler_begin(%layer2_weight, meta[relay.attrs.CompilerAttrs][7]) /* ty=Tensor[(16, 16, 3, 3), float32] */;
%12 = nn.conv2d(%10, %11, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
%13 = annotation.compiler_begin(%layer2_bn_gamma, meta[relay.attrs.CompilerAttrs][8]) /* ty=Tensor[(16), float32] */;
%14 = annotation.compiler_begin(%layer2_bn_beta, meta[relay.attrs.CompilerAttrs][9]) /* ty=Tensor[(16), float32] */;
%15 = annotation.compiler_begin(%layer2_bn_mean, meta[relay.attrs.CompilerAttrs][10]) /* ty=Tensor[(16), float32] */;
%16 = annotation.compiler_begin(%layer2_bn_var, meta[relay.attrs.CompilerAttrs][11]) /* ty=Tensor[(16), float32] */;
%17 = nn.batch_norm(%12, %13, %14, %15, %16) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
%18 = %17.0;
%19 = nn.relu(%18) /* ty=Tensor[(1, 16, 224, 224), float32] */;
annotation.compiler_end(%19, meta[relay.attrs.CompilerAttrs][12]) /* ty=Tensor[(1, 16, 224, 224), float32] */
}
// meta data omitted. you can use show_meta_data=True to include meta data
And here is the partitioned result; note that the layer1 function (dnnl_1) is nested inside the layer2 function (dnnl_0) instead of being a sibling:
v0.0.4
fn (%data: Tensor[(1, 3, 224, 224), float32], %layer1_weight: Tensor[(16, 3, 3, 3), float32], %layer1_bn_gamma: Tensor[(16), float32], %layer1_bn_beta: Tensor[(16), float32], %layer1_bn_mean: Tensor[(16), float32], %layer1_bn_var: Tensor[(16), float32], %layer2_weight: Tensor[(16, 16, 3, 3), float32], %layer2_bn_gamma: Tensor[(16), float32], %layer2_bn_beta: Tensor[(16), float32], %layer2_bn_mean: Tensor[(16), float32], %layer2_bn_var: Tensor[(16), float32]) -> Tensor[(1, 16, 224, 224), float32] {
  %8 = fn (%dnnl_input6: Tensor[(16, 16, 3, 3), float32], %dnnl_input7: Tensor[(16), float32], %dnnl_input8: Tensor[(16), float32], %dnnl_input9: Tensor[(16), float32], %dnnl_input10: Tensor[(16), float32], Compiler="dnnl", ExternalSymbol="dnnl_0", Primitive=1) -> Tensor[(1, 16, 224, 224), float32] {
    %3 = fn (%dnnl_input0: Tensor[(1, 3, 224, 224), float32], %dnnl_input1: Tensor[(16, 3, 3, 3), float32], %dnnl_input2: Tensor[(16), float32], %dnnl_input3: Tensor[(16), float32], %dnnl_input4: Tensor[(16), float32], %dnnl_input5: Tensor[(16), float32], Compiler="dnnl", ExternalSymbol="dnnl_1", Primitive=1) -> Tensor[(1, 16, 224, 224), float32] {
      %0 = nn.conv2d(%dnnl_input0, %dnnl_input1, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
      %1 = nn.batch_norm(%0, %dnnl_input2, %dnnl_input3, %dnnl_input4, %dnnl_input5) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
      %2 = %1.0;
      nn.relu(%2) /* ty=Tensor[(1, 16, 224, 224), float32] */
    };
    %4 = %3(%data, %layer1_weight, %layer1_bn_gamma, %layer1_bn_beta, %layer1_bn_mean, %layer1_bn_var) /* ty=Tensor[(1, 16, 224, 224), float32] */;
    %5 = nn.conv2d(%4, %dnnl_input6, padding=[1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 224, 224), float32] */;
    %6 = nn.batch_norm(%5, %dnnl_input7, %dnnl_input8, %dnnl_input9, %dnnl_input10) /* ty=(Tensor[(1, 16, 224, 224), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
    %7 = %6.0;
    nn.relu(%7) /* ty=Tensor[(1, 16, 224, 224), float32] */
  };
  %8(%layer2_weight, %layer2_bn_gamma, %layer2_bn_beta, %layer2_bn_mean, %layer2_bn_var) /* ty=Tensor[(1, 16, 224, 224), float32] */
}
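For comparison, what I would expect from correct partitioning of two consecutive combos is two sibling external functions, where the second one takes the first one's output as an ordinary argument. Sketched by hand (not actual output, attributes and types abbreviated):

fn (%data, %layer1_weight, ..., %layer2_weight, ...) {
  %0 = fn (..., Compiler="dnnl", Primitive=1) { /* layer1 conv2d + batch_norm + relu */ };
  %1 = %0(%data, %layer1_weight, %layer1_bn_gamma, %layer1_bn_beta, %layer1_bn_mean, %layer1_bn_var);
  %2 = fn (..., Compiler="dnnl", Primitive=1) { /* layer2 conv2d + batch_norm + relu */ };
  %2(%1, %layer2_weight, %layer2_bn_gamma, %layer2_bn_beta, %layer2_bn_mean, %layer2_bn_var)
}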
I also confirmed that if there are multiple parallel conv + bn + relu branches (the commented-out concatenate path above) instead of consecutive ones, the partitioning works correctly.
So I suspect there is a bug in the partitioning code that is triggered by multiple consecutive subgraphs. Grouping operators this way via manual annotation should already be supported, so I think we need to fix this. cc @zhiics @comaniac