Hi. When I use AutoTVM to tune Resnet50 and MobileNetv2 on an ARM CPU (a single Cortex-A53 core on the rk3399), the inference time is no better than that of our own hand-optimized assembly code. Resnet50 inference on the ARM Cortex-A53 is 1.036x slower and MobileNetv2 inference is 1.613x slower. After profiling, I found the main reasons:
- TVM conv2d time cost: conv1(1) is 1.3x slower and depthwise conv3(1) is 2.9x slower than in our code. The workloads are attached at the end.
- I take res.costs in the tuning log as the time cost of conv2d. By that measure, conv2d accounts for 72% and 69% of the total inference time in TVM for Resnet50 and MobileNetv2 respectively, whereas convolution accounts for 96.3% and 85.3% of the total in our code. I wonder whether that measurement includes the cost of data transformation and so on. Why is the share of time spent in conv2d so much lower in TVM? Did I get something wrong when profiling?
How can I improve the performance? Could you give me some advice? Thanks in advance.
Test environment: a single Cortex-A53 core on the rk3399, with the frequency fixed at 1.008 GHz
Model: Resnet50
Our hand-optimized assembly code (ms): 2462
TVM autotune (n_trial=2000, early_stopping=1000, opt_level=3, timeout=1e9) (ms): 2549.65
Speedup: 0.9656x
Model: MobileNetv2
Our hand-optimized assembly code (ms): 244.82
TVM autotune (n_trial=2000, early_stopping=1000, opt_level=3, timeout=1e9) (ms): 394.89
Speedup: 0.62x
Resnet50 tuning log (after pick_best):
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3], "NCHW", "float32"], {}, ["conv2d", [1, 3, 224, 224, "float32"], [64, 3, 7, 7, "float32"], [2, 2], [3, 3], "NCHW", "float32"], {"i": 77282, "c": null, "e": [["tile_co", "sp", [16, 4]], ["tile_oh", "sp", [112, 1]], ["tile_ow", "sp", [14, 8]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "unroll"]], ["ann_spatial", "an", ["unroll", "unroll", "vec"]]], "t": "direct"}], "r": [[0.074890018], 0, 1.7477071285247803, 1539139066.319467], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [64, 64, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 20001, "c": null, "e": [["tile_co", "sp", [16, 4]], ["tile_oh", "sp", [28, 2]], ["tile_ow", "sp", [4, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "none"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.010285772], 0, 0.3331129550933838, 1539141548.854048], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], "NCHW", "float32"], {"i": 712, "c": null, "e": [["tile_p", "sp", [49, 4]], ["tile_k", "sp", [16, 4]], ["tile_c", "sp", [8, 8]], ["ann_reduce", "an", ["unroll"]], ["ann_spatial", "an", ["none", "vec"]]], "t": "winograd"}], "r": [[0.04315077525], 0, 1.4104440212249756, 1539142745.342937], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [256, 64, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 24483, "c": null, "e": [["tile_co", "sp", [32, 8]], ["tile_oh", "sp", [56, 1]], ["tile_ow", "sp", [7, 8]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "unroll"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.03853981625], 0, 0.5232648849487305, 1539145301.373015], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [64, 256, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 36067, "c": null, "e": [["tile_co", "sp", [8, 8]], ["tile_oh", "sp", [56, 1]], ["tile_ow", "sp", [7, 8]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "none"]], ["ann_spatial", "an", ["unroll", "none", "vec"]]], "t": "direct"}], "r": [[0.03775319125], 0, 0.6328468322753906, 1539147075.391883], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [128, 256, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 30723, "c": null, "e": [["tile_co", "sp", [16, 8]], ["tile_oh", "sp", [28, 1]], ["tile_ow", "sp", [2, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "unroll"]], ["ann_spatial", "an", ["unroll", "unroll", "vec"]]], "t": "direct"}], "r": [[0.01797935575], 0, 0.456265926361084, 1539148968.302433], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 28, 28, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], "NCHW", "float32"], {"i": 695, "c": null, "e": [["tile_p", "sp", [7, 7]], ["tile_k", "sp", [32, 4]], ["tile_c", "sp", [8, 16]], ["ann_reduce", "an", ["unroll"]], ["ann_spatial", "an", ["vec", "none"]]], "t": "winograd"}], "r": [[0.035037045], 0, 1.7447781562805176, 1539150881.999021], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 128, 28, 28, "float32"], [512, 128, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 38312, "c": null, "e": [["tile_co", "sp", [128, 4]], ["tile_oh", "sp", [4, 7]], ["tile_ow", "sp", [7, 4]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "unroll"]], ["ann_spatial", "an", ["unroll", "unroll", "vec"]]], "t": "direct"}], "r": [[0.0323195865], 0, 0.6552119255065918, 1539153410.616602], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [512, 256, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 16083, "c": null, "e": [["tile_co", "sp", [64, 8]], ["tile_oh", "sp", [28, 1]], ["tile_ow", "sp", [2, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "none"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.068800382], 0, 0.797644853591919, 1539156560.553953], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [128, 512, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 12109, "c": null, "e": [["tile_co", "sp", [4, 32]], ["tile_oh", "sp", [14, 2]], ["tile_ow", "sp", [28, 1]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "unroll"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.03443890975], 0, 0.6284029483795166, 1539159635.885853], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [256, 512, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 5871, "c": null, "e": [["tile_co", "sp", [32, 8]], ["tile_oh", "sp", [14, 1]], ["tile_ow", "sp", [1, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "none"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.01643840775], 0, 0.542029857635498, 1539162443.906005], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 14, 14, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], "NCHW", "float32"], {"i": 518, "c": null, "e": [["tile_p", "sp", [2, 8]], ["tile_k", "sp", [32, 8]], ["tile_c", "sp", [256, 1]], ["ann_reduce", "an", ["none"]], ["ann_spatial", "an", ["none", "vec"]]], "t": "winograd"}], "r": [[0.0290548155], 0, 1.66587495803833, 1539165118.931159], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 256, 14, 14, "float32"], [1024, 256, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 18449, "c": null, "e": [["tile_co", "sp", [256, 4]], ["tile_oh", "sp", [7, 2]], ["tile_ow", "sp", [1, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "none"]], ["ann_spatial", "an", ["unroll", "unroll", "vec"]]], "t": "direct"}], "r": [[0.03087116975], 0, 0.6397318840026855, 1539169079.866169], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [1024, 512, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 4007, "c": null, "e": [["tile_co", "sp", [128, 8]], ["tile_oh", "sp", [14, 1]], ["tile_ow", "sp", [1, 14]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "unroll"]], ["ann_spatial", "an", ["none", "none", "vec"]]], "t": "direct"}], "r": [[0.06387566275], 0, 0.8418159484863281, 1539171836.658565], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [256, 1024, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 3182, "c": null, "e": [["tile_co", "sp", [8, 32]], ["tile_oh", "sp", [7, 2]], ["tile_ow", "sp", [14, 1]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "unroll"]], ["ann_spatial", "an", ["none", "none", "vec"]]], "t": "direct"}], "r": [[0.0322977115], 0, 1.2555630207061768, 1539175528.747063], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [512, 1024, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 1863, "c": null, "e": [["tile_co", "sp", [64, 8]], ["tile_oh", "sp", [7, 1]], ["tile_ow", "sp", [1, 7]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "unroll"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.016050637], 0, 0.6193010807037354, 1539178288.041898], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 7, 7, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], "NCHW", "float32"], {"i": 404, "c": null, "e": [["tile_p", "sp", [1, 4]], ["tile_k", "sp", [32, 16]], ["tile_c", "sp", [256, 2]], ["ann_reduce", "an", ["unroll"]], ["ann_spatial", "an", ["none", "vec"]]], "t": "winograd"}], "r": [[0.02539016925], 0, 3.192999839782715, 1539180726.346742], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 512, 7, 7, "float32"], [2048, 512, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 4155, "c": null, "e": [["tile_co", "sp", [256, 8]], ["tile_oh", "sp", [7, 1]], ["tile_ow", "sp", [1, 7]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["unroll", "unroll"]], ["ann_spatial", "an", ["unroll", "none", "vec"]]], "t": "direct"}], "r": [[0.03523997225], 0, 0.71940016746521, 1539184531.843269], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [2048, 1024, 1, 1, "float32"], [2, 2], [0, 0], "NCHW", "float32"], {"i": 5259, "c": null, "e": [["tile_co", "sp", [256, 8]], ["tile_oh", "sp", [7, 1]], ["tile_ow", "sp", [1, 7]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]], ["ann_reduce", "an", ["unroll", "none"]], ["ann_spatial", "an", ["unroll", "unroll", "vec"]]], "t": "direct"}], "r": [[0.06342715225], 0, 0.8785848617553711, 1539187590.13863], "v": 0.1}
{"i": ["llvm -device=arm_cpu -model=rk3399 -target=aarch64-linux-gnu", "topi_nn_conv2d", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 2048, 7, 7, "float32"], [512, 2048, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 1623, "c": null, "e": [["tile_co", "sp", [64, 8]], ["tile_oh", "sp", [7, 1]], ["tile_ow", "sp", [1, 7]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], ["ann_reduce", "an", ["none", "none"]], ["ann_spatial", "an", ["none", "unroll", "vec"]]], "t": "direct"}], "r": [[0.03507117025], 0, 0.7908809185028076, 1539191122.870224], "v": 0.1}
Due to the per-post character limit, the MobileNetv2 tuning log is omitted.