Yolo v3 on video is too slow


#1

Hi all,

I am trying to modify the YOLOv3 example (specifically, from_darknet.py in the tutorial folder). I modified it to read frames from a video and run on the CPU (i7 7800) with LLVM 7, without a GPU or any other accelerator.

import nnvm
import nnvm.frontend.darknet
import nnvm.testing.yolo_detection
import nnvm.testing.darknet
import matplotlib.pyplot as plt
import numpy as np
import tvm
import sys
import cv2

from ctypes import *
from tvm.contrib.download import download
from nnvm.testing.darknet import __darknetffi__

# Which darknet model to fetch and convert
MODEL_NAME = 'yolov3'
dtype = 'float32'
batch_size = 1

######################################################################
# Fetch the required files
# ------------------------
# The cfg and weights files are downloaded if this is the first run.
REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
CFG_NAME = MODEL_NAME + '.cfg'
WEIGHTS_NAME = MODEL_NAME + '.weights'
CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME

download(CFG_URL, CFG_NAME)
download(WEIGHTS_URL, WEIGHTS_NAME)

# Pick the prebuilt darknet shared library that matches the host platform;
# any platform other than Linux or macOS is rejected.
if sys.platform == 'darwin':
    DARKNET_LIB = 'libdarknet_mac2.0.so'
    DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true'
elif sys.platform in ('linux', 'linux2'):
    DARKNET_LIB = 'libdarknet2.0.so'
    DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true'
else:
    raise NotImplementedError(
        "Darknet lib is not supported on {} platform".format(sys.platform))

download(DARKNET_URL, DARKNET_LIB)

# From here on DARKNET_LIB is the opened library, no longer its file name
DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
cfg = "./" + str(CFG_NAME)
weights = "./" + str(WEIGHTS_NAME)
net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)

print("Converting darknet to nnvm symbols...")
sym, params = nnvm.frontend.darknet.from_darknet(net, dtype)

######################################################################
# Compile the model on NNVM
# -------------------------
# Build the converted graph for the CPU.
# NOTE(review): the reply in this thread suggests a device-specific target
# (e.g. 'llvm -mcpu=skylake-avx512') and opt_level=3 for better CPU speed.
target = 'llvm'
ctx = tvm.cpu(0)
# The array is only used to derive the input shape passed to the compiler
data = np.empty([batch_size, net.c, net.h, net.w], dtype)
shape = {'data': data.shape}
dtype_dict = {}
print("Compiling the model...")
with nnvm.compiler.build_config(opt_level=2):
    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype_dict, params)

# Network input height and width; current image shape is 608x608
neth, netw = shape['data'][2:]
######################################################################
# Execute on TVM Runtime
# ----------------------
# Create the graph runtime module and load the compiled parameters into it.
from tvm.contrib import graph_runtime

m = graph_runtime.create(graph, lib, ctx)
m.set_input(**params)

# Detection threshold and non-maximum-suppression threshold
thresh = 0.5
nms_thresh = 0.45

# Class-name list and font file used when drawing detections
coco_name = 'coco.names'
font_name = 'arial.ttf'
coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true'
font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true'
download(coco_url, coco_name)
download(font_url, font_name)

# One class name per line; strip surrounding whitespace and newlines
with open(coco_name) as f:
    content = list(f)

names = list(map(str.strip, content))
vcap = cv2.VideoCapture("myvid.mp4")
# The final darknet layer is the same for every frame, so fetch it once.
last_layer = net.layers[net.n - 1]
try:
    while True:
        ret, frame = vcap.read()
        if not ret:
            # No frame was returned: the video has ended or could not be
            # read. Stop here instead of crashing on an empty frame.
            break
        # OpenCV delivers frames as HxWxC in BGR order. Convert to CxHxW,
        # divide by 255 and reverse the channel axis (BGR -> RGB).
        img = np.array(frame)
        img = img.transpose((2, 0, 1))
        img = np.divide(img, 255.0)
        img = np.flip(img, 0)
        data = nnvm.testing.darknet._letterbox_image(img, netw, neth)
        # set inputs
        m.set_input('data', tvm.nd.array(data.astype(dtype)))
        # execute
        print("Running the test image...")

        m.run()
        # get outputs: each of the 3 yolo layers is read from 4 consecutive
        # graph outputs (output, mask, biases, attributes)
        tvm_out = []
        for i in range(3):
            layer_out = {}
            layer_out['type'] = 'Yolo'
            # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
            layer_attr = m.get_output(i*4+3).asnumpy()
            layer_out['biases'] = m.get_output(i*4+2).asnumpy()
            layer_out['mask'] = m.get_output(i*4+1).asnumpy()
            out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
                         layer_attr[2], layer_attr[3])
            layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
            layer_out['classes'] = layer_attr[4]
            tvm_out.append(layer_out)

        _, im_h, im_w = img.shape
        dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh,
                                                              1, tvm_out)
        nnvm.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh)
        nnvm.testing.yolo_detection.draw_detections(img, dets, thresh, names, last_layer.classes)

        # img is RGB at this point, but cv2.imshow expects BGR: flip the
        # channel axis back before converting to HxWxC for display.
        display = np.ascontiguousarray(np.flip(img, 0).transpose(1, 2, 0))
        cv2.imshow('VIDEO', display)
        cv2.waitKey(5)
finally:
    # Always free the capture device and close the preview window
    vcap.release()
    cv2.destroyAllWindows()

The problem is that it runs very slowly, at around 5 seconds per frame. Is this normal, or have I made a mistake? I ask because with OpenVINO I get around 7 FPS on the same CPU.

Thank you for supporting me.

Ganba


#2

To maximize performance on CPU, you need to set the target according to your device. Since the i7 7800X has AVX-512, you can set the target to "llvm -mcpu=skylake-avx512". You also need to set opt_level to 3 to use the conv2d_NCHWc layout, which is much faster. You should be able to get a better result after setting these. However, you can get even better performance by following the autotvm tutorial for x86. After the graph tuner is merged, you can get an even bigger performance boost.