Yolo v3 on video is too slow

Hi all,

I am trying to modify the YOLOv3 example (specifically, from_darknet.py in the tutorial folder). I modified it to read frames from a video and run on a CPU (i7, 7800) with LLVM 7, without a GPU or any other accelerator.

import nnvm
import nnvm.frontend.darknet
import nnvm.testing.yolo_detection
import nnvm.testing.darknet
import matplotlib.pyplot as plt
import numpy as np
import tvm
import sys
import cv2

from ctypes import *
from tvm.contrib.download import download
from nnvm.testing.darknet import __darknetffi__

# Name of the darknet model; the cfg and weights file names derive from it
MODEL_NAME = 'yolov3'

######################################################################
# Download required files
# -----------------------
# Download cfg and weights file if first time.
REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
CFG_NAME = MODEL_NAME + '.cfg'
WEIGHTS_NAME = MODEL_NAME + '.weights'
CFG_URL = '{}cfg/{}?raw=true'.format(REPO_URL, CFG_NAME)
WEIGHTS_URL = 'https://pjreddie.com/media/files/{}'.format(WEIGHTS_NAME)

for _url, _path in ((CFG_URL, CFG_NAME), (WEIGHTS_URL, WEIGHTS_NAME)):
    download(_url, _path)

# Download and load the darknet library.
# Each supported platform maps to (repo sub-directory, library file name).
_DARKNET_BUILDS = {
    'linux': ('lib/', 'libdarknet2.0.so'),
    'linux2': ('lib/', 'libdarknet2.0.so'),
    'darwin': ('lib_osx/', 'libdarknet_mac2.0.so'),
}
if sys.platform not in _DARKNET_BUILDS:
    err = "Darknet lib is not supported on {} platform".format(sys.platform)
    raise NotImplementedError(err)

_lib_dir, DARKNET_LIB = _DARKNET_BUILDS[sys.platform]
DARKNET_URL = REPO_URL + _lib_dir + DARKNET_LIB + '?raw=true'

download(DARKNET_URL, DARKNET_LIB)

# From here on DARKNET_LIB is the opened library handle, not the file name
DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
cfg = "./" + CFG_NAME
weights = "./" + WEIGHTS_NAME
net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)

# Settings shared by the conversion and compile steps
dtype = 'float32'
batch_size = 1

print("Converting darknet to nnvm symbols...")
sym, params = nnvm.frontend.darknet.from_darknet(net, dtype)

######################################################################
# Compile the model on NNVM
# -------------------------
# Build with the plain LLVM target for CPU device 0.
target = 'llvm'
ctx = tvm.cpu(0)

# Placeholder array: only its shape is read here, to describe the 'data' input
data = np.empty([batch_size, net.c, net.h, net.w], dtype)
shape = {'data': data.shape}
dtype_dict = {}

print("Compiling the model...")
with nnvm.compiler.build_config(opt_level=2):
    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype_dict, params)

# Last two dims of the input shape (current image shape is 608x608)
neth, netw = shape['data'][2:]
######################################################################
# Execute on TVM Runtime
# ----------------------
# Same flow as the other examples: create the runtime module, then
# load the compiled parameters into it.
from tvm.contrib import graph_runtime

m = graph_runtime.create(graph, lib, ctx)
m.set_input(**params)

# Detection threshold and non-maximum-suppression threshold
thresh, nms_thresh = 0.5, 0.45

# Auxiliary files: the class-name list and a font file
coco_name = 'coco.names'
font_name = 'arial.ttf'
coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true'
font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true'
for _url, _path in ((coco_url, coco_name), (font_url, font_name)):
    download(_url, _path)

# One name per line of the file, with surrounding whitespace stripped
with open(coco_name) as f:
    content = f.readlines()
names = [line.strip() for line in content]
# Run detection on every frame of the video and display the annotated result.
vcap = cv2.VideoCapture("myvid.mp4")
if not vcap.isOpened():
    raise IOError("Cannot open video: myvid.mp4")

# The last darknet layer does not change between frames, so look it up once.
last_layer = net.layers[net.n - 1]

try:
    while True:
        ret, frame = vcap.read()
        if not ret:
            # End of the stream (or a failed read): frame is not a valid
            # image, so stop instead of crashing in transpose() below.
            break

        # HWC -> CHW, scale to [0, 1], then reverse the channel axis
        # (OpenCV delivers BGR; reversing it gives RGB for the network).
        img = np.array(frame)
        img = img.transpose((2, 0, 1))
        img = np.divide(img, 255.0)
        img = np.flip(img, 0)
        data = nnvm.testing.darknet._letterbox_image(img, netw, neth)
        # set inputs
        m.set_input('data', tvm.nd.array(data.astype(dtype)))
        # execute
        print("Running the test image...")

        m.run()
        # get outputs: each of the 3 yolo layers contributes 4 consecutive
        # graph outputs (output, mask, biases, attributes)
        tvm_out = []
        for i in range(3):
            layer_out = {}
            layer_out['type'] = 'Yolo'
            # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
            layer_attr = m.get_output(i*4+3).asnumpy()
            layer_out['biases'] = m.get_output(i*4+2).asnumpy()
            layer_out['mask'] = m.get_output(i*4+1).asnumpy()
            out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
                         layer_attr[2], layer_attr[3])
            layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
            layer_out['classes'] = layer_attr[4]
            tvm_out.append(layer_out)

        _, im_h, im_w = img.shape
        dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh,
                                                          1, tvm_out)
        nnvm.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh)
        nnvm.testing.yolo_detection.draw_detections(img, dets, thresh, names, last_layer.classes)

        # cv2.imshow expects BGR in HWC layout: undo the channel reversal and
        # hand OpenCV a contiguous array so the colours are displayed correctly.
        display = np.ascontiguousarray(np.flip(img, 0).transpose(1, 2, 0))
        cv2.imshow('VIDEO', display)
        # Give the GUI 5 ms to refresh; pressing 'q' stops playback early.
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture device and close the preview window.
    vcap.release()
    cv2.destroyAllWindows()

But the problem is that it runs very slowly, around 5 seconds per frame. Is this normal, or have I made a mistake? I ask because with OpenVINO I can get around 7 FPS on the same CPU.

Thank you for supporting me.

Ganba

To maximize performance on CPU, you need to set the target according to your device. Since the i7 7800x has AVX-512, you can set the target to "llvm -mcpu=skylake-avx512". You also need to set opt_level to 3 to use the conv2d_NCHWc layout, which is much faster. You should be able to get a better result after setting these. Beyond that, you can get better performance by following the autotvm tutorial for x86. After the graph tuner is merged, you can get an even bigger performance boost.

1 Like