Here is the time figure (compared to tensorflow) and code.
Is my test code missing something?
It was tested on an Nvidia Tesla 100 GPU.
Time is measured as shown below:
start = timeit.default_timer()
# run it; TODO: around index is 1030, delta time increased to 100x.
func(a, w, b)
end = timeit.default_timer()
Here is all the code:
# perf tvm conv2d
import numpy as np
import tvm
import tensorflow as tf
import topi
from topi.util import get_const_tuple
import cProfile
import timeit
import matplotlib.pyplot as plt
# Global benchmark / plotting configuration shared by both test functions below.
plt.rcParams.update({'font.size': 22})
# Directory where figures are written.
save_path = './'
# test count
count = 2000
# whether warm up for first time
warm_up = True
# Convolution workload parameters (shared by the TVM and TF tests).
h = 14        # input height
w = 14        # input width
ic = 256      # input channels
oc = 512      # output channels
k_size = 3    # square kernel size
stride = 2
batch = 1
padding = 'SAME'
# Aliases for the same workload, used by the TVM test.
in_height = h
in_width = w
kernel = k_size
in_channel = ic
num_filter = oc
# When True, dump_image() becomes a no-op and only the combined
# scatter figure (fig_together) is saved at the end of the script.
dump_all = True
# Combined figure both tests scatter their per-iteration timings onto.
fig_together = plt.figure(figsize=(14,15))
ax1 = fig_together.add_subplot(111)
plt.xlabel("count")
plt.ylabel("run time /ms")
def print_meantime(total, c, tag):
    """Print the mean per-iteration runtime, converted to milliseconds.

    total -- accumulated runtime in seconds over ``c`` iterations
    c     -- number of iterations that ``total`` covers
    tag   -- label identifying the framework being reported
    """
    mean_ms = total / c * 1000
    print(tag, ':mean run time in ms:', mean_ms)
def dump_image(xdata, ydata, tag):
    """Save a standalone line plot of per-iteration timings.

    No-op when the module-level ``dump_all`` flag is set: in that mode only
    the combined figure (``fig_together``) is written at script exit.
    """
    if not dump_all:
        standalone = plt.figure(figsize=(4, 5))
        plt.plot(xdata, ydata)
        plt.savefig(save_path + tag + '_times.png')
        plt.close(standalone)
def test_conv2d_tvm():
    """Benchmark a TVM HWCN conv2d CUDA kernel; scatter per-iteration times
    onto the shared figure.

    Fix for the ~100x spike around iteration 1030: TVM CUDA kernel launches
    are asynchronous -- ``func(a, w, b)`` only enqueues work on the device
    and returns immediately.  Timing the call without synchronizing measures
    only launch overhead until the driver's launch queue fills up, at which
    point one unlucky iteration absorbs the whole backlog and appears ~100x
    slower.  Calling ``ctx.sync()`` before reading the end timestamp makes
    every iteration measure the real kernel execution time.
    """
    # Build the conv2d workload in HWCN layout (matches the module config).
    A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A')
    W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
    B = topi.nn.conv2d_hwcn(A, W, stride, padding)
    s1 = topi.cuda.schedule_conv2d_hwcn([B])
    # Random host-side inputs, then device buffers on the CUDA context.
    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
    device = 'cuda'
    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    total = 0
    x_count = []
    y_time = []
    with tvm.build_config():
        func = tvm.build(s1, [A, W, B], device)
    # pre run to warm up (JIT compilation, driver initialization)
    if warm_up:
        for x in range(0, 20):
            func(a, w, b)
        ctx.sync()  # drain the launch queue before timing starts
    for x in range(0, count):
        start = timeit.default_timer()
        func(a, w, b)
        # Block until the kernel actually finishes; without this only the
        # asynchronous launch overhead is measured (see docstring).
        ctx.sync()
        end = timeit.default_timer()
        delta = end - start
        delta_ms = delta * 1000
        total += delta
        x_count.append(x)
        y_time.append(delta_ms)
    print_meantime(total, count, 'tvm')
    dump_image(x_count, y_time, 'tvm')
    ax1.scatter(x_count, y_time, label='tvm')
def test_conv2d_tf():
    """Benchmark tf.nn.conv2d on the GPU; scatter per-iteration times onto
    the shared figure.

    NOTE: ``sess.run`` is synchronous (it returns the computed tensor), so
    each timed interval covers the full convolution execution.
    """
    with tf.device('/device:GPU:0'):
        inp = tf.placeholder(tf.float32, shape=(batch, h, w, ic))
        feed = np.random.rand(batch, h, w, ic)
        filt = tf.Variable(tf.random_uniform([k_size, k_size, ic, oc]))
        conv = tf.nn.conv2d(inp, filt, [1, stride, stride, 1], padding)
    sess_config = tf.ConfigProto(device_count={'GPU': 1})
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        tf.global_variables_initializer().run()
        # pre run to warm up (graph setup, cuDNN autotuning)
        if warm_up:
            for _ in range(20):
                sess.run(conv, feed_dict={inp: feed})
        total = 0
        xdata = []
        times = []
        for i in range(count):
            start = timeit.default_timer()
            sess.run(conv, feed_dict={inp: feed})
            end = timeit.default_timer()
            elapsed = end - start
            total += elapsed
            xdata.append(i)
            times.append(elapsed * 1000)
        dump_image(xdata, times, 'tf')
        print_meantime(total, count, 'tf')
        ax1.scatter(xdata, times, label='tf')
if __name__ == '__main__':
    # Run both benchmarks; each scatters its timings onto fig_together,
    # which is then saved as a single comparison plot.
    test_conv2d_tf()
    test_conv2d_tvm()
    plt.legend(loc='upper left')
    fig_together.savefig(save_path + 'all_times.png')
    plt.close(fig_together)