Execution time of conv2d suddenly increases ~100x around the 1030th run


#1

Here is the timing figure (compared with TensorFlow) and the code.
Is my test code missing something?
It was tested on an NVIDIA Tesla 100 GPU.

Time is measured as shown below:

			start = timeit.default_timer()
			# run it; TODO: around index 1030, delta time increases ~100x.
			func(a, w, b)
			end = timeit.default_timer()

Here is the full code:

# perf tvm conv2d
import numpy as np
import tvm
import tensorflow as tf
import topi
from topi.util import get_const_tuple
import cProfile
import timeit
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 22})

save_path = './'
# test count
count = 2000
# whether warm up for first time
warm_up = True

h = 14
w = 14
ic = 256
oc = 512
k_size = 3
stride = 2

batch = 1
padding = 'SAME'

in_height = h
in_width = w
kernel = k_size
in_channel = ic
num_filter = oc

dump_all = True
fig_together = plt.figure(figsize=(14,15))
ax1 = fig_together.add_subplot(111)
plt.xlabel("count")
plt.ylabel("run time /ms")

def print_meantime(total, c, tag):
	print(tag, 'mean run time in ms:', total / c * 1000)

def dump_image(xdata, ydata, tag):
	if dump_all:
		return
	fig = plt.figure(figsize=(4,5))
	plt.plot(xdata, ydata)
	plt.savefig(save_path + tag + '_times.png')
	plt.close(fig)

def test_conv2d_tvm():
	A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A')
	W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
	B = topi.nn.conv2d_hwcn(A, W, stride, padding)
	s1 = topi.cuda.schedule_conv2d_hwcn([B])
	a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
	w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
	device = 'cuda'
	ctx = tvm.context(device, 0)
	a = tvm.nd.array(a_np, ctx)
	w = tvm.nd.array(w_np, ctx)
	b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
	total = 0
	x_count = []
	y_time = []
	with tvm.build_config():
		func = tvm.build(s1, [A, W, B], device)
		# pre run to warm up
		if warm_up:
			for x in range(0, 20):
				func(a, w, b)
		for x in range(0, count):
			start = timeit.default_timer()
			# run it; TODO: around index 1030, delta time increases ~100x.
			func(a, w, b)
			end = timeit.default_timer()
			delta = end - start
			delta_ms = delta * 1000
			total += delta
			x_count.append(x)
			y_time.append(delta_ms)
		print_meantime(total, count, 'tvm')
		dump_image(x_count, y_time, 'tvm')
		ax1.scatter(x_count, y_time, label='tvm')

def test_conv2d_tf():
	with tf.device('/device:GPU:0'):
		input = tf.placeholder(tf.float32, shape=(batch, h, w, ic))
		real_data = np.random.rand(batch, h, w, ic)
		kernel = tf.Variable(tf.random_uniform([k_size, k_size, ic, oc]))
		res = tf.nn.conv2d(input, kernel, [1, stride, stride, 1], padding)

	config = tf.ConfigProto(device_count = {'GPU': 1})
	config.gpu_options.allow_growth = True
	with tf.Session(config = config) as sess:
		tf.global_variables_initializer().run()
		# pre run to warm up
		if warm_up:
			for x in range(0, 20):
				result = sess.run(res, feed_dict={input: real_data})
		total = 0
		xdata = []
		times = []
		for x in range(0, count):
			start = timeit.default_timer()
			result = sess.run(res, feed_dict={input: real_data})
			end = timeit.default_timer()
			delta = end - start
			total += delta
			time_in_ms = delta * 1000
			xdata.append(x)
			times.append(time_in_ms)
		dump_image(xdata, times, 'tf')
		print_meantime(total, count, 'tf')
		ax1.scatter(xdata, times, label='tf')

if __name__ == '__main__':
	test_conv2d_tf()
	test_conv2d_tvm()
	plt.legend(loc='upper left')
	fig_together.savefig(save_path + 'all' + '_times.png')
	plt.close(fig_together)

#2

I believe that by default the execution here is asynchronous, so there is no guarantee that the computation has finished when the call (in this case func(a, w, b)) returns; mostly you are timing the kernel launch. The sudden ~100x jump around iteration 1030 is likely the point where the driver's queue of pending launches fills up, so a launch finally blocks until earlier work drains. A convenient way to time TVM functions and to synchronize properly is to use a time evaluator.
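
For reference, here is a minimal sketch of both fixes, reusing func, ctx, a, w and b from the script above (number=100 is an arbitrary choice, not from the original post):

# Option 1: let TVM synchronize and average for you.
evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
print('tvm mean run time in ms:', evaluator(a, w, b).mean * 1000)

# Option 2: keep the manual loop, but block until the GPU has
# actually finished before reading the timer.
start = timeit.default_timer()
func(a, w, b)
ctx.sync()  # wait for all queued device work to complete
end = timeit.default_timer()

The evaluator returns a ProfileResult whose mean field is the average wall-clock time in seconds over the requested number of runs, measured with proper device synchronization.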