Execute time suddenly increase to 100x of conv2d when executed 1030 times

Here is the time figure (compared to tensorflow) and code.
Is my test code missing something?
It is tested on Nvidia Tesla 100 gpu.

time is measured like below:

			start = timeit.default_timer()
			# run it; TODO: around index is 1030, delta time increased to 100x.
			func(a, w, b)
			end = timeit.default_timer()

Here is the all code

# perf tvm conv2d
import numpy as np
import tvm
import tensorflow as tf
import topi
from topi.util import get_const_tuple
import cProfile
import timeit
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 22})

save_path = './'
# test count
count = 2000
# whether warm up for first time
warm_up = True

h = 14
w = 14
ic = 256
oc = 512
k_size = 3
stride = 2

batch = 1
padding = 'SAME'

in_height = h
in_width = w
kernel = k_size
in_channel = ic
num_filter = oc

dump_all = True
fig_together = plt.figure(figsize=(14,15))
ax1 = fig_together.add_subplot(111)
plt.xlabel("count")
plt.ylabel("run time /ms")

def print_meantime(total, c, tag):
	print(tag, ':mean run time in ms:', total/c*1000)

def dump_image(xdata, ydata, tag):
	if dump_all:
		return
	fig = plt.figure(figsize=(4,5))
	plt.plot(xdata, ydata)
	plt.savefig(save_path + tag + '_times.png')
	plt.close(fig)

def test_conv2d_tvm():
	A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A')
	W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
	B = topi.nn.conv2d_hwcn(A, W, stride, padding)
	s1 = topi.cuda.schedule_conv2d_hwcn([B])
	a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
	w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
	device = 'cuda'
	ctx = tvm.context(device, 0)
	a = tvm.nd.array(a_np, ctx)
	w = tvm.nd.array(w_np, ctx)
	b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
	total = 0
	x_count = []
	y_time = []
	with tvm.build_config():
		func = tvm.build(s1, [A, W, B], device)
		# pre run to warm up
		if warm_up:
			for x in range(0, 20):
				func(a, w, b)
		for x in range(0, count):
			start = timeit.default_timer()
			# run it; TODO: around index is 1030, delta time increased to 100x.
			func(a, w, b)
			end = timeit.default_timer()
			delta = end - start
			delta_ms = delta * 1000
			total += delta
			x_count.append(x)
			y_time.append(delta_ms)
		print_meantime(total, count, 'tvm')
		dump_image(x_count, y_time, 'tvm')
		ax1.scatter(x_count, y_time, label='tvm')

def test_conv2d_tf():
	with tf.device('/device:GPU:0'):
		input = tf.placeholder(tf.float32, shape=(batch, h, w, ic))
		real_data = np.random.rand(batch, h, w, ic)
		kernel = tf.Variable(tf.random_uniform([k_size, k_size, ic, oc]))
		res = tf.nn.conv2d(input, kernel, [1, stride, stride, 1], padding)

	config = tf.ConfigProto(device_count = {'GPU': 1})
	config.gpu_options.allow_growth = True
	with tf.Session(config = config) as sess:
		tf.global_variables_initializer().run()
		# pre run to warm up
		if warm_up:
			for x in range(0, 20):
				result = sess.run(res, feed_dict={input: real_data})
		total = 0
		xdata = []
		times = []
		for x in range(0, count):
			start = timeit.default_timer()
			result = sess.run(res, feed_dict={input: real_data})
			#sess.run(res)
			end = timeit.default_timer()
			total += end - start
			delta = end - start
			time_in_ms = delta * 1000
			multi = 1
			xdata.append(x)
			times.append(time_in_ms * multi)
		dump_image(xdata, times, 'tf')
		print_meantime(total, count, 'tf')
		ax1.scatter(xdata, times, label='tf')

if __name__ == '__main__':
    test_conv2d_tf()
    test_conv2d_tvm()
    plt.legend(loc='upper left');
    fig_together.savefig(save_path + 'all' + '_times.png')
    plt.close(fig_together)

I believe by default here the execution is lazy, so there is no guarantee that the computation has finished when the call (in this case func(a, w, b)) returns. A convenient way to time tvm functions and to synchronize properly is to use a time evaluator.