I’m seeing a 10x performance regression between the Relay GraphRuntime and the Relay VM for a fused dense + add. Here is a script to reproduce it on a c5.18xlarge
instance:
## Relay GraphRuntime
import time

import numpy as np
import tvm
from tvm import relay
# NOTE(review): this is the *debug* runtime, which adds per-op profiling
# overhead on every run — worth confirming it doesn't skew the timing.
from tvm.contrib.debugger import debug_runtime as graph_runtime


def load_func():
    """Build a Relay module computing dense(x, x) + x on a 512x512 input."""
    x = relay.var('x', shape=(512, 512))
    result = relay.nn.dense(x, x) + x
    func = relay.Function(relay.analysis.free_vars(result), result)
    return relay.Module.from_expr(func), None


mod, params = load_func()
target = 'llvm -mcpu=skylake-avx512'

# Compile the module at the highest optimization level.
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, params=params, target=target)

data = np.random.rand(512, 512).astype('float32')
ctx = tvm.cpu(0)
model = graph_runtime.create(graph, lib, ctx)
model.set_input(**params)

num_iters = 1
input_array = tvm.nd.array(data)

# Time set_input + run; report mean latency in milliseconds.
t_begin = time.time()
for _ in range(num_iters):
    model.set_input('x', input_array)
    model.run()
t_end = time.time()
print((t_end - t_begin) / num_iters * 1000)
## RelayVM
# Fixed: removed a leftover `import pdb` debugging artifact that was never
# used, and restored conventional top-of-file import grouping.
import time

import numpy as np
import tvm
from tvm import relay


def load_func():
    """Build a Relay module computing dense(x, x) + x on a 512x512 input."""
    x = relay.var('x', shape=(512, 512))
    result = relay.nn.dense(x, x) + x
    func = relay.Function(relay.analysis.free_vars(result), result)
    return relay.Module.from_expr(func), None


mod, params = load_func()

# Compile for the VM with profiling enabled so per-op stats are available.
with relay.build_config(opt_level=3):
    compiler = relay.profiler_vm.VMCompilerProfiler()
    ctx = tvm.cpu()
    target = 'llvm -mcpu=skylake-avx512'
    vm = compiler.compile(mod, target)
    vm.init(ctx)

data = np.random.rand(512, 512).astype('float32')

nums = 1
# NOTE(review): with nums == 1 and no warm-up invocation, the measurement
# includes any first-call lazy initialization — confirm this is intended.
start = time.time()
for i in range(nums):
    res = vm.invoke("main", [data])
end = time.time()

# Per-op profiling statistics, then mean latency in milliseconds.
print(vm.get_stat())
print((end - start) / nums * 1000)
I dumped the IR right before `codegen.codegen_llvm` is called in both paths; the two IRs are identical, and the target string is also the same. What else could affect the generated packed function’s performance?