I am confused that neither the CPU nor the GPUs are busy during tuning.
I registered five NVIDIA T4 GPUs, following the steps from the tutorial, but nvidia-smi
shows that the GPUs are idle most of the time and only one GPU is ever used, which looks strange.
Here is my code snippet:
#### DEVICE CONFIG ####
# Tune and compile for the CUDA target.
target = tvm.target.cuda()

#### TUNING OPTION ####
network = 'aa'
dtype = 'float32'
# Tuning records are stored in "<network>.log".
log_file = "{}.log".format(network)

# Options handed to tune_and_evaluate(), which unpacks them as keyword
# arguments for tune_tasks().
tuning_option = dict(
    log_filename=log_file,
    tuner='xgb',
    n_trial=1500,
    early_stopping=400,
    measure_option=autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        # Local (non-RPC) alternative, kept for reference:
        # runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        runner=autotvm.RPCRunner(
            'T4',  # change the device key to your key
            '0.0.0.0', 9190,
            number=20, repeat=3, timeout=4, min_repeat_ms=150),
    ),
)
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    """Tune each task in turn and keep the best records in ``log_filename``.

    Tasks are visited in reverse order. Every measurement is appended to a
    temporary ``<log_filename>.tmp`` file; once all tasks are tuned, the best
    records are picked into ``log_filename`` and the temporary file is removed.

    Parameters
    ----------
    tasks : sequence of autotvm tasks
        Tasks to tune; each must expose ``config_space``.
    measure_option : dict
        Measurement options forwarded unchanged to ``tuner_obj.tune``.
    tuner : str
        One of 'xgb', 'xgb-rank', 'ga', 'random' or 'gridsearch'.
    n_trial : int
        Upper bound on trials per task; capped at the size of the task's
        config space.
    early_stopping : int or None
        Forwarded unchanged to ``tuner_obj.tune``.
    log_filename : str
        Destination file for the best records.
    use_transfer_learning : bool
        When true, records already written to the temporary log by earlier
        tasks are loaded into each new tuner before it starts.

    Raises
    ------
    ValueError
        If ``tuner`` is not one of the supported names (raised when the first
        task is processed).
    """
    # Start from a clean temporary log so stale records are never picked up.
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        # The number of trials actually run is capped by the config space, so
        # the progress bar must use the same capped total; otherwise it reports
        # the wrong total for tasks with small search spaces.
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=tsk_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)
def tune_and_evaluate(tuning_opt):
    """Quantize the network, tune its conv2d tasks, then build and time it.

    Reads the module-level ``network``, ``target``, ``dtype`` and ``log_file``
    settings.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_tasks``. If it contains
        ``'log_filename'``, that file is also the one the tuned records are
        read back from for compilation; otherwise the module-level
        ``log_file`` is used.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, input_shape, _out_shape = get_network(network, batch_size=1)
    with relay.quantize.qconfig(store_lowbit_output=False):
        mod['main'] = relay.quantize.quantize(mod['main'], params=params)
    tasks = autotvm.task.extract_from_program(mod['main'], target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # Re-create tasks with the 'int8' template when both channel counts are
    # multiples of 4.
    # NOTE(review): workload[1] is presumably the kernel shape
    # (out_channels, in_channels, ...) -- confirm against the task workload layout.
    for i, tsk in enumerate(tasks):
        input_channel = tsk.workload[1][1]
        output_channel = tsk.workload[1][0]
        if output_channel % 4 == 0 and input_channel % 4 == 0:
            tasks[i] = autotvm.task.create(tsk.name, tsk.args,
                                           tsk.target, tsk.target_host, 'int8')

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # Read the records back from the same file the tuner wrote to; using the
    # global log_file unconditionally would pick up the wrong file whenever
    # tuning_opt names a different log.
    best_log = tuning_opt.get('log_filename', log_file)

    # compile kernels with history best records
    with autotvm.apply_history_best(best_log):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
# Run the whole pipeline (task extraction, tuning, build, timing) with the
# options in tuning_option.
tune_and_evaluate(tuning_option)