Compiling the basic yolo function from mxnet/gluoncv used to work in 0.6.0, but since I’ve updated to 0.7.dev1 (config: cuda, cudnn, cublas, thrust), it seems like all my code has broken. As of now, compiling and testing code in python works fine, but my deployment code in C++ is broken: I keep receiving a radix_sort error that only happens when I use c++ code, but not when testing in python. Have I overlooked C++ api changes that now don’t align with the python api?
Searching around makes it seem like it may have something to do with https://github.com/apache/incubator-tvm/blob/master/src/runtime/contrib/thrust/thrust.cu#L61: thrust::sort_by_key
, but I can’t tell for sure.
Tried on 1660, 1080Ti, TitanX, Tx2: all the same result for me.
Error Output C++
terminate called after throwing an instance of 'dmlc::Error'
what(): [14:36:16] /opt/src/tvm/src/runtime/library_module.cc:78: Check failed: ret == 0 (-1 vs. 0) : radix_sort: failed on 2nd step: cudaErrorInvalidValue: invalid argument
Stack trace:
[bt] (0) ./main(dmlc::LogMessageFatal::~LogMessageFatal()+0x61) [0x41a331]
[bt] (1) /usr/local/lib/libtvm_runtime.so(+0x76aa3) [0x7fdb5537caa3]
[bt] (2) /usr/local/lib/libtvm_runtime.so(+0xe15f7) [0x7fdb553e75f7]
[bt] (3) /usr/local/lib/libtvm_runtime.so(tvm::runtime::GraphRuntime::Run()+0x47) [0x7fdb553e7687]
[bt] (4) ./main(MinimalYolo::forward_full(cv::Mat)+0x68c) [0x417a4c]
[bt] (5) ./main(main+0x492) [0x412dc2]
[bt] (6) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7fdb3b2ab830]
[bt] (7) ./main(_start+0x29) [0x413169]
Python compile + test on street_small image
import logging

import cv2  # BUG FIX: used below (imread/resize) but was never imported
import numpy as np
import mxnet as mx
from mxnet.gluon.data.vision import transforms
from gluoncv import model_zoo, data, utils

import tvm
from tvm import relay
from tvm.contrib import graph_runtime
# BUG FIX: download_testdata was called but never imported
from tvm.contrib.download import download_testdata

im_fname = download_testdata('https://github.com/dmlc/web-data/blob/master/' +
                             'gluoncv/detection/street_small.jpg?raw=true',
                             'street_small.jpg', module='data')

# ImageNet mean/std normalization — must match the constants baked into the
# C++ preprocess_image() used at deploy time.
TRANSFORM_FN = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

img = cv2.imread(im_fname)[..., ::-1]        # BGR -> RGB
img_t = cv2.resize(img, (256, 256))
img_t = TRANSFORM_FN(mx.nd.array(img_t))     # HWC uint8 -> CHW normalized float
img_n = img_t.expand_dims(0).asnumpy()       # add batch dim: (1, 3, 256, 256)

ctx = tvm.gpu(0)
target = 'cuda -libs=cudnn,cublas -model=titanx'

block = model_zoo.get_model('yolo3_mobilenet1.0_coco', pretrained=True)
mod, params = relay.frontend.from_mxnet(block, shape={'data': (1, 3, 256, 256)}, dtype='float32')
net = mod["main"]
net = relay.Function(net.params, net.body, None, net.type_params, net.attrs)
mod = tvm.IRModule.from_expr(net)
# target = tvm.target.cuda('titanx')

with tvm.transform.PassContext(opt_level=3):
    graph, lib, params = relay.build_module.build(
        mod, target=target, params=params)

module = graph_runtime.create(graph, lib, ctx)
module.set_input(**params)
# BUG FIX: the original line ended with an unbalanced extra ')' (SyntaxError).
module.set_input('data', tvm.nd.array(img_n).astype('float32'))
module.run()

# yolo3 emits three outputs: class ids, scores, bounding boxes
output0 = module.get_output(0)
output1 = module.get_output(1)
output2 = module.get_output(2)
print(output0)

# Export the three artifacts the C++ deploy code loads by suffix (.so/.json/.params).
output_name = '256.yolo.cuda.titanx'
lib.export_library(
    "{}.so".format(output_name))
print('lib export success')
with open("{}.json".format(output_name), "w") as fo:
    fo.write(graph)
print("graph export success")
with open("{}.params".format(output_name), "wb") as fo:
    fo.write(relay.save_param_dict(params))
print("params export success")
C++ deploy code using compiled lib
#include <cmath>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/opencv.hpp>
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
class MinimalYolo{
private:
std::unique_ptr<tvm::runtime::Module> detector_handle;
public:
std::string deploy_lib_path;
std::string deploy_graph_path;
std::string deploy_param_path;
bool gpu = true;
int device_id;// = 0;
int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
int device_type = kDLGPU;
int detector_width = 256;
int detector_height = 256;
int detector_total_input = 3 * detector_width * detector_height;
int in_ndim = 4;
int detector_out_ndim = 3;
int64_t tvm_id_and_score_size[3] = {1, 100, 1};
int64_t tvm_box_size[3] = {1, 100, 4};
/**
* function that reads both the yolo detector and the pose detector
*
*/
MinimalYolo(std::string detector_path) {
std::cout << "start model_config reading" << std::endl;
std::string detector_deploy_lib_path = detector_path + ".so";
std::string detector_deploy_graph_path = detector_path + ".json";
std::string detector_deploy_param_path = detector_path + ".params";
tvm::runtime::Module detector_mod_syslib = tvm::runtime::Module::LoadFromFile(detector_deploy_lib_path);
std::ifstream detector_json_in(detector_deploy_graph_path, std::ios::in);
std::string detector_json_data((std::istreambuf_iterator<char>(detector_json_in)), std::istreambuf_iterator<char>());
detector_json_in.close();
tvm::runtime::Module detector_mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(detector_json_data, detector_mod_syslib,
device_type, device_id);
this->detector_handle.reset(new tvm::runtime::Module(detector_mod));
std::ifstream detector_params_in(detector_deploy_param_path, std::ios::binary);
std::string detector_params_data((std::istreambuf_iterator<char>(detector_params_in)), std::istreambuf_iterator<char>());
detector_params_in.close();
TVMByteArray detector_params_arr;
detector_params_arr.data = detector_params_data.c_str();
detector_params_arr.size = detector_params_data.length();
tvm::runtime::PackedFunc detector_load_params = detector_mod.GetFunction("load_params");
detector_load_params(detector_params_arr);
}
/**
* \brief function to normalize an image before it's processed by the network
* \param[in] the raw cv::mat image
* \return the normalized version of the iamge.
*/
cv::Mat preprocess_image(cv::Mat frame, int width, int height, bool convert){
cv::Size new_size = cv::Size(width, height);
cv::Mat resized_image;
if (convert){
cv::Mat rgb;
cv::cvtColor(frame, rgb, cv::COLOR_BGR2RGB);
cv::resize(rgb, resized_image, new_size);
} else {
cv::resize(frame, resized_image, new_size);
}
cv::Mat resized_image_floats(new_size, CV_32FC3);
resized_image.convertTo(resized_image_floats, CV_32FC3, 1.0f/255.0f);
cv::Mat normalized_image(new_size, CV_32FC3);
cv::Mat mean(new_size, CV_32FC3, cv::Scalar(0.485, 0.456, 0.406));
cv::Mat theta(new_size, CV_32FC3, cv::Scalar(0.229, 0.224, 0.225));
cv::Mat temp;
temp = resized_image_floats - mean;
normalized_image = temp / theta;
return normalized_image;
}
/**
* \brief fminimal example of inference
* \param[in] the raw cv::mat image
*/
void forward_full(cv::Mat frame)
{
std::cout << "starting function" << std::endl;
cv::Size image_size = frame.size();
float img_height = static_cast<float>(image_size.height);
float img_width = static_cast<float>(image_size.width);
int64_t in_shape[4] = {1, 3, detector_height, detector_width};
int total_input = 3 * detector_width * detector_height;
std::cout << "width: " << detector_width << std::endl;
std::cout << "height: " << detector_height << std::endl;
std::cout << "total_input: " << total_input << std::endl;
std::cout << "device_id: " << device_id << std::endl;
std::cout << "dtype_code: " << dtype_code << std::endl;
std::cout << "dtype_bits: " << dtype_bits << std::endl;
std::cout << "dtype_lanes: " << dtype_lanes << std::endl;
std::cout << "device_type: " << device_type << std::endl;
DLTensor *output_tensor_ids;
DLTensor *output_tensor_scores;
DLTensor *output_tensor_bboxes;
DLTensor *input;
float *data_x = (float *) malloc(total_input * sizeof(float));
std::cout << "about to allocate info" << std::endl;
// allocate DLTensor memory on device for all the vars needed
TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &input);
TVMArrayAlloc(tvm_id_and_score_size, detector_out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_ids);
TVMArrayAlloc(tvm_id_and_score_size, detector_out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_scores);
TVMArrayAlloc(tvm_box_size, detector_out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_bboxes);
std::cout << "allocate info finished" << std::endl;
//copy processed image to DLTensor
std::cout << "about to preprocess" << std::endl;
cv::Mat processed_image = preprocess_image(frame, detector_width, detector_height, true);
std::cout << "preprocess finished" << std::endl;
cv::Mat split_mat[3];
cv::split(processed_image, split_mat);
memcpy(data_x, split_mat[2].ptr<float>(), processed_image.cols * processed_image.rows * sizeof(float));
memcpy(data_x + processed_image.cols * processed_image.rows, split_mat[1].ptr<float>(),
processed_image.cols * processed_image.rows * sizeof(float));
memcpy(data_x + processed_image.cols * processed_image.rows * 2, split_mat[0].ptr<float>(),
processed_image.cols * processed_image.rows * sizeof(float));
TVMArrayCopyFromBytes(input, data_x, total_input * sizeof(float));
std::cout << "TVMArrayCopyFromBytes finished" << std::endl;
// standard tvm module run
tvm::runtime::Module *mod = (tvm::runtime::Module *) detector_handle.get();
tvm::runtime::PackedFunc set_input = mod->GetFunction("set_input");
set_input("data", input);
tvm::runtime::PackedFunc run = mod->GetFunction("run");
run();
tvm::runtime::PackedFunc get_output = mod->GetFunction("get_output");
std::cout << "run/getoutput/setinput finished" << std::endl;
// https://github.com/apache/incubator-tvm/issues/979?from=timeline
TVMSynchronize(device_type, device_id, nullptr);
get_output(0, output_tensor_ids);
get_output(1, output_tensor_scores);
get_output(2, output_tensor_bboxes);
std::cout << "TVMSynchronize finished" << std::endl;
TVMArrayFree(input);
TVMArrayFree(output_tensor_ids);
TVMArrayFree(output_tensor_scores);
TVMArrayFree(output_tensor_bboxes);
input = nullptr;
output_tensor_ids = nullptr;
output_tensor_scores = nullptr;
output_tensor_bboxes = nullptr;
free(data_x);
data_x = nullptr;
}
};
int main(int argc, char** argv)
{
    // Optional CLI overrides, keeping the original hard-coded defaults:
    //   ./main [image_path] [model_basename]
    const std::string image_path = (argc > 1) ? argv[1] : "street_small.jpg";
    const std::string model_path = (argc > 2) ? argv[2] : "256.yolo.cuda.titanx";
    cv::Mat raw_image = cv::imread(image_path);
    // BUG FIX: cv::imread returns an empty Mat on failure; the original code
    // passed it straight into inference and crashed deep inside preprocessing.
    if (raw_image.empty()) {
        std::cerr << "failed to read image: " << image_path << std::endl;
        return 1;
    }
    MinimalYolo yolo(model_path);
    yolo.forward_full(raw_image);
    return 0;
}