I’m not quite sure why I’m receiving the below error considering the function works on the first pass, but fails on the second pass when reading from a cv::VideoCapture
. The code I have is nearly identical to
to the forward function here, https://github.com/markson14/Face-Recognition-Cpp/blob/master/facetracking.hpp (which seemingly works). C++ is still new to me, so I may be overlooking something entirely. Any help would be appreciated!
(target_host='llvm'
on an ubuntu machine with a 1080Titan and a jetson tx2 with target_host='llvm -target=aarch64-linux-gnu'
. Both are using target='cuda -libs=cudnn,cublas'
and both are failng in the same way)
Thanks,
Matt
terminate called after throwing an instance of 'dmlc::Error'
what(): [18:19:44] /home/mkrzus/github/incubator-tvm/src/runtime/module_util.cc:72: Check failed: ret == 0 (-1 vs. 0) : Assert fail: (1 == int32(arg1.shape[0])), Argument arg1.shape[0] has an unsatisfied constraint
Stack trace:
[bt] (0) ./test-func(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x4552c2]
[bt] (1) ./test-func() [0x436ce5]
[bt] (2) ./test-func() [0x435a97]
[bt] (3) ./test-func(tvm::runtime::GraphRuntime::Run()+0x47) [0x435277]
[bt] (4) ./test-func(YoloTVM::forward(cv::Mat, float)+0x6cf) [0x49d16f]
[bt] (5) ./test-func(main+0xc08) [0x432c18]
[bt] (6) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f287a05d830]
[bt] (7) ./test-func(_start+0x29) [0x433659]
...
...
yoloresults* forward(cv::Mat processed_image, float thresh)
{
using Clock = std::chrono::high_resolution_clock;
using Timepoint = Clock::time_point;
using Duration = std::chrono::duration<double>;
auto start = Clock::now();
auto tic1 = Clock::now();
cv::Mat tensor = cv::dnn::blobFromImage(processed_image, 1.0, cv::Size(), cv::Scalar(), false, false);
auto toc1 = Clock::now();
auto elapsed1 = Duration(toc1 - tic1).count();
std::cout << "image to tensor time elapsed: " << elapsed1 << std::endl;
if (!tensor.data)
printf("no data. load error");
auto yoloboxstart = Clock::now();
yoloresults* results = (yoloresults*)calloc(1, sizeof(yoloresults));
results->num = 100;
results->boxes = (bbox_result*)calloc(100, sizeof(bbox_result));
auto yoloboxend = Clock::now();
auto yoloboxend_elapsed = Duration(yoloboxend - yoloboxstart).count();
std::cout << "yolobox creation time elapsed: " << yoloboxend_elapsed << std::endl;
auto tic2 = Clock::now();
DLTensor* input = nullptr;;
DLTensor* output_tensor_ids = nullptr;
DLTensor* output_tensor_scores = nullptr;
DLTensor* output_tensor_bboxes = nullptr;
int id_and_score_size[3] = { 1, 100, 1 };
int box_size[3] = { 1, 100, 4 };
constexpr int dtype_code = kDLFloat;
constexpr int dtype_bits = 32;
constexpr int dtype_lanes = 1;
constexpr int device_type = kDLGPU;
constexpr int device_id = 0;
constexpr int in_ndim = 4;
constexpr int out_ndim = 3;
const int64_t in_shape[4] = {1, 3, 512, 512};
const int64_t tvm_id_and_score_size[3] = {1, 100, 1};
const int64_t tvm_box_size[3] = {1, 100, 4};
TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &input);//
std::cout << "TVMArrayAlloc sucess\n" << std::endl;
TVMArrayCopyFromBytes(input, tensor.data, 1*3*512*512*4);
std::cout << "TVMArrayCopyFromBytes sucess\n" << std::endl;
// not even sure this is necessary.
TVMArrayAlloc(tvm_id_and_score_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_ids);
TVMArrayAlloc(tvm_id_and_score_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_scores);
TVMArrayAlloc(tvm_box_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_bboxes);
tvm::runtime::Module *mod = (tvm::runtime::Module *) handle.get();
std::cout << "Module sucess\n" << std::endl;
tvm::runtime::PackedFunc set_input = mod->GetFunction("set_input");
std::cout << "PackedFunc sucess\n" << std::endl;
set_input("data", input);
std::cout << "set_input sucess\n" << std::endl;
//GetDataSize
std::cout << tvm::runtime::GetDataSize(*input) << " input shape\n" << std::endl;
std::cout << input->ndim << " ndim shape\n" << std::endl;
tvm::runtime::PackedFunc run = mod->GetFunction("run");
std::cout << "GetFunction sucess\n" << std::endl;
// failing on second pass here...
run();
std::cout << "run sucess\n" << std::endl;
tvm::runtime::PackedFunc get_output = mod->GetFunction("get_output");
std::cout << "getting output\n" << std::endl;
output_tensor_ids = get_output(0);
output_tensor_scores = get_output(1);
output_tensor_bboxes = get_output(2);
auto toc2 = Clock::now();
auto elapsed2 = Duration(toc2 - tic2).count();
std::cout << "tvm setup/model-runtime time elapsed: " << elapsed2 << std::endl;
auto torchboxsstart = Clock::now();
torch::Tensor ndarray_ids = torch::zeros({1, 100, 1}, at::kFloat);
torch::Tensor ndarray_scores = torch::zeros({1, 100, 1}, at::kFloat);
torch::Tensor ndarray_bboxes = torch::zeros({1, 100, 4}, at::kFloat);
auto torchboxsend = Clock::now();
auto torchboxsend_elapsed = Duration(torchboxsend - torchboxsstart).count();
std::cout << "torch inital tensor creation time elapsed: " << torchboxsend_elapsed << std::endl;
auto tvmarraycopystart = Clock::now();
std::cout << "copying output\n" << std::endl;
TVMArrayCopyToBytes(output_tensor_ids, ndarray_ids.data_ptr(), 1 * 100 * 1 * sizeof(float));
TVMArrayCopyToBytes(output_tensor_scores, ndarray_scores.data_ptr(), 1 * 100 * 1 * sizeof(float));
TVMArrayCopyToBytes(output_tensor_bboxes, ndarray_bboxes.data_ptr(), 1 * 100 * 4 * sizeof(float));
auto tvmarraycopyend = Clock::now();
auto tvmarraycopyend_elapsed = Duration(tvmarraycopyend - tvmarraycopystart).count();
std::cout << "TVMArrayCopyToBytes time elapsed: " << tvmarraycopyend_elapsed << std::endl;
auto accessorcreationstart = Clock::now();
auto ndarray_scores_a = ndarray_scores.accessor<float,3>();
auto ndarray_ids_a = ndarray_ids.accessor<float,3>();
auto ndarray_bboxes_a = ndarray_bboxes.accessor<float,3>();
auto accessorcreationend = Clock::now();
auto accessorcreationend_elapsed = Duration(accessorcreationend - accessorcreationstart).count();
std::cout << "accessor time elapsed: " << accessorcreationend_elapsed << std::endl;
int new_num = 0;
auto tic3 = Clock::now();
std::cout << "itering through tensors\n" << std::endl;
//int num = 100;
for (int i = 0; i < max_yolo_boxes; ++i) {
float xmin;
float ymin;
float xmax;
float ymax;
float score = ndarray_scores_a[0][i][0]; //TODO change 00i
float label = ndarray_ids_a[0][i][0];
if (score < thresh) continue;
if (label < 0) continue;
int cls_id = static_cast<int>(label);
xmin = ndarray_bboxes_a[0][i][0];
ymin = ndarray_bboxes_a[0][i][1];
xmax = ndarray_bboxes_a[0][i][2];
ymax = ndarray_bboxes_a[0][i][3];
results->boxes[i].xmin = xmin * (640.0/512.0); // move down to 480 space
results->boxes[i].ymin = ymin / (512.0/480.0); // move up to 640
results->boxes[i].xmax = xmax * (640.0/512.0);
results->boxes[i].ymax = ymax / (512.0/480.0);
results->boxes[i].id = cls_id;
results->boxes[i].score = score;
new_num+=1;
};
auto toc3 = Clock::now();
auto elapsed3 = Duration(toc3 - tic3).count();
std::cout << "bbox filtering/iter time elapsed: " << elapsed3 << std::endl;
results->num = new_num;
//free + delete??
auto arrayfreestart = Clock::now();
std::cout << "setting data\n" << std::endl;
TVMArrayFree(input);
std::cout << "input freed\n" << std::endl;
TVMArrayFree(output_tensor_ids);
std::cout << "ids freed\n" << std::endl;
TVMArrayFree(output_tensor_scores);
std::cout << "scores freed\n" << std::endl;
TVMArrayFree(output_tensor_bboxes);
input = nullptr;
output_tensor_ids = nullptr;
output_tensor_scores = nullptr;
output_tensor_bboxes = nullptr;
delete input;
delete output_tensor_ids;
delete output_tensor_scores;
delete output_tensor_bboxes;
auto arrayfreeend = Clock::now();
auto arrayfreeend_elapsed = Duration(arrayfreeend - arrayfreestart).count();
std::cout << "TVMArrayFree time elapsed: " << arrayfreeend_elapsed << std::endl;
std::cout << "output freed\n" << std::endl;
std::cout << "funct finished\n" << std::endl;
std::cout << "num output\n" << results->num << std::endl;
auto end = Clock::now();
auto total = Duration(end - start).count();
std::cout << "function time: " << total << std::endl;
return results;
below is the output of my test function running yolov3 on a VideoCapture.
./test-func people.jpg
frame resolution: 512*512
image to tensor time elapsed: 0.000886221
yolobox creation time elapsed: 3.237e-06
TVMArrayAlloc sucess
TVMArrayCopyFromBytes sucess
Module sucess
PackedFunc sucess
set_input sucess
3145728 input shape
4 ndim shape
GetFunction sucess
run sucess
getting output
tvm setup/model-runtime time elapsed: 0.0267213
torch inital tensor creation time elapsed: 0.0187789
copying output
TVMArrayCopyToBytes time elapsed: 0.116137
accessor time elapsed: 8.34e-07
itering through tensors
bbox filtering/iter time elapsed: 2.46e-06
setting data
input freed
ids freed
scores freed
TVMArrayFree time elapsed: 0.000176008
output freed
funct finished
num output
14
function time: 0.162808
Start Ploting with visualize score threshold:
0.3
getting info from boxes start:
id: person, scores: 0.979075/n
getting info from boxes start:
id: person, scores: 0.965895/n
getting info from boxes start:
id: person, scores: 0.869158/n
getting info from boxes start:
id: person, scores: 0.840991/n
getting info from boxes start:
id: person, scores: 0.804186/n
getting info from boxes start:
id: person, scores: 0.801172/n
getting info from boxes start:
id: person, scores: 0.751121/n
getting info from boxes start:
id: person, scores: 0.691966/n
getting info from boxes start:
id: person, scores: 0.658644/n
getting info from boxes start:
id: person, scores: 0.648285/n
getting info from boxes start:
id: person, scores: 0.579973/n
getting info from boxes start:
id: person, scores: 0.576689/n
getting info from boxes start:
id: person, scores: 0.514259/n
getting info from boxes start:
id: person, scores: 0.512077/n
image to tensor time elapsed: 0.000822714
yolobox creation time elapsed: 1.646e-06
TVMArrayAlloc sucess
TVMArrayCopyFromBytes sucess
Module sucess
PackedFunc sucess
set_input sucess
3145728 input shape
4 ndim shape
GetFunction sucess
terminate called after throwing an instance of 'dmlc::Error'
what(): [18:19:44] /home/mkrzus/github/incubator-tvm/src/runtime/module_util.cc:72: Check failed: ret == 0 (-1 vs. 0) : Assert fail: (1 == int32(arg1.shape[0])), Argument arg1.shape[0] has an unsatisfied constraint
Stack trace:
[bt] (0) ./test-func(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x4552c2]
[bt] (1) ./test-func() [0x436ce5]
[bt] (2) ./test-func() [0x435a97]
[bt] (3) ./test-func(tvm::runtime::GraphRuntime::Run()+0x47) [0x435277]
[bt] (4) ./test-func(YoloTVM::forward(cv::Mat, float)+0x6cf) [0x49d16f]
[bt] (5) ./test-func(main+0xc08) [0x432c18]
[bt] (6) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f287a05d830]
[bt] (7) ./test-func(_start+0x29) [0x433659]