Error: runtime/module_util.cc:72: Check failed: ret == 0 (-1 vs. 0) : Assert fail: (1 == int32(arg1.shape[0])), Argument arg1.shape[0] has an unsatisfied constraint

I’m not quite sure why I’m receiving the below error considering the function works on the first pass, but fails on the second pass when reading from a cv::VideoCapture. The code I have is nearly identical to to the forward function here, https://github.com/markson14/Face-Recognition-Cpp/blob/master/facetracking.hpp (which seemingly works). C++ is still new to me, so I may be overlooking something entirely. Any help would be appreciated!

(target_host='llvm' on an ubuntu machine with a 1080Titan and a jetson tx2 with target_host='llvm -target=aarch64-linux-gnu'. Both are using target='cuda -libs=cudnn,cublas' and both are failng in the same way)

Thanks,

Matt

terminate called after throwing an instance of 'dmlc::Error'
  what():  [18:19:44] /home/mkrzus/github/incubator-tvm/src/runtime/module_util.cc:72: Check failed: ret == 0 (-1 vs. 0) : Assert fail: (1 == int32(arg1.shape[0])), Argument arg1.shape[0] has an unsatisfied constraint
Stack trace:
  [bt] (0) ./test-func(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x4552c2]
  [bt] (1) ./test-func() [0x436ce5]
  [bt] (2) ./test-func() [0x435a97]
  [bt] (3) ./test-func(tvm::runtime::GraphRuntime::Run()+0x47) [0x435277]
  [bt] (4) ./test-func(YoloTVM::forward(cv::Mat, float)+0x6cf) [0x49d16f]
  [bt] (5) ./test-func(main+0xc08) [0x432c18]
  [bt] (6) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f287a05d830]
  [bt] (7) ./test-func(_start+0x29) [0x433659]
...
...
        yoloresults* forward(cv::Mat processed_image, float thresh)
        {
            using Clock = std::chrono::high_resolution_clock;
            using Timepoint = Clock::time_point;
            using Duration = std::chrono::duration<double>;
            
            auto start = Clock::now();
            auto tic1 = Clock::now();

            cv::Mat tensor = cv::dnn::blobFromImage(processed_image, 1.0, cv::Size(), cv::Scalar(), false, false);

            auto toc1 = Clock::now();
            auto elapsed1 = Duration(toc1 - tic1).count();
            std::cout << "image to tensor time elapsed: " << elapsed1 << std::endl;

            if (!tensor.data)
                printf("no data. load error");

            auto yoloboxstart = Clock::now();
            yoloresults* results = (yoloresults*)calloc(1, sizeof(yoloresults));
            results->num = 100;
            results->boxes = (bbox_result*)calloc(100, sizeof(bbox_result));
            auto yoloboxend = Clock::now();
            auto yoloboxend_elapsed = Duration(yoloboxend - yoloboxstart).count();
            std::cout << "yolobox creation time elapsed: " << yoloboxend_elapsed << std::endl;

            auto tic2 = Clock::now();
            DLTensor* input = nullptr;;
            DLTensor* output_tensor_ids = nullptr;
            DLTensor* output_tensor_scores = nullptr;
            DLTensor* output_tensor_bboxes = nullptr;
            int id_and_score_size[3] = { 1, 100, 1 };
            int box_size[3] = { 1, 100, 4 };
            constexpr int dtype_code = kDLFloat;
            constexpr int dtype_bits = 32;
            constexpr int dtype_lanes = 1;
            constexpr int device_type = kDLGPU;
            constexpr int device_id = 0;
            constexpr int in_ndim = 4;
            constexpr int out_ndim = 3;
            const int64_t in_shape[4] = {1, 3, 512, 512};
            const int64_t tvm_id_and_score_size[3] = {1, 100, 1};
            const int64_t tvm_box_size[3] = {1, 100, 4};

            TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &input);//
            std::cout << "TVMArrayAlloc sucess\n" << std::endl;
            TVMArrayCopyFromBytes(input, tensor.data, 1*3*512*512*4);
            std::cout << "TVMArrayCopyFromBytes sucess\n" << std::endl;
     
            // not even sure this is necessary.
            TVMArrayAlloc(tvm_id_and_score_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_ids);
            TVMArrayAlloc(tvm_id_and_score_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_scores);
            TVMArrayAlloc(tvm_box_size, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &output_tensor_bboxes);

            tvm::runtime::Module *mod = (tvm::runtime::Module *) handle.get();

            std::cout << "Module sucess\n" << std::endl;
            tvm::runtime::PackedFunc set_input = mod->GetFunction("set_input");
            std::cout << "PackedFunc sucess\n" << std::endl;
            set_input("data", input);
            std::cout << "set_input sucess\n" << std::endl;
            
            //GetDataSize
            std::cout << tvm::runtime::GetDataSize(*input) << " input shape\n" << std::endl;
            std::cout << input->ndim << " ndim shape\n" << std::endl;

            tvm::runtime::PackedFunc run = mod->GetFunction("run");
            std::cout << "GetFunction sucess\n" << std::endl;
            
            // failing on second pass here...
            run();

            std::cout << "run sucess\n" << std::endl;
            tvm::runtime::PackedFunc get_output = mod->GetFunction("get_output");
            
            std::cout << "getting output\n" << std::endl;
            output_tensor_ids = get_output(0);
            output_tensor_scores = get_output(1);
            output_tensor_bboxes = get_output(2);

            auto toc2 = Clock::now();
            auto elapsed2 = Duration(toc2 - tic2).count();
            std::cout << "tvm setup/model-runtime time elapsed: " << elapsed2 << std::endl;

            auto torchboxsstart = Clock::now();
            torch::Tensor ndarray_ids = torch::zeros({1, 100, 1}, at::kFloat);
            torch::Tensor ndarray_scores = torch::zeros({1, 100, 1}, at::kFloat);
            torch::Tensor ndarray_bboxes = torch::zeros({1, 100, 4}, at::kFloat);
            auto torchboxsend = Clock::now();
            auto torchboxsend_elapsed = Duration(torchboxsend - torchboxsstart).count();
            std::cout << "torch inital tensor creation time elapsed: " << torchboxsend_elapsed << std::endl;

            auto tvmarraycopystart = Clock::now();
            std::cout << "copying output\n" << std::endl;
            TVMArrayCopyToBytes(output_tensor_ids, ndarray_ids.data_ptr(),  1 * 100 * 1 * sizeof(float));
            TVMArrayCopyToBytes(output_tensor_scores, ndarray_scores.data_ptr(),  1 * 100 * 1 * sizeof(float));
            TVMArrayCopyToBytes(output_tensor_bboxes, ndarray_bboxes.data_ptr(),  1 * 100 * 4 * sizeof(float));
            auto tvmarraycopyend = Clock::now();
            auto tvmarraycopyend_elapsed = Duration(tvmarraycopyend - tvmarraycopystart).count();
            std::cout << "TVMArrayCopyToBytes time elapsed: " << tvmarraycopyend_elapsed << std::endl;

            auto accessorcreationstart = Clock::now();
            auto ndarray_scores_a = ndarray_scores.accessor<float,3>();
            auto ndarray_ids_a = ndarray_ids.accessor<float,3>();
            auto ndarray_bboxes_a = ndarray_bboxes.accessor<float,3>();
            auto accessorcreationend = Clock::now();
            auto accessorcreationend_elapsed = Duration(accessorcreationend - accessorcreationstart).count();
            std::cout << "accessor time elapsed: " << accessorcreationend_elapsed << std::endl;

            int new_num = 0;
            auto tic3 = Clock::now();
            std::cout << "itering through tensors\n" << std::endl;
            //int num = 100;
            for (int i = 0; i < max_yolo_boxes; ++i) {
                float xmin;
                float ymin;
                float xmax;
                float ymax;

                float score = ndarray_scores_a[0][i][0]; //TODO change 00i
                float label = ndarray_ids_a[0][i][0];
                if (score < thresh) continue;
                if (label < 0) continue;

                int cls_id = static_cast<int>(label);
                xmin = ndarray_bboxes_a[0][i][0];
                ymin = ndarray_bboxes_a[0][i][1];
                xmax = ndarray_bboxes_a[0][i][2];
                ymax = ndarray_bboxes_a[0][i][3];

                results->boxes[i].xmin = xmin * (640.0/512.0); // move down to 480 space
                results->boxes[i].ymin = ymin / (512.0/480.0); // move up to 640
                results->boxes[i].xmax = xmax * (640.0/512.0);
                results->boxes[i].ymax = ymax / (512.0/480.0);
                results->boxes[i].id = cls_id;
                results->boxes[i].score = score;
                new_num+=1;
            };
            auto toc3 = Clock::now();
            auto elapsed3 = Duration(toc3 - tic3).count();
            std::cout << "bbox filtering/iter time elapsed: " << elapsed3 << std::endl;

            results->num = new_num;

            //free + delete??
            auto arrayfreestart = Clock::now();
            std::cout << "setting data\n" << std::endl;
            TVMArrayFree(input);
            std::cout << "input freed\n" << std::endl;
            TVMArrayFree(output_tensor_ids);
            std::cout << "ids freed\n" << std::endl;
            TVMArrayFree(output_tensor_scores);
            std::cout << "scores freed\n" << std::endl;
            TVMArrayFree(output_tensor_bboxes);

            input = nullptr;
            output_tensor_ids = nullptr;
            output_tensor_scores = nullptr;
            output_tensor_bboxes = nullptr;
            delete input;
            delete output_tensor_ids;
            delete output_tensor_scores;
            delete output_tensor_bboxes;

            auto arrayfreeend = Clock::now();
            auto arrayfreeend_elapsed = Duration(arrayfreeend - arrayfreestart).count();
            std::cout << "TVMArrayFree time elapsed: " << arrayfreeend_elapsed << std::endl;

            std::cout << "output freed\n" << std::endl;
            std::cout << "funct finished\n" << std::endl;
            std::cout << "num output\n" << results->num <<  std::endl;
            auto end = Clock::now();
            auto total = Duration(end - start).count();
            std::cout << "function time: " << total << std::endl;
            return results;

below is the output of my test function running yolov3 on a VideoCapture.

./test-func people.jpg                                                                                                                                          
frame resolution: 512*512
image to tensor time elapsed: 0.000886221
yolobox creation time elapsed: 3.237e-06
TVMArrayAlloc sucess

TVMArrayCopyFromBytes sucess

Module sucess

PackedFunc sucess

set_input sucess

3145728 input shape

4 ndim shape

GetFunction sucess

run sucess

getting output

tvm setup/model-runtime time elapsed: 0.0267213
torch inital tensor creation time elapsed: 0.0187789
copying output

TVMArrayCopyToBytes time elapsed: 0.116137
accessor time elapsed: 8.34e-07
itering through tensors

bbox filtering/iter time elapsed: 2.46e-06
setting data

input freed

ids freed

scores freed

TVMArrayFree time elapsed: 0.000176008
output freed

funct finished

num output
14
function time: 0.162808
Start Ploting with visualize score threshold: 
0.3
getting info from boxes start: 
id: person, scores: 0.979075/n
getting info from boxes start: 
id: person, scores: 0.965895/n
getting info from boxes start: 
id: person, scores: 0.869158/n
getting info from boxes start: 
id: person, scores: 0.840991/n
getting info from boxes start: 
id: person, scores: 0.804186/n
getting info from boxes start: 
id: person, scores: 0.801172/n
getting info from boxes start: 
id: person, scores: 0.751121/n
getting info from boxes start: 
id: person, scores: 0.691966/n
getting info from boxes start: 
id: person, scores: 0.658644/n
getting info from boxes start: 
id: person, scores: 0.648285/n
getting info from boxes start: 
id: person, scores: 0.579973/n
getting info from boxes start: 
id: person, scores: 0.576689/n
getting info from boxes start: 
id: person, scores: 0.514259/n
getting info from boxes start: 
id: person, scores: 0.512077/n
image to tensor time elapsed: 0.000822714
yolobox creation time elapsed: 1.646e-06
TVMArrayAlloc sucess

TVMArrayCopyFromBytes sucess

Module sucess

PackedFunc sucess

set_input sucess

3145728 input shape

4 ndim shape

GetFunction sucess

terminate called after throwing an instance of 'dmlc::Error'
  what():  [18:19:44] /home/mkrzus/github/incubator-tvm/src/runtime/module_util.cc:72: Check failed: ret == 0 (-1 vs. 0) : Assert fail: (1 == int32(arg1.shape[0])), Argument arg1.shape[0] has an unsatisfied constraint
Stack trace:
  [bt] (0) ./test-func(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x4552c2]
  [bt] (1) ./test-func() [0x436ce5]
  [bt] (2) ./test-func() [0x435a97]
  [bt] (3) ./test-func(tvm::runtime::GraphRuntime::Run()+0x47) [0x435277]
  [bt] (4) ./test-func(YoloTVM::forward(cv::Mat, float)+0x6cf) [0x49d16f]
  [bt] (5) ./test-func(main+0xc08) [0x432c18]
  [bt] (6) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f287a05d830]
  [bt] (7) ./test-func(_start+0x29) [0x433659]

I re-wrote the forward function to not use cv::dnn::blobFromImage and instead used a data buffer in the same way the individual from https://github.com/markson14/Face-Recognition-Cpp/blob/master/facetracking.hpp used. This seemed to do the trick and I’m no longer having the tvm-runtime error. Again, I’m new to C++ so I’m not sure I know why that worked, but it did. Hope this helps someone in the future. -Matt

            //input data
            float *data_x = (float *) malloc(total_input * sizeof(float));
            cv::Mat split_mat[3];
            cv::split(processed_image, split_mat);
            memcpy(data_x, split_mat[2].ptr<float>(), processed_image.cols * processed_image.rows * sizeof(float));
            memcpy(data_x + processed_image.cols * processed_image.rows, split_mat[1].ptr<float>(),
                   processed_image.cols * processed_image.rows * sizeof(float));
            memcpy(data_x + processed_image.cols * processed_image.rows * 2, split_mat[0].ptr<float>(),
                   processed_image.cols * processed_image.rows * sizeof(float));
            TVMArrayCopyFromBytes(input, data_x, total_input * sizeof(float));
...
//forward code
...
            //end of forward
            free(data_x);
            data_x = nullptr;
            return results;