Hi @kevinthesun,

Everything runs OK, except that the output is wrong.

cv::Mat tensor = cv::dnn::blobFromImage(inputImageAligned,1.0,cv::Size(256,256),cv::Scalar(0,0,0),true);

constexpr int device_type = kDLCPU;

constexpr int device_id = 0;

constexpr int in_ndim = 4;

//const int64_t in_shape[in_ndim] = {1, 3, 256, 256};

const int64_t in_shape[in_ndim] = {1, 256, 256, 3}; **(I did the autotuning with the NHWC layout, input_shape = (1, 256, 256, 3). Or was in_shape not set correctly for deployment?)**

TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &input);

TVMArrayCopyFromBytes(input,tensor.data,256*3*256*4);

When doing the autotuning, I got the correct output as follows:

tvm_output = module.get_output(0, tvm.nd.empty(((65536,2)), ‘float32’))

tvm_output_to_numpy = tvm_output.asnumpy()

mask_1 = tvm_output_to_numpy[:,1].reshape(256,256)

mask_2 = tvm_output_to_numpy[:,0].reshape(256,256)

**Can I get the output in the right format in the following way when deploying in C++?**

tvm::runtime::PackedFunc get_output = mod->GetFunction(“get_output”);

tvm::runtime::NDArray res = get_output(0);

cv::Mat vector(65536,2,CV_32F);

memcpy(vector.data,res->data,65536*4*2);

cv::Mat mask = vector.reshape(2, 256).clone();

Thanks a lot!