Config for OpenCL causes *** NDRANGE_KERNEL executed abnormally ***

sunzj · September 30, 2019, 2:23am

I wrote a conv2d schedule for an OpenCL device. After tuning, I got two configs. The first config worked well, but the second config caused "*** NDRANGE_KERNEL executed abnormally ***".

I dumped the OpenCL source code:

The first one:

/*
 * fuse_conv2d_clip_39_kernel0 (first tuned config).
 *
 * Each work-item accumulates 24 values in compute[]. Over 12 rc_outer steps it
 * stages data from input0 and input1 into __local buffers, then (2 rc_inner
 * steps per outer step) multiplies one staged input0 value by 24 staged input1
 * values. Finally it adds input2, clamps the sum to [0, 6] and stores it into
 * tensor.
 *
 * NOTE(review): the local indexing (get_local_id(1) * 8 + get_local_id(0),
 * rc_inner * 112, pad_temp_shared[224]) looks consistent with an 8 x 14
 * work-group -- confirm against the launch configuration.
 */
__kernel void fuse_conv2d_clip_39_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) {
float compute[24];
__local float pad_temp_shared[224];
__local float input1_shared[48];
float pad_temp_shared_local[1];
float input1_shared_local[24];
// Zero the 24 per-work-item accumulators.
for (int ff_init = 0; ff_init < 24; ++ff_init) {
compute[ff_init] = 0.000000e+00f;
}
for (int rc_outer = 0; rc_outer < 12; ++rc_outer) {
// Barrier before this iteration overwrites the __local buffers.
barrier(CLK_LOCAL_MEM_FENCE);
// Each work-item copies 2 elements from input0 into pad_temp_shared.
for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) {
pad_temp_shared[(((((int)get_local_id(1)) * 16) + ((((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8) * 8)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8))] = input0[((((((((int)get_group_id(1)) * 784) + (((int)get_group_id(0)) * 8)) + (rc_outer * 6272)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 14) * 3136)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 14) * 56)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8))];
}
// Guarded copy of one element from input1 into input1_shared. The first
// guard requires local_id(1) * 2 + local_id(0) / 2 < 24, which keeps the
// input1_shared index (twice that quantity plus local_id(0) % 2) below 48.
// The second guard bounds the same quantity, offset by group_id(2) * 24,
// against 144. Work-items failing either check store nothing.
if ((((int)get_local_id(1)) * 2) < (24 - (((int)get_local_id(0)) / 2))) {
if ((((int)get_group_id(2)) * 24) < ((144 - (((int)get_local_id(1)) * 2)) - (((int)get_local_id(0)) / 2))) {
input1_shared[(((((int)get_local_id(1)) * 4) + ((((int)get_local_id(0)) / 2) * 2)) + (((int)get_local_id(0)) % 2))] = input1[(((((((int)get_group_id(2)) * 576) + (rc_outer * 2)) + (((int)get_local_id(1)) * 48)) + ((((int)get_local_id(0)) / 2) * 24)) + (((int)get_local_id(0)) % 2))];
}
}
// Barrier between the __local stores above and the reads below.
barrier(CLK_LOCAL_MEM_FENCE);
for (int rc_inner = 0; rc_inner < 2; ++rc_inner) {
// The single staged input0 value used by this work-item for this rc_inner.
pad_temp_shared_local[0] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (rc_inner * 112))];
// Gather the 24 input1_shared values for this rc_inner (stride 2).
for (int ax0 = 0; ax0 < 24; ++ax0) {
input1_shared_local[ax0] = input1_shared[(rc_inner + (ax0 * 2))];
}
// Multiply-accumulate into the 24 accumulators.
for (int ff = 0; ff < 24; ++ff) {
compute[ff] = (compute[ff] + (pad_temp_shared_local[0] * input1_shared_local[ff]));
}
}
}
// Add input2, clamp to [0, 6] and write the 24 results; consecutive
// ax1_inner_inner values land 3136 elements apart in tensor.
for (int ax1_inner_inner = 0; ax1_inner_inner < 24; ++ax1_inner_inner) {
tensor[((((((((int)get_group_id(2)) * 75264) + (((int)get_group_id(1)) * 784)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(1)) * 56)) + ((int)get_local_id(0))) + (ax1_inner_inner * 3136))] = max(min((compute[ax1_inner_inner] + input2[((((int)get_group_id(2)) * 24) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f);
}
}

The second one:

/*
 * fuse_conv2d_clip_39_kernel0 (second tuned config).
 *
 * Each work-item accumulates 24 values in compute[]. Over 12 rc_outer steps it
 * stages data from input0 and input1 into __local buffers, then (2 rc_inner
 * steps per outer step) multiplies one staged input0 value by 24 staged input1
 * values. Finally it adds input2, clamps the sum to [0, 6] and stores it into
 * tensor.
 *
 * NOTE(review): the local indexing (get_local_id(1) * 8 + get_local_id(0),
 * rc_inner * 32, pad_temp_shared[64]) looks consistent with an 8 x 4
 * work-group -- confirm against the launch configuration.
 */
__kernel void fuse_conv2d_clip_39_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) {
float compute[24];
__local float pad_temp_shared[64];
__local float input1_shared[48];
float pad_temp_shared_local[1];
float input1_shared_local[24];
// Zero the 24 per-work-item accumulators.
for (int ff_init = 0; ff_init < 24; ++ff_init) {
compute[ff_init] = 0.000000e+00f;
}
for (int rc_outer = 0; rc_outer < 12; ++rc_outer) {
// Barrier before this iteration overwrites the __local buffers.
barrier(CLK_LOCAL_MEM_FENCE);
// Each work-item copies 2 elements from input0 into pad_temp_shared.
for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) {
pad_temp_shared[((((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 4) * 32) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 4) * 8))] = input0[((((((((int)get_group_id(1)) * 224) + (((int)get_group_id(0)) * 8)) + (rc_outer * 6272)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) / 4) * 3136)) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) % 8)) + ((((((int)get_local_id(1)) * 2) + (((((int)get_local_id(0)) * 2) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner) / 8)) % 4) * 56))];
}
// Guarded copy of up to 2 elements per work-item from input1 into
// input1_shared. The first guard requires local_id(1) * 6 + local_id(0) < 24,
// which keeps the input1_shared index (twice that quantity plus the loop
// counter) below 48. The second guard bounds the same quantity, offset by
// group_id(2) * 24, against 144. Work-items failing either check store nothing.
for (int ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1 = 0; ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1 < 2; ++ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1) {
if ((((int)get_local_id(1)) * 6) < (24 - ((int)get_local_id(0)))) {
if (((((int)get_group_id(2)) * 24) + (((int)get_local_id(1)) * 6)) < (144 - ((int)get_local_id(0)))) {
input1_shared[(((((int)get_local_id(1)) * 12) + (((int)get_local_id(0)) * 2)) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1)] = input1[(((((((int)get_group_id(2)) * 576) + (rc_outer * 2)) + (((int)get_local_id(1)) * 144)) + (((int)get_local_id(0)) * 24)) + ax0_ax1_fused_ax2_fused_ax3_fused_inner_inner_inner1)];
}
}
}
// Barrier between the __local stores above and the reads below.
barrier(CLK_LOCAL_MEM_FENCE);
for (int rc_inner = 0; rc_inner < 2; ++rc_inner) {
// The single staged input0 value used by this work-item for this rc_inner.
pad_temp_shared_local[0] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (rc_inner * 32))];
// Gather the 24 input1_shared values for this rc_inner (stride 2).
for (int ax0 = 0; ax0 < 24; ++ax0) {
input1_shared_local[ax0] = input1_shared[(rc_inner + (ax0 * 2))];
}
// Multiply-accumulate into the 24 accumulators.
for (int ff = 0; ff < 24; ++ff) {
compute[ff] = (compute[ff] + (pad_temp_shared_local[0] * input1_shared_local[ff]));
}
}
}
// Add input2, clamp to [0, 6] and write the 24 results; consecutive
// ax1_inner_inner values land 3136 elements apart in tensor.
for (int ax1_inner_inner = 0; ax1_inner_inner < 24; ++ax1_inner_inner) {
tensor[((((((((int)get_group_id(2)) * 75264) + (((int)get_group_id(1)) * 224)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(1)) * 56)) + ((int)get_local_id(0))) + (ax1_inner_inner * 3136))] = max(min((compute[ax1_inner_inner] + input2[((((int)get_group_id(2)) * 24) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f);
}
}

It seems the error is caused by the NDRange exceeding a device limit, but I am not sure. Could someone explain it? Thanks very much!

maksimon · November 19, 2021, 7:56am

Hi ~ Have you solved this issue now?

Config for OpenCL causes *** NDRANGE_KERNEL executed abnormally ***

Config for OpenCL causes *** NDRANGE_KERNEL executed abnormally ***