I would like the schedule to prefetch `w`, as in the following code:
// Prologue: fill slots w[0] .. w[PREFETCH-1] from weights, advancing
// weight_addr by SUB_GROUP_SIZE after each load.
for(int pf=0; pf<PREFETCH; pf++) {
w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
}
// wi counts inner-loop iterations; wi % PREFETCH selects which slot of w
// is refilled next, so the slots are reused in rotation.
uint wi = 0;
uint kr = 0; // kr = Kernel Row
LOOP(FILTER_SIZE_Y, kr, // LOOP is a macro that unrolls the loop.
{
uint kc = 0; // kc = Kernel Column
LOOP(FILTER_SIZE_X, kc,
{
// Refill slot wi % PREFETCH with the next weight, then advance the
// address and the slot counter.
// NOTE(review): the code that consumes w is not shown in this snippet;
// presumably the slot is read before it is overwritten here -- confirm.
w[wi % PREFETCH] = weights[weight_addr];
weight_addr += SUB_GROUP_SIZE;
wi++;
});
});
// The prologue advanced weight_addr by PREFETCH * SUB_GROUP_SIZE on top of
// the advance made inside the loops, so step it back by exactly that amount.
weight_addr -= PREFETCH * SUB_GROUP_SIZE;
}
How can I achieve that? Should I use `double_buffer`?