@merrymercy,
OK, fine.
I was thinking of a union too; here is my proposal, focused on "mali", below. The patch below applies on top of the current PR.
 Yes, I touched unroll, yt, t1, and t2 on mali; it yields much better results.
 Defining tile_bnb the way it is done below ensures smooth convergence (very fast, <500 iterations) for any 3x3.
 There are no more issues on 3x3 such as `cannot prove thread.x`, or of other kinds.
 There are still issues on 5x5 and 7x7 (many early `cannot prove thread.x` messages), but it is tunable with excellent final results. For this I would like to revisit the tile_bnb proposal once more, but with conditional_knob.
I have not had time yet to check the ARM CPU side; as said, I would implement the conditional_knob first.
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -205,7 +205,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
dilation_h, dilation_w = dilation
if len(kernel.shape) == 4:

if dilation_h != 1 or dilation_w != 1:
kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
pre_computed = False
@@ -237,14 +236,52 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt
P = N * nH * nW
##### space definition begin #####
- tile_bna_candidates = [1, 2, 4, 8, 16]
+
+ ##
+ ## BNA list generator
+ # maximal range is up to CO (kernel)
+ tile_bnx_candidates = range(1, CO+1)
factors = get_factors(CO)
- cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors])
- cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
- cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
- cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
- cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
- cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
+
+ ##
+ ## BNA space
+ # factors of maximal CO (kernel) size
+ cfg.define_knob('tile_bna', [x for x in tile_bnx_candidates if x in factors])
+
+ ##
+ ## BNB list generator
+ #
+ tile_bnb = []
+ # account all tile_sizes
+ for t_size in range (2,9):
+ # P as volume of tensor @data
+ # (N=1) means P is just square
+ nH = (H + t_size - 1) // t_size
+ nW = (W + t_size - 1) // t_size
+ P = N * nH * nW
+
+ P_list = []
+ # maximal range is up to CO (kernel)
+ for bnb in range(1, max(CO, CI)+1):
+ # bnb as square should fit into to P
+ P_round = (P + bnb - 1) // bnb * bnb
+ assert P_round % bnb == 0
+ # search unique P_round//bnb fits
+ if P_round//bnb not in P_list:
+ P_list.append(P_round//bnb)
+ # store the bnb who generated
+ # the unique P_round//bnb subsquare
+ if bnb not in tile_bnb:
+ tile_bnb.append(bnb)
+ ##
+ ## BNB space
+ # all possible subsquares fitting
+ cfg.define_knob('tile_bnb', tile_bnb)
+
+ cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=256)
+ cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=256)
+ cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=32)
+ cfg.define_knob('yt', [1, 2, 4, 8, 16, 32, 64, 128, 256])
##### space definition end #####
if cfg.is_fallback: