CUDA kernel function fails to compile after codegen

/tmp/tmps94u7qu7/my_kernel.cu(6908): Error: Formal parameter space overflowed (4544 bytes required, max 4096 bytes allowed) in function fused_reshape_gather_nd_reshape_floor_mod_less_zeros_like_where_reshape_gather_n_17367594856123799618__kernel0

This fused function has 576 params, and it seems each param requires 8 bytes, so the total number of bytes required exceeds the 4096-byte limit.

NVIDIA suggests that we should pass a struct to avoid passing too many params. I’m not familiar with the codegen process, could this be fixed easily?

Here is an example that reproduces this error.

import tensorflow as tf
def create_split(num_branches=600):
    """Build a graph with many parallel ``less`` branches fed by one split.

    The graph holds an int64 placeholder of shape ``(1, num_branches)`` that
    is split along axis 1 into ``num_branches`` pieces. Piece ``i`` is
    compared with the Python integer ``i`` via ``tf.less``, and all comparison
    results are concatenated along axis 0 into an op named ``'y'``.

    Args:
        num_branches: Number of split outputs / comparison branches to
            create. Defaults to 600, the size used in the original
            reproduction. Expected to be a positive integer.

    Returns:
        The ``GraphDef`` of the constructed graph.
    """
    graph = tf.Graph()
    with graph.as_default():
        placeholder = tf.placeholder(shape=(1, num_branches), dtype=tf.int64)
        pieces = tf.split(placeholder, num_branches, axis=1)
        # One comparison per split output; created in index order so the
        # auto-generated op names match a plain for-loop construction.
        branches = [tf.less(pieces[i], i) for i in range(num_branches)]
        # The result is only needed as a named node in the graph, so the
        # returned tensor is not bound to a local variable.
        tf.concat(branches, axis=0, name='y')
    return graph.as_graph_def()

def load_pb(filename):
    """Read a serialized ``GraphDef`` from ``filename`` and return it.

    Args:
        filename: Path of the binary protobuf file to read.

    Returns:
        A ``tf.GraphDef`` populated from the file's contents.
    """
    graph_def = tf.GraphDef()
    with tf.gfile.FastGFile(filename, 'rb') as f:
        # The file is opened in binary mode; its whole content is parsed
        # as one serialized message.
        graph_def.ParseFromString(f.read())
    return graph_def

def save_pb(graph_def, filename):
    """Serialize ``graph_def`` and write it to ``filename`` in binary mode.

    Args:
        graph_def: Object providing ``SerializeToString()`` (here, the
            ``GraphDef`` returned by ``create_split``).
        filename: Destination path for the binary protobuf file.
    """
    serialized = graph_def.SerializeToString()
    with tf.gfile.FastGFile(filename, 'wb') as f:
        f.write(serialized)

# Alternative graph builder, left disabled (create_branches is not defined
# in this snippet):
#graph_def = create_branches()
# Build the reproduction graph and write it to 'example.pb'.
graph_def = create_split()
save_pb(graph_def, 'example.pb')
1 Like