Input Shape Changed by TVM: leads to mismatch error


#1

Hi There

I am trying to run a frozen model on the TVM platform, and the actual input of shape [1,1200,1920,3] in NHWC format is changed to [1,3,1201,1921] ("NCHW") by TVM.

Following is the script which recreates the problem:

## Recreating input mismatch where an input of size [1,1200,1920,3] ends up being [1,3,1201,1921]
# NHWC -> NCHW
## Kshitij Srivastava
## April 8, 2019

import os
import numpy as np
import tensorflow as tf
import tvm
import tvm.relay.testing.tf as tf_testing
from tvm import relay
import ast
import argparse
from tvm import autotvm
from tvm import relay
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
import tvm.contrib.graph_runtime as runtime
import time
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from tensorflow.contrib import slim

def generate_graph(data_format=None, shape=None):
    """Build a single separable-conv TensorFlow graph and freeze it.

    The graph takes a uint8 placeholder named 'input', casts it to float32,
    multiplies it by 1/255, applies one ``slim.separable_conv2d`` layer
    (32 outputs, 3x3 kernel, stride 2, padding 'same') and exposes the result
    through an identity op named 'outc1'.

    Args:
        data_format: forwarded unchanged to ``slim.separable_conv2d``.
        shape: shape given to the 'input' placeholder.

    Returns:
        A tuple ``(frozen_graph, ['outc1'])`` where ``frozen_graph`` is the
        value returned by ``convert_variables_to_constants`` for the 'outc1'
        output node.
    """
    graph = tf.Graph()
    with graph.as_default():
        raw_input = tf.placeholder(dtype=tf.uint8, shape=shape, name='input')

        # Preprocessing: uint8 -> float32, then rescale by 1/255.
        # The ops are created in the same order as a single inline expression
        # would create them, so auto-generated node names are unaffected.
        as_float = tf.cast(raw_input, tf.float32)
        scale = tf.constant(1. / 255.)
        normalized = as_float * scale

        conv_out = slim.separable_conv2d(
            normalized,
            32,
            [3, 3],
            depth_multiplier=1,
            stride=2,
            rate=1,
            padding='same',
            data_format=data_format
        )

        # Give the output a fixed node name so it can be referenced when freezing.
        tf.identity(conv_out, 'outc1')

        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            frozen_graph = convert_variables_to_constants(sess, sess.graph_def, ['outc1'])
    return frozen_graph, ['outc1']

if __name__ == '__main__':
    # Command-line driver: builds the frozen TF graph, imports it into Relay,
    # compiles it for the selected target and runs it once on random uint8 input.
    parser = argparse.ArgumentParser(description="Run inference on a frozen model with random input values")
    parser.add_argument("--input_shape", type=ast.literal_eval, default=[1, 1200, 1920, 3], help="input shape")
    parser.add_argument("--ctx", type=str, default="gpu", help="context for target")
    # The code below compares against "NCHW"; the help text previously said "NHCW".
    parser.add_argument("--data_format", type=str, default='NHWC', help="NCHW or NHWC")
    args = parser.parse_args()

    # Some useful parameters
    dtype = 'uint8'
    input_shape = args.input_shape
    data_format = args.data_format
    if data_format == "NCHW":
        # --input_shape is given as N,H,W,C; move the last axis to position 1.
        input_shape = [input_shape[0], input_shape[3], input_shape[1], input_shape[2]]
        print(input_shape)

    if args.ctx == "cpu":
        # NOTE(review): 'x86_AVX2' does not look like an LLVM CPU name
        # (e.g. 'core-avx2') -- confirm the intended -mcpu value.
        target = 'llvm -mcpu=x86_AVX2'
        ctx = tvm.cpu(0)
        layout = None
        # target_host was previously left undefined on this branch, which made
        # the relay.build call below fail with NameError for --ctx cpu.
        target_host = None
    else:
        target = tvm.target.cuda()
        ctx = tvm.gpu(0)
        layout = 'NCHW'
        target_host = 'llvm'

    ## Creating dummy input numpy array
    # randint's upper bound is exclusive, so 256 keeps the 0..255 inclusive range
    # that the deprecated np.random.random_integers(0, 255, ...) produced.
    input_ = np.random.randint(0, 256, size=input_shape, dtype=dtype)

    graph_def, _ = generate_graph(data_format=data_format, shape=input_shape)

    net, params = tvm.relay.frontend.from_tensorflow(graph_def, shape={'input': input_shape}, layout=layout)

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, target=target, target_host=target_host, params=params)

    m = runtime.create(graph, lib, ctx)
    m.set_input('input', tvm.nd.array(input_))
    m.set_input(**params)

    ## Execute
    m.run()
    tvm_output = m.get_output(0)
    answer = tvm_output.asnumpy()
    print(np.shape(answer))

When I run the script with a batch size of 1 (python script.py --input_shape [1,1200,1920,3]), we can see that 1 has been added to both the H and W dimensions:

WARNING:autotvm:Cannot find config for target=cuda -model=unknown, workload=('conv2d', (1, 3, 600, 960, 'float32'), (32, 3, 1, 1, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great pe
rformance regression.
WARNING:autotvm:Cannot find config for target=cuda -model=unknown, workload=('depthwise_conv2d_nchw', (1, 3, 1201, 1921, 'float32'), (3, 1, 3, 3, 'float32'), (2, 2), (0, 0), (1, 1), 'float32'). A fallback configuration is used, which may bring 
great performance regression.
(1, 600, 960, 32)

However, when I run the script with a batch size greater than 1 (python script.py --input_shape [2,1200,1920,3]), I get the following error:

Traceback (most recent call last):
  File "recreating_ip_mismatch_prob.py", line 75, in <module>
    net, params = tvm.relay.frontend.from_tensorflow(graph_def, shape={'input':input_shape}, layout=layout)
  File "/tvm/python/tvm/relay/frontend/tensorflow.py", line 1719, in from_tensorflow
    sym, params = g.from_tensorflow(graph, layout, shape, outputs)
  File "/tvm/python/tvm/relay/frontend/tensorflow.py", line 1477, in from_tensorflow
    out_type = ir_pass.infer_type(node_item)
  File "/tvm/python/tvm/relay/ir_pass.py", line 353, in infer_type
    return _ir_pass.infer_type(expr, mod)
  File "/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (8) /tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f717b494381]
  [bt] (7) /tvm/build/libtvm.so(+0xb9a947) [0x7f717b05b947]
  [bt] (6) /tvm/build/libtvm.so(tvm::relay::InferType(tvm::relay::Expr const&, tvm::relay::Module const&)+0x3fd) [0x7f717b05b6fd]
  [bt] (5) /tvm/build/libtvm.so(+0xd24156) [0x7f717b1e5156]
  [bt] (4) /tvm/build/libtvm.so(+0xd236d0) [0x7f717b1e46d0]
  [bt] (3) /tvm/build/libtvm.so(tvm::relay::InferType(tvm::relay::Function const&, tvm::relay::Module const&, tvm::relay::GlobalVar const&)+0x325) [0x7f717b05bdb5]
  [bt] (2) /tvm/build/libtvm.so(+0xb9a2aa) [0x7f717b05b2aa]
  [bt] (1) /tvm/build/libtvm.so(+0xd59e73) [0x7f717b21ae73]
  [bt] (0) /tvm/build/libtvm.so(+0x831502) [0x7f717acf2502]
  [bt] (8) /tvm/build/libtvm.so(+0xd236d0) [0x7f717b1e46d0]
  [bt] (7) /tvm/build/libtvm.so(tvm::relay::InferType(tvm::relay::Function const&, tvm::relay::Module const&, tvm::relay::GlobalVar const&)+0x325) [0x7f717b05bdb5]
  [bt] (6) /tvm/build/libtvm.so(+0xb9a083) [0x7f717b05b083]
  [bt] (5) /tvm/build/libtvm.so(+0xbb7464) [0x7f717b078464]
  [bt] (4) /tvm/build/libtvm.so(+0xba9584) [0x7f717b06a584]
  [bt] (3) /tvm/build/libtvm.so(+0xc62b2b) [0x7f717b123b2b]
  [bt] (2) /tvm/build/libtvm.so(+0xbbd30f) [0x7f717b07e30f]
  [bt] (1) /tvm/build/libtvm.so(+0xbbd254) [0x7f717b07e254]
  [bt] (0) /tvm/build/libtvm.so(+0x831502) [0x7f717acf2502]
  File "/tvm/src/relay/ir/error.cc", line 112
TVMError: 
Error(s) have occurred. We have annotated the program with them:

In `main`: 
v0.0.1
%7 = fn () {
  free_var %input: Tensor[(2, 1200, 1920, 3), uint8]
  %0 = cast(%input, dtype="float32") // 
  free_var %Const: Tensor[(1,), float32]
  %1 = multiply(%0, %Const) // 
  %2 = transpose(%1, axes=[0, 3, 1, 2]) // 
  %3 = nn.pad(%2, pad_width=[[0, 0], [0, 0], [0, 1], [0, 1]]) // 
  free_var %SeparableConv2d/depthwise_weights: Tensor[(3, 3, 3, 1), float32]
  %4 = transpose(%SeparableConv2d/depthwise_weights, axes=[2, 3, 0, 1]) // 
  %5 = nn.conv2d(%3, %4, strides=[2, 2], groups=6, channels=6, kernel_size=[3, 3]) // an internal invariant was violdated while typechecking your program [18:26:43] /tvm/src/relay/pass/type_solver.cc:100: Check failed: resolved.defined(): Unabl
e to unify parent types: TensorType([6, 0, 3, 3], float32) and TensorType([3, 1, 3, 3], float32)
; 
  %6 = transpose(%5, axes=[0, 2, 3, 1]) // 
  %6
}
%7

#2

Can you check if the input shape changed problem can be isolated to the tensorflow frontend by defining the graph directly in relay?


#3

I built another container with the latest GitHub master and the problem seems to be resolved. When I use stride=1 in the conv2d operation with “same” padding, I see the intended dimension of [1,1202,1922,3], and the corresponding TVM op is %3 = nn.pad(%2, pad_width=[[0, 0], [0, 0], [1, 1], [1, 1]])

However, when I use a stride of 2 with “same” padding, it still gives me [1,1201,1921,3], with a corresponding TVM op of %3 = nn.pad(%2, pad_width=[[0, 0], [0, 0], [0, 1], [0, 1]]), but there is no error this time.

Do you expect the padding to change with the stride?


#4

Here is my updated script:

## Recreating input mismatch where an input of size [1,1200,1920,3] ends up being [1,3,1201,1921]
# NHWC -> NCHW
## Kshitij Srivastava
## April 16, 2019

import os
import numpy as np
import tensorflow as tf
import tvm
import tvm.relay.testing.tf as tf_testing
from tvm import relay
import ast
import argparse
from tvm import autotvm
from tvm import relay
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
import tvm.contrib.graph_runtime as runtime
import time
from tensorflow.python.framework.graph_util import convert_variables_to_constants
from tensorflow.contrib import slim

def generate_graph(data_format=None, shape=None):
    """Build a three-layer strided conv2d TensorFlow graph and freeze it.

    The graph takes a uint8 placeholder named 'input', casts it to float32,
    multiplies it by 1/255, then applies ``tf.nn.conv2d`` three times in a row
    with the same 3x3x3x3 'filter' variable, strides [1,2,2,1] and 'SAME'
    padding. The result is exposed through an identity op named 'outc1'.

    Args:
        data_format: accepted for interface compatibility; it is not passed to
            any op in this function.
        shape: shape given to the 'input' placeholder.

    Returns:
        A tuple ``(frozen_graph, ['outc1'])`` where ``frozen_graph`` is the
        value returned by ``convert_variables_to_constants`` for the 'outc1'
        output node.
    """
    graph = tf.Graph()
    with graph.as_default():
        raw_input = tf.placeholder(dtype=tf.uint8, shape=shape, name='input')

        # Preprocessing: uint8 -> float32, then rescale by 1/255.
        as_float = tf.cast(raw_input, tf.float32)
        scale = tf.constant(1. / 255.)
        activations = as_float * scale

        shared_filter = tf.get_variable(shape=[3, 3, 3, 3], name='filter')

        # Three stacked stride-2 convolutions, all using the same filter variable.
        for _ in range(3):
            activations = tf.nn.conv2d(activations, shared_filter, strides=[1, 2, 2, 1], padding='SAME')

        # Give the output a fixed node name so it can be referenced when freezing.
        tf.identity(activations, 'outc1')

        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            frozen_graph = convert_variables_to_constants(sess, sess.graph_def, ['outc1'])

    return frozen_graph, ['outc1']

if __name__ == '__main__':
    # Command-line driver: builds the frozen TF graph, imports it into Relay,
    # prints the Relay body, compiles it for the selected target and runs it
    # once on random uint8 input.
    parser = argparse.ArgumentParser(description="Run inference on a frozen model with random input values")
    parser.add_argument("--input_shape", type=ast.literal_eval, default=[128, 300, 300, 3], help="input shape")
    parser.add_argument("--ctx", type=str, default="gpu", help="context for target")
    # Help text previously said "NHCW", which is a typo for "NCHW".
    parser.add_argument("--data_format", type=str, default='NHWC', help="NCHW or NHWC")
    args = parser.parse_args()

    # Some useful parameters
    dtype = 'uint8'
    input_shape = args.input_shape
    data_format = args.data_format
    if args.ctx == "cpu":
        # NOTE(review): 'x86_AVX2' does not look like an LLVM CPU name
        # (e.g. 'core-avx2') -- confirm the intended -mcpu value.
        target = 'llvm -mcpu=x86_AVX2'
        ctx = tvm.cpu(0)
        layout = None
        # target_host was previously left undefined on this branch, which made
        # the relay.build call below fail with NameError for --ctx cpu.
        target_host = None
    else:
        target = tvm.target.cuda(model="1080ti")
        ctx = tvm.gpu(0)
        layout = 'NCHW'
        target_host = 'llvm'

    ## Creating dummy input numpy array
    # randint's upper bound is exclusive, so 256 keeps the 0..255 inclusive range
    # that the deprecated np.random.random_integers(0, 255, ...) produced.
    input_ = np.random.randint(0, 256, size=input_shape, dtype=dtype)

    graph_def, _ = generate_graph(data_format=data_format, shape=input_shape)
    net, params = tvm.relay.frontend.from_tensorflow(graph_def, shape={'input': input_shape}, layout=layout)
    print(net.body)
    print('===========================================================')
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, target=target, target_host=target_host, params=params)

    m = runtime.create(graph, lib, ctx)
    m.set_input('input', tvm.nd.array(input_))
    m.set_input(**params)

    ## Execute
    m.run()
    tvm_output = m.get_output(0)
    answer = tvm_output.asnumpy()
    print(np.shape(answer))

and the corresponding TVM generated model:

v0.0.1
free_var %input: Tensor[(128, 300, 300, 3), uint8]
%0 = cast(%input, dtype="float32")
free_var %Const: Tensor[(1,), float32]
%1 = multiply(%0, %Const)
%2 = transpose(%1, axes=[0, 3, 1, 2])
%3 = nn.pad(%2, pad_width=[[0, 0], [0, 0], [0, 1], [0, 1]])
free_var %filter: Tensor[(3, 3, 3, 3), float32]
%4 = transpose(%filter, axes=[3, 2, 0, 1])
%5 = nn.conv2d(%3, %4, strides=[2, 2], channels=3, kernel_size=[3, 3])
%6 = transpose(%5, axes=[0, 2, 3, 1])
%7 = transpose(%6, axes=[0, 3, 1, 2])
%8 = nn.pad(%7, pad_width=[[0, 0], [0, 0], [0, 1], [0, 1]])
%9 = transpose(%filter, axes=[3, 2, 0, 1])
%10 = nn.conv2d(%8, %9, strides=[2, 2], channels=3, kernel_size=[3, 3])
%11 = transpose(%10, axes=[0, 2, 3, 1])
%12 = transpose(%11, axes=[0, 3, 1, 2])
%13 = nn.pad(%12, pad_width=[[0, 0], [0, 0], [1, 1], [1, 1]])
%14 = transpose(%filter, axes=[3, 2, 0, 1])
%15 = nn.conv2d(%13, %14, strides=[2, 2], channels=3, kernel_size=[3, 3])
transpose(%15, axes=[0, 2, 3, 1])

The script's output is just the shape of the op's output tensor, which seems okay:

WARNING:autotvm:Cannot find config for target=cuda -model=1080ti, workload=('conv2d', (128, 3, 301, 301, 'float32'), (3, 3, 3, 3, 'float32'), (2, 2), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=cuda -model=1080ti, workload=('conv2d', (128, 3, 151, 151, 'float32'), (3, 3, 3, 3, 'float32'), (2, 2), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
WARNING:autotvm:Cannot find config for target=cuda -model=1080ti, workload=('conv2d', (128, 3, 77, 77, 'float32'), (3, 3, 3, 3, 'float32'), (2, 2), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
(128, 38, 38, 3)



#5

It is interesting that it is still trying to target a workload with off-by-1 shapes.
You can use this tutorial https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html and tune the implementation and see if the extracted workloads (tasks) also have the off-by-1 problem.


#6

Extracted workload for stride=2 is off by 1

{"v": 0.1, "r": [[0.0004028920402144772, 0.0004029741474530831, 0.00040302522520107236], 0, 1.6767456531524658, 1555614243.9415061], "i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 3, 1201, 1921], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 3, 1201, 1921, "float32"], [32, 3, 3, 3, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"e": [["tile_f", "sp", [1, 2, 16, 1]], ["tile_y", "sp", [600, 1, 1, 1]], ["tile_x", "sp", [5, 12, 16, 1]], ["tile_rc", "sp", [3, 1]], ["tile_ry", "sp", [1, 3]], ["tile_rx", "sp", [3, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]], "t": "direct", "c": null, "i": 609459219}]}```