Map NNVM OP to hardware PU

I would like to compile a network model down to processing-unit macros implemented on PYNQ, such as CONV.

  • Convert the network model to an NNVM graph.
  • Override "schedule_conv2d_nchw" with a tensorized schedule that calls the external C function on PYNQ.

Question:

  • A single call to "nnvm.compiler.build" generates LLVM IR, but I am not sure whether my TVM op schedule override actually takes effect.
  • Is there a way to dump the TVM IR for the compiled NNVM graph in a more readable form?

Code snippet:

#!/usr/bin/env python
# coding: utf-8

# In[1]:

from __future__ import absolute_import, print_function

import topi
import nnvm
import tvm
import numpy as np

# Convolution parameters (not shown in the original snippet; values assumed
# from the nnvm graph built below)
batch, ic, ih, iw = 1, 3, 12, 12
oc, kh, kw = 10, 3, 3
pad, stride = 0, 1

# In[3]:

def intrin_func(ins, outs):
    ib = tvm.ir_builder.create()
    dbuf, wbuf = ins
    cbuf = outs[0]
    ib.emit(tvm.call_extern("float32", "conv_bulldog",
                            cbuf.access_ptr("w"),
                            dbuf.access_ptr("r"),
                            wbuf.access_ptr("r")))
    return ib.get()

# In[4]:

def intrin_run():
    d = tvm.placeholder((batch, ic, ih, iw), name='data')
    w = tvm.placeholder((oc, ic, kh, kw), name='weight')
    oh = (ih - kh + 2 * pad) // stride + 1
    ow = (iw - kw + 2 * pad) // stride + 1

    # Create reduction variables
    rc = tvm.reduce_axis((0, ic), name='rc')
    ry = tvm.reduce_axis((0, kh), name='ry')
    rx = tvm.reduce_axis((0, kw), name='rx')

    # Compute the convolution
    c = tvm.compute((batch, oc, oh, ow),
                    lambda bb, oo, yy, xx: tvm.sum(
                        d[bb, rc, yy * stride + ry, xx * stride + rx] * w[oo, rc, ry, rx],
                        axis=[rc, ry, rx]), name='c')

    return tvm.decl_tensor_intrin(c.op, intrin_func)

# In[5]:

def my_schedule(outs):
    s = tvm.create_schedule(outs.op)
    bb = outs.op.axis[0]
    bd_conv = intrin_run()
    s[outs].tensorize(bb, bd_conv)
    return s

fs = tvm.target.get_native_generic_func("schedule_conv2d_nchw")
fs.set_default(my_schedule)

# In[8]:

data = nnvm.symbol.Variable(name="data")
kernel = nnvm.symbol.Variable(name="kernel")
conv = nnvm.symbol.conv2d(data=data, kernel_size=(3,3), channels=10, strides=[1,1])
graph = nnvm.graph.create(conv)
shape = {"data": (1, 3, 12, 12)}

# In[9]:

dg, lib, params = nnvm.compiler.build(graph, target="llvm", shape=shape, dtype="float32")
print(lib.get_source())

You can simply add a debug print inside your schedule to see whether it is called or not. For the second question, try adding

import logging
logging.basicConfig(level=logging.DEBUG)

before nnvm.compiler.build
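
For example, a minimal sketch of both suggestions (it reuses the my_schedule override and intrin_run from the snippet above; the print is only there to confirm the override fires):

def my_schedule(outs):
    # Print at the top of the existing override to confirm it is picked up.
    print("my_schedule invoked for:", outs.op.name)
    s = tvm.create_schedule(outs.op)
    bb = outs.op.axis[0]
    s[outs].tensorize(bb, intrin_run())
    return s

import logging
logging.basicConfig(level=logging.DEBUG)  # nnvm.compiler.build then logs the lowered TVM IR

dg, lib, params = nnvm.compiler.build(graph, target="llvm", shape=shape, dtype="float32")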

@masahi thanks for the tip.

I have more questions after digging further.
To recap, I intended to tensorize the original conv2d op into an extern op with a schedule override like this:
nnvm.top.registry.register_schedule('conv2d', conv_intrin_schedule, level=11)

Then I found that the standard interface for a registered schedule takes "attrs, outs, target":
def conv_intrin_schedule(attrs, outs, target)

However, not all of the usual convolution parameters are available in attrs. For example, I cannot extract the input shape from the arguments of the registered schedule, so I cannot create the tensor intrinsic dynamically.

Any suggestion?

def conv_intrin (args):
    batch = args[0]
    ic = args[1]
    ih = args[2]
    iw = args[3]
    oc = args[4]
    kh = args[5]
    kw = args[6]
    pad = args[7]
    stride = args[8]

    d = tvm.placeholder((ic, ih, iw), name='data')
    w = tvm.placeholder((ic, kh, kw), name='weight')
    oh = (ih - kh + 2*pad) // stride + 1
    ow = (iw - kw + 2*pad) // stride + 1
    
    # Create reduction variables
    rc = tvm.reduce_axis((0, ic), name='rc')
    ry = tvm.reduce_axis((0, kh), name='ry')
    rx = tvm.reduce_axis((0, kw), name='rx')
    
    # Compute the convolution
    c = tvm.compute((oh, ow), lambda yy, xx: tvm.sum(
                    d[rc, yy * stride + ry, xx * stride + rx] * w[rc, ry, rx],
                    axis=[rc, ry, rx]), name='c')
    
    def conv_intrin_lower (ins, outs):
        ib = tvm.ir_builder.create()
        dbuf, wbuf = ins
        cbuf = outs[0]
        ib.emit(tvm.call_extern('float32', 'conv_bulldog', 
                            cbuf.access_ptr('w'),
                            dbuf.access_ptr('r'),
                            wbuf.access_ptr('r'),
                            args))
        return ib.get()
    
    return tvm.decl_tensor_intrin(c.op, conv_intrin_lower)

def conv_intrin_schedule (attrs, outs, target):
    with tvm.target.create(target):
        batch, oc, oh, ow = outs.shape
        kh, kw = attrs["kernel_size"]
        stride = attrs["strides"][0]        
        pad = attrs["padding"][0]
        ic = ?? # not part of attrs
        ih = (oh - 1) * stride - 2*pad + kh  # awkward
        iw = (ow - 1) * stride - 2*pad + kw  # awkward
        
        args = (batch, ic, ih, iw, oc, kh, kw, pad, stride)
        outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
        s = tvm.create_schedule([x.op for x in outs])
        x = outs[0]
        ti = conv_intrin(args)        
        axis = x.op.axis[0]
        s[x].tensorize(axis, ti)
        return s

To get the input channels, you can examine the shape of the convolution weight. From outs[0].op, you can traverse its input tensors (data, kernel). See here for an example of how to do this.
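
A rough sketch of that idea (assuming the conv2d compute's direct inputs are the data, possibly behind a pad stage, and the kernel; the helper name and structure here are only for illustration):

import tvm
from topi.util import get_const_tuple

def extract_conv_shapes(conv_op):
    data, kernel = conv_op.input_tensors
    # Step over a padding stage if one was inserted in front of the data.
    if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
        data = data.op.input_tensors[0]
    batch, ic, ih, iw = get_const_tuple(data.shape)
    oc, _, kh, kw = get_const_tuple(kernel.shape)  # input channels come from the weight shape
    return batch, ic, ih, iw, oc, kh, kw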

@masahi thanks. I can move further, but I am stuck on one error. I suspect it is related to an unsuccessful tensorize, but I have no clue.

> def conv_intrin (args):
>     batch = args[0]
>     ic = args[1]
>     ih = args[2]
>     iw = args[3]
>     oc = args[4]
>     kh = args[5]
>     kw = args[6]
>     pad = args[7]
>     stride = args[8]
> 
>     d = tvm.placeholder((ic, ih, iw), name='data')
>     w = tvm.placeholder((ic, kh, kw), name='weight')
>     oh = (ih - kh + 2*pad) // stride + 1
>     ow = (iw - kw + 2*pad) // stride + 1
>     
>     # Create reduction variables
>     rc = tvm.reduce_axis((0, ic), name='rc')
>     ry = tvm.reduce_axis((0, kh), name='ry')
>     rx = tvm.reduce_axis((0, kw), name='rx')
>     
>     # Compute the convolution
>     c = tvm.compute((oh, ow), lambda yy, xx: tvm.sum(
>                     d[rc, yy * stride + ry, xx * stride + rx] * w[rc, ry, rx],
>                     axis=[rc, ry, rx]), name='c')
>     
>     def conv_intrin_lower (ins, outs):
>         ib = tvm.ir_builder.create()
>         dbuf, wbuf = ins
>         cbuf = outs[0]
>         ib.emit(tvm.call_extern('float32', 'conv_bulldog', 
>                             cbuf.access_ptr('w'),
>                             dbuf.access_ptr('r'),
>                             wbuf.access_ptr('r'),
>                             args))
>         return ib.get()
>     
>     return tvm.decl_tensor_intrin(c.op, conv_intrin_lower)
> 
> def conv_intrin_schedule (attrs, outs, target):
>     with tvm.target.create(target):
>         s = tvm.create_schedule([x.op for x in outs])
>         scheduled_ops = []
>         
>         def traverse(op):
>             print("OP tag: ", op.tag)
>             if 'broadcast' in op.tag:
>                 if op not in s.outputs:
>                     s[op].compute_inline()
>                 for tensor in op.input_tensors:
>                     if tensor.op.input_tensors and tensor.op not in scheduled_ops:
>                         traverse(tensor.op)            
>             
>             if 'conv2d_nchw' in op.tag:
>                 output = op.output(0)
>                 conv_out = op.input_tensors[0]
>                 data_vec = conv_out.op.input_tensors[0]
>                 data = data_vec.op.input_tensors[0]
>                 data_pad = None
>                 if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
>                     data_pad = data
>                     data = data_pad.op.input_tensors[0]                
>                 
>                 batch, ic, ih, iw = data.shape
>                 _, oc, _, _ = output.shape
>                 kh, kw = attrs.get_int_tuple("kernel_size")
>                 stride, _ = attrs.get_int_tuple("strides")
>                 pad, _ = attrs.get_int_tuple("padding")
>                 args = (batch, ic, ih, iw, oc, kh, kw, pad, stride)
>                 print("args: ", args)
>                 ti = conv_intrin(args)
>                 axis = conv_out.op.axis[0]
>                 s[conv_out].tensorize(axis, ti)
>             scheduled_ops.append(op)
> 
>         traverse(outs[0].op)
>         print("scheduled ops: ", scheduled_ops)
>         return s
> 
> data = nnvm.symbol.Variable(name='data')
> kernel = nnvm.symbol.Variable(name='kernel')
> conv = nnvm.symbol.conv2d(data=data, weight=kernel, kernel_size=(3,3), channels=1, strides=[1,1])
> graph = nnvm.graph.create(conv)
> shape = {'data': (1, 3, 12, 12), 'kernel': (1, 3, 3, 3)}
> 
> nnvm.top.registry.register_schedule('conv2d', conv_intrin_schedule, level=11)
> nnvm.top.registry.register_pattern('conv2d', nnvm.top.registry.OpPattern.OPAQUE, level=11)
> 
> dg, mhost2, params = nnvm.compiler.build(graph, target='llvm', shape=shape, dtype='float32')
> print(dg.ir())
> print(mhost2.get_source())
> mhost2.export_library('./conv2_graph.so')
> 
> DEBUG:autotvm:Finish loading 35 records
> WARNING:autotvm:Cannot find config for target=llvm, workload=('conv2d', (1, 3, 12, 12, 'float32'), (1, 3, 3, 3, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
> 
> ('OP tag: ', 'broadcast')
> ('OP tag: ', 'conv2d_nchw')
> ('args: ', (1, 3, 12, 12, 1, 3, 3, 0, 1))
> 
> ---------------------------------------------------------------------------
> NNVMError                                 Traceback (most recent call last)
> <ipython-input-5-1df80d347f1e> in <module>()
>       2 nnvm.top.registry.register_pattern('conv2d', nnvm.top.registry.OpPattern.OPAQUE, level=11)
>       3 
> ----> 4 dg, mhost2, params = nnvm.compiler.build(graph, target='llvm', shape=shape, dtype='float32')
>       5 print(dg.ir())
>       6 print(mhost2.get_source())
> 
> /home/sxie/projects/tvm/nnvm/python/nnvm/compiler/build_module.pyc in build(graph, target, shape, dtype, params, target_host, layout)
>     303         graph = graph.apply("GraphFuse")
>     304         with target:
> --> 305             graph = graph.apply("GraphCompile")
>     306         libmod = graph_attr._move_out_module(graph, "module")
>     307         # Write variable initial values into params
> 
> /home/sxie/projects/tvm/nnvm/python/nnvm/graph.pyc in apply(self, passes)
>     232         ghandle = GraphHandle()
>     233         npass = nn_uint(len(passes))
> --> 234         check_call(_LIB.NNGraphApplyPasses(self.handle, npass, cpass, ctypes.byref(ghandle)))
>     235         return Graph(ghandle)
>     236 
> 
> /home/sxie/projects/tvm/nnvm/python/nnvm/_base.pyc in check_call(ret)
>      73     """
>      74     if ret != 0:
> ---> 75         raise NNVMError(py_str(_LIB.NNGetLastError()))
>      76 
>      77 def c_str(string):
> 
> NNVMError: TVMCall CFunc Error:
> Traceback (most recent call last):
>   File "/home/sxie/projects/tvm/python/tvm/_ffi/_ctypes/function.py", line 55, in cfun
>     rv = local_pyfunc(*pyargs)
>   File "<ipython-input-3-20d6fcc639c9>", line 37, in conv_intrin_schedule
>     traverse(outs[0].op)
>   File "<ipython-input-3-20d6fcc639c9>", line 13, in traverse
>     traverse(tensor.op)
>   File "<ipython-input-3-20d6fcc639c9>", line 32, in traverse
>     ti = conv_intrin(args)
>   File "<ipython-input-2-2530259ff3f5>", line 39, in conv_intrin
>     return tvm.decl_tensor_intrin(c.op, conv_intrin_lower)
>   File "/home/sxie/projects/tvm/python/tvm/tensor_intrin.py", line 109, in decl_tensor_intrin
>     body = fcompute(binds_list[:len(inputs)], binds_list[len(inputs):])
>   File "<ipython-input-2-2530259ff3f5>", line 36, in conv_intrin_lower
>     args))
>   File "/home/sxie/projects/tvm/python/tvm/intrin.py", line 154, in call_extern
>     dtype, func_name, convert(args), _Call.Extern, None, 0)
>   File "/home/sxie/projects/tvm/python/tvm/_ffi/_ctypes/function.py", line 185, in __call__
>     ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
>   File "/home/sxie/projects/tvm/python/tvm/_ffi/base.py", line 72, in check_call
>     raise TVMError(py_str(_LIB.TVMGetLastError()))
> TVMError: [12:29:51] /home/sxie/projects/tvm/include/tvm/packed_func_ext.h:123: Check failed: NodeTypeChecker<TNodeRef>::Check(sptr.get()) Expected type array<Expr> but get Array
> 
> Stack trace returned 10 entries:
> [bt] (0) /home/sxie/projects/tvm/build/libtvm.so(dmlc::StackTrace[abi:cxx11](unsigned long)+0x1f5) [0x7fad54668465]
> [bt] (1) /home/sxie/projects/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x3e) [0x7fad546690ee]
> [bt] (2) /home/sxie/projects/tvm/build/libtvm.so(tvm::Array<HalideIR::Expr, void> tvm::runtime::TVMArgValue::AsNodeRef<tvm::Array<HalideIR::Expr, void> >() const+0x33c) [0x7fad5469084c]
> [bt] (3) /home/sxie/projects/tvm/build/libtvm.so(+0x2cefc7) [0x7fad54687fc7]
> [bt] (4) /home/sxie/projects/tvm/build/libtvm.so(TVMFuncCall+0x5e) [0x7fad54cd707e]
> [bt] (5) /home/sxie/projects/python2/anaconda2/envs/ml-suite/lib/python2.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fad744bcec0]
> [bt] (6) /home/sxie/projects/python2/anaconda2/envs/ml-suite/lib/python2.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fad744bc87d]
> [bt] (7) /home/sxie/projects/python2/anaconda2/envs/ml-suite/lib/python2.7/lib-dynload/_ctypes.so(_ctypes_callproc+0x4de) [0x7fad746d38de]
> [bt] (8) /home/sxie/projects/python2/anaconda2/envs/ml-suite/lib/python2.7/lib-dynload/_ctypes.so(+0x9b31) [0x7fad746c9b31]
> [bt] (9) /home/sxie/projects/python2/anaconda2/envs/ml-suite/bin/../lib/libpython2.7.so.1.0(PyObject_Call+0x43) [0x7fad76816973]

I found the answer in this post.
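
For reference, the traceback points at the call_extern call, and the check failure ("Expected type array<Expr> but get Array") suggests the whole Python tuple args is being passed as a single argument and converted into an array nested inside the argument array. If that is indeed the cause, unpacking the tuple so that each convolution parameter becomes its own scalar argument would look roughly like this (a sketch, not taken from the linked post):

ib.emit(tvm.call_extern('float32', 'conv_bulldog',
                        cbuf.access_ptr('w'),
                        dbuf.access_ptr('r'),
                        wbuf.access_ptr('r'),
                        *args))  # note the *: each parameter is passed as a separate argument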