Undefined symbol error occurred when I use TVMDSOOp

Hello, all!

I wanted to learn TVMDSOOp, so I built TVM with set(USE_TF_TVMDSOOP ON) in config.cmake. Then I ran the script prepare_and_test_tfop_module.sh and hit an undefined symbol error: tensorflow.python.framework.errors_impl.NotFoundError: /root/code/tvmdsoop/tvm/build/libtvm_dso_op.so: undefined symbol: _ZN3tvm7runtime10ModuleNode11GetFunctionERKSsb

I used the nm command to list all undefined symbols:

# path/to/tvm/apps/tf_tvmdsoop/build
nm -Du libtvm_dso_op.so

I can see many entries like these:

U _Unwind_Resume
U _ZN10tensorflow10DEVICE_CPUE
U _ZN10tensorflow10DEVICE_GPUE
U _ZN10tensorflow11GetNodeAttrERKNS_9AttrSliceEN4absl11string_viewEPNS_8DataTypeE
U _ZN10tensorflow11GetNodeAttrERKNS_9AttrSliceEN4absl11string_viewEPSs
U _ZN10tensorflow11GetNodeAttrERKNS_9AttrSliceEN4absl11string_viewEPSt6vectorIxSaIxEE
U _ZN10tensorflow11GetNodeAttrERKNS_9AttrSliceEN4absl11string_viewEPb
U _ZN10tensorflow11register_op20OpDefBuilderReceiverC1ERKNS0_19OpDefBuilderWrapperILb1EEE
U _ZN10tensorflow12OpDefBuilder4AttrESs
U _ZN10tensorflow12OpDefBuilder5InputESs
U _ZN10tensorflow12OpDefBuilder6OutputESs
U _ZN10tensorflow12OpDefBuilderC1ESs
U _ZN10tensorflow14TensorShapeRep12SlowCopyFromERKS0_
U _ZN10tensorflow14TensorShapeRep19DestructorOutOfLineEv
U _ZN10tensorflow14kernel_factory17OpKernelRegistrar12InitInternalEPKNS_9KernelDefEN4absl11string_viewESt10unique_ptrINS0_15OpKernelFactoryESt14default_deleteIS8_EE
U _ZN10tensorflow15OpKernelContext13allocate_tempENS_8DataTypeERKNS_11TensorShapeEPNS_6TensorENS_19AllocatorAttributesERKNS_20AllocationAttributesE
U _ZN10tensorflow15OpKernelContext15allocate_outputEiRKNS_11TensorShapeEPPNS_6TensorE
U _ZN10tensorflow15OpKernelContext21CtxFailureWithWarningEPKciRKNS_6StatusE
U _ZN10tensorflow15OpKernelContext5inputEi
U _ZN10tensorflow15TensorShapeBaseINS_11TensorShapeEEC2Ev
U _ZN10tensorflow16KernelDefBuilder5BuildEv
U _ZN10tensorflow16KernelDefBuilder6DeviceEPKc
U _ZN10tensorflow16KernelDefBuilderC2EPKc
U _ZN10tensorflow16KernelDefBuilderD2Ev
U _ZN10tensorflow16TensorShapeUtils9MakeShapeEPKxxPNS_11TensorShapeE
U _ZN10tensorflow22CheckNotInComputeAsyncEPNS_15OpKernelContextEPKc
U _ZN10tensorflow5OpDefD1Ev
U _ZN10tensorflow6Status12SlowCopyFromEPKNS0_5StateE
U _ZN10tensorflow6StatusC1ENS_5error4CodeEN4absl11string_viewE
U _ZN10tensorflow6TensorC1Ev
U _ZN10tensorflow6TensorD1Ev
U _ZN10tensorflow8OpKernelC2EPNS_20OpKernelConstructionE
U _ZN10tensorflow8OpKernelD2Ev
U _ZN10tensorflow8internal15LogMessageFatalC1EPKci
U _ZN10tensorflow8internal15LogMessageFatalD1Ev
U _ZN10tensorflow8internal21CheckOpMessageBuilder7ForVar2Ev
U _ZN10tensorflow8internal21CheckOpMessageBuilder9NewStringEv
U _ZN10tensorflow8internal21CheckOpMessageBuilderC1EPKc
U _ZN10tensorflow8internal21CheckOpMessageBuilderD1Ev
U _ZN10tensorflow9AttrSliceC1ERKNS_7NodeDefE
U _ZN3tvm7runtime10ModuleNode11GetFunctionERKSsb
U _ZN3tvm7runtime6Module12LoadFromFileERKSsS3_
U _ZNK10tensorflow15TensorShapeBaseINS_11TensorShapeEE4dimsEv
U _ZNK10tensorflow15TensorShapeBaseINS_11TensorShapeEE9dim_sizesEv
U _ZNK10tensorflow6Tensor10TotalBytesEv
U _ZNK10tensorflow6Tensor11tensor_dataEv
U _ZNK10tensorflow6Tensor9CheckTypeENS_8DataTypeE
U _ZNSaIcEC1Ev
U _ZNSaIcED1Ev
U _ZNSolsEi
U _ZNSolsEl
U _ZNSolsEm
U _ZNSolsEx
U _ZNSsC1EOSs
U _ZNSsC1EPKcRKSaIcE
U _ZNSsC1ERKSs
U _ZNSsC1Ev
U _ZNSsD1Ev
U _ZNSt8ios_base4InitC1Ev
U _ZNSt8ios_base4InitD1Ev
U _ZSt17__throw_bad_allocv
U _ZSt25__throw_bad_function_callv
U _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
U _ZStlsIcSt11char_traitsIcESaIcEERSt13basic_ostreamIT_T0_ES7_RKSbIS4_S5_T1_E
U _ZTIN10tensorflow8OpKernelE
U _ZTVN10__cxxabiv117__class_type_infoE
U _ZTVN10__cxxabiv120__si_class_type_infoE
U _ZTVN10tensorflow14kernel_factory17OpKernelRegistrar18PtrOpKernelFactoryE
U _ZdlPv
U _ZdlPvm
U _Znam
U _Znwm
U __assert_fail
U __cxa_atexit
U __cxa_begin_catch
U __cxa_end_catch
w __cxa_finalize
U __cxa_pure_virtual
U __cxa_rethrow
U __cxa_throw_bad_array_new_length
w __gmon_start__
U __gxx_personality_v0
U __stack_chk_fail
U clock_gettime
U cudaMemcpy
U memcpy
U memmove
U memset
U random

My programming environment:
OS: Ubuntu 16.04
cmake: 3.16.6 clang/llvm: 9

My config.cmake:

set(USE_CUDA ON)

set(USE_ROCM OFF)

set(USE_SDACCEL OFF)

set(USE_AOCL OFF)

set(USE_OPENCL OFF)

set(USE_METAL OFF)

set(USE_VULKAN OFF)

set(USE_OPENGL OFF)

set(USE_MICRO OFF)

set(USE_SGX OFF)
set(SGX_MODE "SIM")
set(RUST_SGX_SDK "/path/to/rust-sgx-sdk")

set(USE_RPC ON)

set(USE_STACKVM_RUNTIME OFF)

set(USE_GRAPH_RUNTIME ON)

set(USE_GRAPH_RUNTIME_DEBUG OFF)

set(USE_VM_PROFILER OFF)

set(USE_MICRO_STANDALONE_RUNTIME OFF)

set(USE_LLVM /root/downloads/clang_llvm_9/bin/llvm-config)

set(USE_BLAS none)

set(USE_MKL_PATH none)

set(USE_MKLDNN OFF)

set(USE_OPENMP none)

set(USE_RANDOM OFF)

set(USE_NNPACK OFF)

set(USE_TFLITE OFF)

set(USE_TENSORFLOW_PATH none)

set(USE_FLATBUFFERS_PATH none)

set(USE_EDGETPU OFF)

set(USE_CUDNN ON)

set(USE_CUBLAS ON)

set(USE_MIOPEN OFF)

set(USE_MPS OFF)

set(USE_ROCBLAS OFF)

set(USE_SORT ON)

set(USE_DNNL_CODEGEN OFF)

set(USE_ANTLR OFF)

set(USE_RELAY_DEBUG OFF)

set(USE_VTA_FSIM OFF)

set(USE_VTA_TSIM OFF)

set(USE_VTA_FPGA OFF)

set(USE_EXAMPLE_EXT_RUNTIME OFF)

set(USE_THRUST OFF)

set(USE_TF_TVMDSOOP ON)

set(USE_FALLBACK_STL_MAP OFF)

set(USE_HEXAGON_DEVICE OFF)
set(USE_HEXAGON_SDK /path/to/sdk)

LD_LIBRARY_PATH is /root/tvm/build:/root/tvm/apps/tf_tvmdsoop/build

Did I do something wrong? Please help. Thanks!

As far as I know, this is a C++ ABI (CXXABI) issue. Since libtvm_dso_op links to both libtensorflow_framework and libtvm, can you check whether they were built with the same CXXABI configuration? For TF you can just run python -c "import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags()))" to show whether _GLIBCXX_USE_CXX11_ABI=0 (or 1) is set.

Hi wrongtest, thanks for your reply.

I ran import tensorflow as tf; print(' '.join(tf.sysconfig.get_compile_flags())) and it shows -I/usr/local/lib/python3.7/site-packages/tensorflow_core/include -D_GLIBCXX_USE_CXX11_ABI=0.
I also ran import tensorflow as tf; print(' '.join(tf.sysconfig.get_link_flags())) and it shows -L/usr/local/lib/python3.7/site-packages/tensorflow_core -l:libtensorflow_framework.so.1.

I use tensorflow1.15-gpu installed by pip.

Thanks~

Then you may check your compile configuration for libtvm.so. For example,

nm libtvm.so | grep _ZN3tvm7runtime10ModuleNode11GetFunction*

In my environment, I will get either

  • “_ZN3tvm7runtime10ModuleNode11GetFunctionERKSsb” (by default)

or

  • “_ZN3tvm7runtime10ModuleNode11GetFunctionERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb” (if the C++11 ABI is enabled, i.e. _GLIBCXX_USE_CXX11_ABI=1)

In short, it should be the same as TensorFlow’s configuration (=0 for you :slight_smile: ). Otherwise, recompiling the whole of TVM with the extra compile option -D_GLIBCXX_USE_CXX11_ABI=0 may help.

I checked my configuration for libtvm.so.
nm libtvm.so | grep _ZN3tvm7runtime10ModuleNode11GetFunction*
I get: _ZN3tvm7runtime10ModuleNode11GetFunctionERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb

It seems that I already have set option -D_GLIBCXX_USE_CXX11_ABI=0 when I compiled TVM.

The problem here is that when building libtvm_dso_op.so, the build script inherits compile flags from TensorFlow (-D_GLIBCXX_USE_CXX11_ABI=0), so when it links to TVM it looks for a mangled name like “_ZN3tvm7runtime10ModuleNode11GetFunctionERKSsb” in libtvm, fails to find it, and that results in undefined symbol errors.

I’ll also try in my own environment to figure out how to make this configuration take effect. However, I am not familiar with clang…

Well, as you said, the key to this linking problem is -D_GLIBCXX_USE_CXX11_ABI=0.

I installed tensorflow-1.15 by pip install. And the binary pip packages available on the TensorFlow website are built with gcc 4 (built with --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" flags).

But my gcc version is 5.4.0, and when I built TVM I didn’t set -D_GLIBCXX_USE_CXX11_ABI=0. So there was a linking problem (undefined symbol) with libtvm_dso_op.so, because it was compiled with -D_GLIBCXX_USE_CXX11_ABI=0 and linked against both libtvm.so and libtensorflow_framework.so.1.

My solution was to build TensorFlow from source without -D_GLIBCXX_USE_CXX11_ABI=0; after that, test_tfop_module.py passed.

1 Like