Can't use GPU (ARM Mali) of Hikey 970 board?

I can’t seem to access the GPU of the Hikey 970. When I run the script below, I always get a message that says: opencl/opencl_device_api.cc:273: Using CPU OpenCL device.

First I run:

sudo python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090

On the Hikey board.

Then, on the host machine, I run the code from this tutorial:

import numpy as np

import tvm
from tvm import rpc
from tvm.contrib import util

# Length of the 1-D tensors used below, converted with tvm.convert.
n = tvm.convert(1024)
# Input tensor A of shape (n,), and output B computed element-wise as A[i] + 1.0.
A = tvm.placeholder((n,), name='A')
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
# Schedule for B; run_opencl() below creates its own schedule rather than using this one.
s = tvm.create_schedule(B.op)

# Compilation target: OpenCL with the Mali device option.
target = tvm.target.create('opencl -device=mali')

# The local build/export lines are commented out; the build happens inside run_opencl().
#func = tvm.build(s, [A, B], target=target, name='add_one')
# save the lib at a local temp folder
temp = util.tempdir()
#path = temp.relpath('lib.tar')
#func.export_library(path)


def run_opencl():
    """Build the "add one" kernel for OpenCL, run it on a remote device over RPC, and verify it.

    Uses the module-level ``A``, ``B``, ``target`` and ``temp``. The kernel is
    built locally, exported as a tarball, uploaded to the RPC server, loaded
    there, and executed on the remote OpenCL context. The result is compared
    against ``a + 1`` computed with NumPy.
    """
    # NOTE: These connection and host-target values are specific to one board
    # setup. Change them to match your own environment.
    opencl_device_host = '129.215.90.220'
    opencl_device_port = 9090
    target_host = "llvm -target=aarch64-linux-gnu"

    # Schedule for the "add one" computation: split the single axis by a
    # factor of 32, then bind the outer part to blockIdx.x and the inner part
    # to threadIdx.x.
    sched = tvm.create_schedule(B.op)
    outer, inner = sched[B].split(B.op.axis[0], factor=32)
    sched[B].bind(outer, tvm.thread_axis("blockIdx.x"))
    sched[B].bind(inner, tvm.thread_axis("threadIdx.x"))
    built = tvm.build(sched, [A, B], target=target, target_host=target_host)

    session = rpc.connect(opencl_device_host, opencl_device_port)

    # Export the compiled module locally, upload it, and load it on the remote side.
    lib_name = 'lib_cl.tar'
    local_lib = temp.relpath(lib_name)
    built.export_library(local_lib)
    session.upload(local_lib)
    remote_func = session.load_module(lib_name)

    # Allocate the input and output arrays on the remote OpenCL context and run.
    dev = session.cl()
    host_input = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(host_input, dev)
    host_output = np.zeros(1024, dtype=A.dtype)
    b = tvm.nd.array(host_output, dev)
    remote_func(a, b)

    # Check the remote result against the NumPy reference.
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    print("OpenCP test passed!")

# Run the remote OpenCL test defined above.
run_opencl()

The output is always:

[16:50:53] /home/jack/work/tvm/src/runtime/opencl/opencl_device_api.cc:273: Using CPU OpenCL device
OpenCP test passed!

I have tried building just the runtime, and also the whole of TVM, on the board with USE_OPENCL set to ON.

Any ideas?