diff --git a/imperative/python/test/unit/xla/functional/test_communicate.py b/imperative/python/test/unit/xla/functional/test_communicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7032de5ff570bdb3a383fabd97615ea1ecfc733c
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_communicate.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+import megengine.distributed as dist
+import megengine.functional.distributed as fdist
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.distributed.helper import (
+    get_offsets,
+    param_pack_concat,
+    param_pack_split,
+)
+
+
+def test_param_pack_concat():
+    def tester(ishapes, dtype=None):
+        dtype = dtype or np.float32
+        inps = [tensor(np.random.randn(*ishape), dtype=dtype) for ishape in ishapes]
+        offset_vals = get_offsets(ishapes)
+        offsets = tensor(offset_vals, dtype="int32")
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(*inps, offsets):
+            return param_pack_concat(inps, offsets, offset_vals)
+
+        mge_rst = func(*inps, offsets=offsets)
+        xla_rst = func(*inps, offsets=offsets)
+        np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester(ishapes=((1,),))
+    tester(ishapes=((1, 2),))
+    tester(ishapes=((1,), (2,)))
+    tester(ishapes=((1,), (2, 3), (4, 5, 6), (1,), (3, 2)))
+
+
+def test_param_pack_split():
+    def tester(ishapes, dtype=None):
+        dtype = dtype or np.float32
+        offset_vals = get_offsets(ishapes)
+        inp = tensor(np.random.randn(offset_vals[-1]), dtype=dtype)
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp):
+            return param_pack_split(inp, offset_vals, ishapes)
+
+        mge_rsts = func(inp)
+        xla_rsts = func(inp)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester(ishapes=((1,),))
+    tester(ishapes=((1, 2),))
+    tester(ishapes=((1,), (2,)))
+    tester(ishapes=((1,), (2, 3), (4, 5, 6), (1,), (3, 2)))
+
+
+# @pytest.mark.require_ngpu(2)
+# @pytest.mark.isolated_distributed
+def _test_all_reduce():
+    def tester(reduce_func, ishape, n_gpus, dtype=None):
+        @dist.launcher(n_gpus=n_gpus)
+        def worker(data):
+            rank = dist.get_rank()
+            inp = tensor(data[rank])
+
+            @jit.trace(without_host=True, use_xla=True)
+            def func(inp):
+                return reduce_func(inp)
+
+            mge_rst = func(inp)
+            xla_rst = func(inp)
+
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+        x = np.random.randn(*ishape).astype(dtype)
+        y = np.random.randn(*ishape).astype(dtype)
+        data = (x, y)
+        worker(data)
+
+    for func in [fdist.all_reduce_sum, fdist.all_reduce_min, fdist.all_reduce_max]:
+        tester(func, (1,), 2)
+        tester(func, (1, 1, 1), 2)
+        tester(func, (16, 1, 64,), 2)
+        tester(func, (16, 32, 64,), 2)
+
+
+if __name__ == "__main__":
+    # test_param_pack_concat()
+    # test_param_pack_split()
+    _test_all_reduce()
diff --git a/imperative/python/test/unit/xla/functional/test_elemwise.py b/imperative/python/test/unit/xla/functional/test_elemwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..57dff453c1f18fdd6a02031963c807f10ce426c8
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_elemwise.py
@@ -0,0 +1,108 @@
+import numpy as np
+
+import megengine as mge
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_elemwise():
+    np.random.seed(123)
+    mge.random.seed(123)
+
+    def tester(felemwise, *inp_shapes, backward=True, dtype=None, atol=1e-5):
+        dtype = dtype or np.float32
+        inps = [
+            tensor(0.1 * np.random.randn(*inp_shape), dtype=dtype)
+            for inp_shape in inp_shapes
+        ]
+        doup = tensor(0.1 * np.random.randn(*felemwise(*inps).shape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inps, doup):
+            gm.attach(inps)
+            with gm:
+                oup = felemwise(*inps)
+                if backward:
+                    gm.backward(oup, doup)
+                    return [oup, *[inp.grad for inp in inps]]
+                else:
+                    return [oup]
+
+        mge_rsts = func(inps, doup)
+        xla_rsts = func(inps, doup)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=atol)
+
+    tester(F.neg, (4, 16, 12, 12), dtype=np.float32, atol=1e-5)
+    tester(F.abs, (2, 32, 16), dtype=np.float32, atol=1e-5)
+    tester(F.tanh, (4, 16, 3, 1), backward=False, dtype=np.float32, atol=1e-5)
+    tester(F.exp, (2, 8), dtype=np.float32, atol=1e-5)
+    tester(F.sqrt, (32,), dtype=np.float32, atol=1e-5)
+    tester(F.log, (8, 8, 16), dtype=np.float32, atol=1e-5)
+    tester(F.relu, (1,), dtype=np.float32, atol=1e-5)
+    tester(F.gelu, (4, 16, 12, 12), dtype=np.float32, atol=2e-5)
+
+    tester(F.add, (4, 16, 12, 12), (4, 16, 12, 12), dtype=np.float32, atol=1e-5)
+    tester(F.sub, (4, 16, 12, 12), (4, 16, 1, 1), dtype=np.float32, atol=1e-5)
+    tester(F.mul, (4, 16, 12, 12), (1, 1, 12, 12), dtype=np.float32, atol=1e-5)
+    tester(
+        F.div,
+        (4, 16, 1, 1),
+        (4, 16, 12, 12),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+    tester(F.pow, (4, 1, 12, 12), (1, 16, 12, 12), dtype=np.float32, atol=1e-5)
+
+    tester(
+        F.equal, (4, 16, 12, 12), (1, 1), backward=False, dtype=np.float32, atol=1e-5
+    )
+    tester(
+        F.not_equal,
+        (4, 16, 12, 12),
+        (4, 16, 1, 1),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+    tester(
+        F.greater,
+        (4, 16, 1, 1),
+        (4, 16, 12, 12),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+    tester(
+        F.greater_equal,
+        (16, 1, 1),
+        (4, 16, 12, 12),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+    tester(
+        F.less,
+        (4, 16, 12, 1),
+        (4, 16, 12, 12),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+    tester(
+        F.less_equal,
+        (1, 1, 12, 12),
+        (4, 16, 12, 12),
+        backward=False,
+        dtype=np.float32,
+        atol=1e-5,
+    )
+
+
+if __name__ == "__main__":
+    test_elemwise()
diff --git a/imperative/python/test/unit/xla/functional/test_indexing.py b/imperative/python/test/unit/xla/functional/test_indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..477a3f3fd8d2c9a55bc01df94483300443d9bf28
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_indexing.py
@@ -0,0 +1,146 @@
+import numpy as np
+
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_subtensor():
+    def tester(ishape, index, dtype=None):
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        oshape = inp[index].shape
+        dout = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, capture_as_const=True, use_xla=True)
+        def func(inp, dout):
+            gm.attach([inp])
+            with gm:
+                out = inp[index]
+                gm.backward(out, dout)
+            return out, inp.grad
+
+        mge_rsts = func(inp, dout)
+        xla_rsts = func(inp, dout)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester(
+        (16, 32, 64, 128), (10, slice(3, 13, 1), slice(-12, -3, 2), slice(None, 13, 3),)
+    )
+    tester(
+        (16, 32, 64, 128),
+        (slice(3, None, 1), slice(5, None, 3), slice(None, 13, 1), slice(None, 18, 4),),
+    )
+    tester(
+        (16, 32, 64, 128),
+        (slice(None, None, 1), None, slice(None, None, 5), slice(-12, -3, 1),),
+    )
+    tester(
+        (16, 32, 1, 128),
+        (slice(-12, -3, 2), slice(-13, None, 1), 0, slice(-12, None, 3),),
+    )
+    tester(
+        (16, 32, 64, 128),
+        (slice(None, -4, 1), 18, slice(None, -3, 4), slice(None, -3, 1),),
+    )
+    tester((16, 32, 64, 128), 10)
+    tester((16, 32, 64, 128), None)
+    tester((16, 32, 64, 128), (slice(3, None, 1), None, slice(-12, -3, 2),))
+
+
+def test_setsubtensor():
+    def tester(x_shape, y_shape, indices, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(np.random.randn(*x_shape), dtype=dtype)
+        y = tensor(np.random.randn(*y_shape), dtype=dtype)
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, y):
+            x.__setitem__(indices, y)
+            return x
+
+        mge_rsts = func(x, y)
+        xla_rsts = func(x, y)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((32, 16, 8), (16, 8), (11,))
+    tester((32, 16, 8), (16, 8), (11,))
+    tester((32, 16, 8), (1, 8), (11,))
+    tester((32, 16, 8), (8,), (11,))
+    tester((32, 16, 8), (1,), (11,))
+    tester((32, 16, 8), (14, 16, 8), (slice(2, 16, 1),))
+    tester((32, 16, 8), (7, 16, 8), (slice(2, 16, 2),))
+    tester((32, 16, 8), (16, 8), (slice(2, 16, 1),))
+    tester((32, 16, 8), (16, 8), (slice(2, 16, 2),))
+    tester((32, 16, 8), (1, 8), (slice(2, 16, 1),))
+    tester((32, 16, 8), (1, 8), (slice(2, 16, 2),))
+    tester((32, 16, 8), (8,), (slice(2, 16, 1),))
+    tester((32, 16, 8), (8,), (slice(2, 16, 2),))
+    tester((32, 16, 8), (1,), (slice(2, 16, 1),))
+    tester((32, 16, 8), (1,), (slice(2, 16, 2),))
+    tester((32, 16, 8), (8, 10, 8), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 10, 8), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (10, 8), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 8), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (8,), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (1,), (slice(4, 26, 3), slice(2, 12, 1),))
+    tester((32, 16, 8), (10, 8), (10, slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 8), (10, slice(2, 12, 1),))
+    tester((32, 16, 8), (8,), (10, slice(2, 12, 1),))
+    tester((32, 16, 8), (1,), (10, slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 10, 16, 8), (None, slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 1, 16, 8), (None, slice(2, 12, 1),))
+    tester((32, 16, 8), (10, 16, 8), (None, slice(2, 12, 1),))
+    tester((32, 16, 8), (1, 16, 8), (None, slice(2, 12, 1),))
+    tester((32, 16, 8), (8,), (None, slice(2, 12, 1),))
+
+
+def test_indexing_one_hot():
+    def tester(ishape, axis, keepdims, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(np.random.randn(*ishape), dtype=dtype)
+        nr_class = ishape[axis]
+        idx_shape = list(ishape)
+        del idx_shape[axis]
+        index = tensor(np.random.randint(0, nr_class, idx_shape), dtype="int32")
+        oshape = F.nn.indexing_one_hot(x, index, axis, keepdims=keepdims).shape
+        dy = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        # only when capture_as_const is True can this function be traced successfully
+        @jit.trace(without_host=True, capture_as_const=True, use_xla=True)
+        def func(x, index, dy):
+            gm.attach([x])
+            with gm:
+                y = F.nn.indexing_one_hot(x, index, axis, keepdims=keepdims)
+                gm.backward(y, dy)
+            return y, x.grad
+
+        mge_rsts = func(x, index, dy)
+        xla_rsts = func(x, index, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((4, 8, 16), 0, True)
+    tester((4, 8, 16), 0, False)
+    tester((4, 8, 16), 1, True)
+    tester((4, 8, 16), 1, False)
+    tester((4, 8, 16), -1, True)
+    tester((4, 8, 16), -1, False)
+    tester((4, 1, 16), -2, True)
+    tester((4, 1, 16), -2, False)
+
+
+if __name__ == "__main__":
+    test_subtensor()
+    test_setsubtensor()
+    test_indexing_one_hot()
diff --git a/imperative/python/test/unit/xla/functional/test_math.py b/imperative/python/test/unit/xla/functional/test_math.py
new file mode 100644
index 0000000000000000000000000000000000000000..90493e5ddb447e0246ce4d3e16408574be3f7924
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_math.py
@@ -0,0 +1,78 @@
+import numpy as np
+
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_matmul():
+    def tester(lhs_shape, rhs_shape, lhs_transpose, rhs_transpose, dtype=None):
+        lhs = tensor(0.1 * np.random.randn(*lhs_shape), dtype=dtype)
+        rhs = tensor(0.1 * np.random.randn(*rhs_shape), dtype=dtype)
+        out = F.matmul(lhs, rhs, lhs_transpose, rhs_transpose)
+        dout = tensor(0.1 * np.random.randn(*out.shape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(lhs, rhs, dout):
+            gm.attach([lhs, rhs])
+            with gm:
+                out = F.matmul(lhs, rhs, lhs_transpose, rhs_transpose)
+                gm.backward(out, dout)
+            return out, lhs.grad, rhs.grad
+
+        mge_rsts = func(lhs, rhs, dout)
+        mge_rsts[0].numpy()
+        xla_rsts = func(lhs, rhs, dout)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((5,), (5,), False, False)
+    tester((4, 5), (5,), False, False)
+    tester((5,), (5, 6), False, False)
+    tester((5, 4), (5,), True, False)
+
+    tester((4, 5), (5, 6), False, False)
+    tester((4, 5), (6, 5), False, True)
+    tester((5, 4), (5, 6), True, False)
+    tester((5, 4), (6, 5), True, True)
+
+    tester((2, 3, 4, 5), (5, 6), False, False)
+    tester((2, 3, 4, 5), (6, 5), False, True)
+    tester((2, 1, 5, 4), (5, 6), True, False)
+    tester((2, 1, 5, 4), (6, 5), True, True)
+    tester((1, 5, 4), (5, 6), True, False)
+    tester((1, 5, 4), (6, 5), True, True)
+
+    tester((4, 5), (2, 3, 5, 6), False, False)
+    tester((4, 5), (2, 3, 6, 5), False, True)
+    tester((5, 4), (2, 1, 5, 6), True, False)
+    tester((5, 4), (2, 1, 6, 5), True, True)
+    tester((5, 4), (1, 5, 6), True, False)
+    tester((5, 4), (1, 6, 5), True, True)
+
+    tester((1, 4, 5), (1, 5, 6), False, False)
+    tester((1, 5, 4), (1, 5, 6), True, False)
+    tester((3, 4, 5), (3, 5, 6), False, False)
+    tester((3, 5, 4), (3, 6, 5), True, True)
+
+    tester((5, 3, 2, 7, 8), (3, 2, 8, 9), False, False)
+    tester((5, 1, 2, 7, 8), (1, 2, 9, 8), False, True)
+    tester((5, 3, 2, 8, 7), (3, 1, 8, 9), True, False)
+    tester((5, 3, 2, 8, 7), (1, 2, 9, 8), True, True)
+    tester((5, 3, 2, 8, 7), (1, 8, 9), True, False)
+    tester((5, 3, 1, 8, 7), (1, 9, 8), True, True)
+
+    tester((3, 2, 7, 8), (4, 3, 2, 8, 9), False, False)
+    tester((3, 1, 7, 8), (4, 3, 1, 9, 8), False, True)
+    tester((3, 1, 8, 7), (4, 3, 2, 8, 9), True, False)
+    tester((1, 2, 8, 7), (4, 2, 2, 9, 8), True, True)
+    tester((1, 8, 7), (4, 3, 2, 8, 9), True, False)
+    tester((1, 8, 7), (4, 3, 1, 9, 8), True, True)
+
+
+if __name__ == "__main__":
+    test_matmul()
diff --git a/imperative/python/test/unit/xla/functional/test_nn.py b/imperative/python/test/unit/xla/functional/test_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9419c55e21e24d9535845ad41b0dcf4af9e9f638
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_nn.py
@@ -0,0 +1,184 @@
+import numpy as np
+
+import megengine as mge
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine import autodiff
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_conv2d():
+    np.random.seed(123)
+    mge.random.seed(123)
+
+    def tester(x_shape, w_shape, b_shape, stride, padding, groups, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(0.1 * np.random.rand(*x_shape), dtype=dtype)
+        w = tensor(0.1 * np.random.rand(*w_shape), dtype=dtype)
+        b = tensor(0.1 * np.random.rand(*b_shape), dtype=dtype) if b_shape else None
+        y = F.conv2d(x, w, b, stride=stride, padding=padding, groups=groups)
+        dy = tensor(0.1 * np.random.rand(*y.shape), dtype=dtype)
+
+        gm = GradManager()
+
+        if b is not None:
+
+            @jit.trace(without_host=True, use_xla=True)
+            def func(x, w, b, dy):
+                gm.attach([x, w, b])
+                with gm:
+                    y = F.conv2d(x, w, b, stride=stride, padding=padding, groups=groups)
+                    gm.backward(y, dy)
+                return [y, x.grad, w.grad, b.grad]
+
+            mge_rsts = func(x, w, b, dy)
+            xla_rsts = func(x, w, b, dy)
+        else:
+
+            @jit.trace(without_host=True, use_xla=True)
+            def func(x, w, dy):
+                gm.attach([x, w])
+                with gm:
+                    y = F.conv2d(x, w, stride=stride, padding=padding, groups=groups)
+                    gm.backward(y, dy)
+                return [y, x.grad, w.grad]
+
+            mge_rsts = func(x, w, dy)
+            xla_rsts = func(x, w, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester(
+        (4, 16, 24, 24), (32, 16, 3, 3), (1, 32, 1, 1), stride=1, padding=1, groups=1
+    )
+    tester(
+        (4, 16, 24, 24),
+        (32, 16, 3, 3),
+        (1, 32, 1, 1),
+        stride=(2, 3),
+        padding=(2, 1),
+        groups=1,
+    )
+    tester(
+        (4, 16, 24, 24),
+        (16, 1, 1, 3, 3),
+        None,
+        stride=(2, 3),
+        padding=(2, 1),
+        groups=16,
+    )
+
+    tester((4, 16, 24, 24), (32, 16, 1, 1), None, stride=1, padding=1, groups=1)
+    tester(
+        (4, 16, 1, 1),
+        (32, 16, 1, 1),
+        (1, 32, 1, 1),
+        stride=(2, 3),
+        padding=(2, 1),
+        groups=1,
+    )
+    tester(
+        (4, 16, 24, 24),
+        (16, 1, 1, 1, 1),
+        (1, 16, 1, 1),
+        stride=(2, 3),
+        padding=(2, 1),
+        groups=16,
+    )
+
+
+def test_adaptive_pooling():
+    def tester(fpool, ishape, oshape, dtype=None):
+        oshape = (oshape, oshape) if isinstance(oshape, int) else oshape
+        dtype = dtype or np.float32
+
+        x = tensor(np.random.randn(*ishape), dtype=dtype)
+        dy = tensor(np.random.randn(*ishape[:-2], *oshape), dtype=dtype)
+        gm = autodiff.GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, dy):
+            gm.attach([x])
+            with gm:
+                y = fpool(x, oshape)
+                gm.backward(y, dy)
+            return y, x.grad
+
+        mge_rsts = func(x, dy)
+        xla_rsts = func(x, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    for fpool in [F.adaptive_avg_pool2d, F.adaptive_max_pool2d]:
+        for oshape in [(1, 1), (2, 2), 3, (4, 4), (2, 4), (5, 5), (5, 7)]:
+            tester(fpool, (32, 16, 24, 24), oshape)
+            tester(fpool, (32, 16, 17, 13), oshape)
+
+
+def test_pooling():
+    def tester(fpool, ishape, kernel, stride, padding, dtype=None, **kwargs):
+        oshape = fpool(
+            tensor(np.random.randn(*ishape).astype("float32")), kernel, stride, padding
+        ).shape
+        x = tensor(np.random.randn(*ishape).astype("float32"))
+        dy = tensor(np.random.randn(*oshape).astype("float32"))
+
+        gm = autodiff.GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, dy):
+            gm.attach([x])
+            with gm:
+                y = fpool(x, kernel, stride, padding, **kwargs)
+                gm.backward(y, dy)
+            return y, x.grad
+
+        mge_rsts = func(x, dy)
+        xla_rsts = func(x, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester(F.max_pool2d, [32, 16, 8, 13], (3, 3), 2, 1)
+    tester(F.avg_pool2d, [32, 16, 8, 13], (3, 1), (2, 1), (1, 0), mode="average")
+    tester(F.avg_pool2d, [32, 16, 8, 2], (3, 3), 2, 1)
+
+
+def test_softmax():
+    def tester(ishape, axis, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(np.random.randn(*ishape), dtype=dtype)
+        dy = tensor(np.random.randn(*ishape), dtype=dtype)
+
+        gm = autodiff.GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, dy):
+            gm.attach([x])
+            with gm:
+                y = F.softmax(x, axis=axis)
+                gm.backward(y, dy)
+            return y, x.grad
+
+        mge_rsts = func(x, dy)
+        xla_rsts = func(x, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((32, 16, 8, 8), 1)
+    tester((1, 16, 17, 128), [0, 2])
+    tester((32, 16, 5), -2)
+    tester((32, 16, 5), 0)
+    tester((1, 16, 5), -1)
+    tester((14, 1, 13, 5), 1)
+
+
+if __name__ == "__main__":
+    test_conv2d()
+    test_adaptive_pooling()
+    test_pooling()
+    test_softmax()
diff --git a/imperative/python/test/unit/xla/functional/test_normalize.py b/imperative/python/test/unit/xla/functional/test_normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4336affc6a03a15d54bf27a95d76512d9fec259
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_normalize.py
@@ -0,0 +1,121 @@
+import numpy as np
+
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine import autodiff
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_layer_norm():
+    def tester(x_shape, normalized_shape, w_shape, b_shape, eps=1e-5, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(0.1 * np.random.rand(*x_shape), dtype=dtype)
+        w = tensor(0.1 * np.random.rand(*w_shape), dtype=dtype) if w_shape else None
+        b = tensor(0.1 * np.random.rand(*b_shape), dtype=dtype) if b_shape else None
+        y = F.layer_norm(
+            x,
+            normalized_shape=normalized_shape,
+            affine=b is not None,
+            weight=w,
+            bias=b,
+            eps=eps,
+        )
+        dy = tensor(0.1 * np.random.rand(*y.shape), dtype=dtype)
+
+        gm = GradManager()
+
+        if b is not None:
+
+            @jit.trace(without_host=True, use_xla=True)
+            def func(x, w, b, dy):
+                gm.attach([x, w, b])
+                with gm:
+                    y = F.layer_norm(
+                        x,
+                        normalized_shape=normalized_shape,
+                        affine=True,
+                        weight=w,
+                        bias=b,
+                        eps=eps,
+                    )
+                    gm.backward(y, dy)
+                return [y, x.grad, w.grad, b.grad]
+
+            mge_rsts = func(x, w, b, dy)
+            xla_rsts = func(x, w, b, dy)
+        else:
+
+            @jit.trace(without_host=True, use_xla=True)
+            def func(x, dy):
+                gm.attach([x])
+                with gm:
+                    y = F.layer_norm(
+                        x, normalized_shape=normalized_shape, affine=False, eps=eps
+                    )
+                    gm.backward(y, dy)
+                return [y, x.grad]
+
+            mge_rsts = func(x, dy)
+            xla_rsts = func(x, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((4, 16, 24, 24), (24, 24), (24, 24), (24, 24))
+    tester((4, 16, 24, 24), (24, 24), None, None)
+    tester((4, 16, 24, 28), (28,), (28,), (28,))
+    tester((4, 16, 24, 28), (28,), None, None)
+
+
+def test_batch_norm():
+    def tester(ishape, training, momentum, eps, inplace, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(np.random.randn(*ishape), dtype=dtype)
+        rmean = tensor(np.random.randn(1, ishape[1], 1, 1), dtype=dtype)
+        rvar = tensor(np.abs(np.random.randn(1, ishape[1], 1, 1)), dtype=dtype)
+        weight = tensor(np.random.randn(1, ishape[1], 1, 1), dtype=dtype)
+        bias = tensor(np.random.randn(1, ishape[1], 1, 1), dtype=dtype)
+        dy = tensor(np.random.randn(*ishape), dtype=dtype)
+
+        gm = autodiff.GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, rmean, rvar, weight, bias, dy):
+            gm.attach([x, weight, bias])
+            with gm:
+                outs = F.batch_norm(
+                    x,
+                    rmean,
+                    rvar,
+                    weight,
+                    bias,
+                    training=training,
+                    momentum=momentum,
+                    eps=eps,
+                    inplace=inplace,
+                )
+                if inplace:
+                    y = outs
+                else:
+                    y, rmean, rvar = outs
+                if training:
+                    gm.backward(y, dy)
+                    return y, rmean, rvar, x.grad, weight.grad, bias.grad
+                else:
+                    return [y]
+
+        mge_rsts = func(x, rmean, rvar, weight, bias, dy)
+        xla_rsts = func(x, rmean, rvar, weight, bias, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=5e-4)
+
+    tester((32, 16, 8, 8), True, 0.9, 1e-5, True)
+    tester((1, 16, 17, 128), True, 0.7, 1e-5, False)
+    tester((32, 16, 64, 5), False, 0.8, 1e-5, True)
+
+
+if __name__ == "__main__":
+    test_layer_norm()
+    test_batch_norm()
diff --git a/imperative/python/test/unit/xla/functional/test_random.py b/imperative/python/test/unit/xla/functional/test_random.py
new file mode 100644
index 0000000000000000000000000000000000000000..363a94c3f188f84c701ba36a6b836795b5354c2c
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_random.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_dropout():
+    def check_dropout(mge_val, xla_val, drop_prob):
+        nr_zero = np.sum(np.array(xla_val == 0, np.uint32))
+        nr_el = np.prod(xla_val.shape)
+        xla_drop_rate = nr_zero * 1.0 / nr_el
+        np.testing.assert_allclose(drop_prob, xla_drop_rate, atol=1e-3)
+
+        mge_mask = mge_val == 0
+        xla_mask = xla_val == 0
+        both_mask = np.bitwise_or(xla_mask, mge_mask)
+        both_left = np.bitwise_not(both_mask)
+        mge_left = mge_val * both_left
+        xla_left = xla_val * both_left
+        np.testing.assert_allclose(mge_left, xla_left, atol=1e-6)
+
+    def tester(shape, drop_prob, dtype=None):
+        dtype = dtype or np.float32
+        x = tensor(np.random.randn(*shape), dtype=dtype)
+        dy = tensor(np.random.randn(*shape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x, dy):
+            gm.attach([x])
+            with gm:
+                y = F.dropout(x, drop_prob, True)
+                gm.backward(y, dy)
+            return y, x.grad
+
+        mge_rsts = func(x, dy)
+        xla_rsts = func(x, dy)
+
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            check_dropout(mge_rst.numpy(), xla_rst.numpy(), drop_prob)
+
+    tester((32, 128, 128, 1, 16), 0.1)
+    tester((32, 128, 128, 1, 16), 0.3)
+    tester((32, 128, 128, 1, 16), 0.5)
+    tester((32, 128, 128, 1, 16), 0.9)
+
+
+if __name__ == "__main__":
+    test_dropout()
diff --git a/imperative/python/test/unit/xla/functional/test_reduction.py b/imperative/python/test/unit/xla/functional/test_reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3c03e817a1e09ea892c362a322bcc6e445e5b30
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_reduction.py
@@ -0,0 +1,48 @@
+import numpy as np
+
+import megengine as mge
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_reduce():
+    np.random.seed(123)
+    mge.random.seed(123)
+
+    def tester(freduce, inpshape, axes, keepdim, backward, dtype=None, atol=1e-5):
+        dtype = dtype or np.float32
+        inp = tensor(0.1 * np.random.randn(*inpshape), dtype=dtype)
+        doup = tensor(
+            0.1 * np.random.randn(*freduce(inp, axis=axes, keepdims=keepdim).shape),
+            dtype=dtype,
+        )
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, doup):
+            gm.attach([inp])
+            with gm:
+                oup = freduce(inp, axis=axes, keepdims=keepdim)
+                if backward:
+                    gm.backward(oup, doup)
+                    return [oup, inp.grad]
+                else:
+                    return [oup]
+
+        mge_rsts = func(inp, doup)
+        xla_rsts = func(inp, doup)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=atol)
+
+    tester(F.sum, (2, 4, 8, 16), [1, 2], keepdim=True, backward=True)
+    tester(F.mean, (2, 4, 8, 16), [3, 2], keepdim=False, backward=True)
+    tester(F.prod, (2, 4, 8, 16), [0, 1, 2, 3], keepdim=False, backward=True)
+    tester(F.min, (2, 4, 8, 16), 0, keepdim=True, backward=False)
+    tester(F.max, (2, 4, 8, 16), [-2], keepdim=False, backward=False)
+
+
+if __name__ == "__main__":
+    test_reduce()
diff --git a/imperative/python/test/unit/xla/functional/test_tensor.py b/imperative/python/test/unit/xla/functional/test_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e829b228d1e5f4ac2eeb89730c3d12b0fc9f0fe9
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_tensor.py
@@ -0,0 +1,226 @@
+import numpy as np
+
+import megengine.functional as F
+import megengine.jit as jit
+import megengine.tensor as tensor
+from megengine.autodiff.grad_manager import GradManager
+
+
+def test_broadcast_to():
+    def tester(ishape, tgtshape):
+        dtype = None
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        dout = tensor(np.random.randn(*tgtshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, dout):
+            gm.attach([inp])
+            with gm:
+                out = F.broadcast_to(inp, tgtshape)
+                gm.backward(out, dout)
+            return [out, inp.grad]
+
+        mge_rsts = func(inp, dout)
+        xla_rsts = func(inp, dout)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((1, 1, 1), (1, 1, 1, 1))
+    tester((1, 1, 1, 1), (1, 1, 1, 1))
+    tester((1, 1, 1, 1), (4, 5, 6, 7))
+    tester((1, 1, 1), (4, 5, 6, 7))
+    tester((5, 6, 7), (4, 5, 6, 7))
+    tester((1, 6, 1), (4, 5, 6, 7))
+    tester((1, 5, 6, 7), (4, 5, 6, 7))
+    tester((1,), (4, 5, 1, 7))
+    tester((4, 5, 3, 1), (4, 5, 3, 7))
+    tester((4, 5, 3, 7), (4, 5, 3, 7))
+
+
+def test_reshape():
+    def tester(ishape, tgt_shape, dtype=None):
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        oshape = F.reshape(inp, tgt_shape).shape
+        dout = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, dout):
+            gm.attach([inp])
+            with gm:
+                out = F.reshape(inp, tgt_shape)
+                gm.backward(out, dout)
+            return [out, inp.grad]
+
+        mge_rsts = func(inp, dout)
+        xla_rsts = func(inp, dout)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((1,), (1,))
+    tester((1,), (1, 1, 1, 1))
+    tester((2, 3, 4), (24,))
+    tester((2, 3, 4), (2, 12))
+    tester((2, 3, 4), (4, 3, 2))
+    tester((2, 1, 4), (8, 1))
+    tester((2, 1, 4), (-1))
+    tester((2, 1, 4), (-1, 2))
+
+
+def test_transpose():
+    def tester(ishape, permutation, dtype=None):
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        oshape = F.transpose(inp, permutation).shape
+        dout = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, dout):
+            gm.attach([inp])
+            with gm:
+                out = F.transpose(inp, permutation)
+                gm.backward(out, dout)
+            return [out, inp.grad]
+
+        mge_rsts = func(inp, dout)
+        xla_rsts = func(inp, dout)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((1,), (0,))
+    tester((2, 3, 4), (0, 2, 1))
+    tester((2, 3, 4), (2, 0, 1))
+    tester((2, 3, 1), (0, 1, 2))
+    tester((2, 3, 1, 4), (3, 1, 0, 2))
+
+
+def test_expand_dims():
+    def tester(ishape, axis, dtype=None):
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        oshape = F.expand_dims(inp, axis).shape
+        dout = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, dout):
+            gm.attach([inp])
+            with gm:
+                out = F.expand_dims(inp, axis)
+                gm.backward(out, dout)
+            return [out, inp.grad]
+
+        mge_rsts = func(inp, dout)
+        xla_rsts = func(inp, dout)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((2, 1, 4), 0)
+    tester((2, 3, 4), 1)
+    tester((2, 3, 4, 5), -1)
+
+
+def test_concat():
+    def tester(*ishapes, axis, dtype=None):
+        dtype = dtype or np.float32
+        inps = [tensor(np.random.randn(*ishape), dtype=dtype) for ishape in ishapes]
+        oshape = F.concat(inps, axis=axis).shape
+        dout = tensor(np.random.randn(*oshape), dtype=dtype)
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(*inps, dout):
+            gm.attach(inps)
+            with gm:
+                out = F.concat(inps, axis=axis)
+                gm.backward(out, dout)
+            rets = [inp.grad for inp in inps] + [out]
+            return rets
+
+        mge_rsts = func(*inps, dout=dout)
+        xla_rsts = func(*inps, dout=dout)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((6, 5, 4), (6, 3, 4), (6, 1, 4), axis=1)
+    tester((6, 5, 2), (6, 5, 1), axis=-1)
+    tester((2, 5, 4), (6, 5, 4), axis=0)
+    tester((1, 5, 4), (1, 5, 4), axis=0)
+    tester((6, 5, 1), axis=-1)
+
+
+def test_split():
+    def tester(ishape, axis, nsplit_or_sections, dtype=None):
+        dtype = dtype or np.float32
+        inp = tensor(np.random.randn(*ishape), dtype=dtype)
+        oshapes = [o.shape for o in F.split(inp, nsplit_or_sections, axis)]
+        douts = [tensor(np.random.randn(*oshape), dtype=dtype) for oshape in oshapes]
+
+        gm = GradManager()
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(inp, douts):
+            gm.attach([inp])
+            with gm:
+                outs = list(F.split(inp, nsplit_or_sections, axis))
+                gm.backward(outs, douts)
+            rets = outs + [inp.grad]
+            return rets
+
+        mge_rsts = func(inp, douts)
+        xla_rsts = func(inp, douts)
+        for mge_rst, xla_rst in zip(mge_rsts, xla_rsts):
+            np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((32, 16, 8), -2, 5)
+    tester((32, 16, 8), 0, [8, 14, 27])
+    tester((32, 16, 8), 1, 1)
+    tester((32, 16, 8), 1, 16)
+
+
+def test_fill_and_fill_like():
+    def tester(ref_shape, value, dtype=None):
+        dtype = dtype or np.float32
+        ref = tensor(np.random.randn(*ref_shape), dtype=dtype)
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(ref):
+            return (
+                F.full_like(ref, value),
+                F.full(ref.shape, value, dtype=dtype),
+                F.ones_like(ref),
+                F.ones(ref.shape, dtype=dtype),
+                F.zeros_like(ref),
+                F.zeros(ref.shape, dtype=dtype),
+            )
+
+        mge_rst = func(ref)
+        xla_rst = func(ref)
+        for mge, xla in zip(mge_rst, xla_rst):
+            np.testing.assert_allclose(mge.numpy(), xla.numpy(), atol=1e-5)
+
+    tester((1,), 0.1)
+    tester((16,), 0.1)
+    tester((1, 16), 0.1)
+    tester((32, 16), 0.1)
+    tester((32, 16), 0)
+    tester((1, 1, 16), 1)
+
+
+if __name__ == "__main__":
+    test_broadcast_to()
+    test_reshape()
+    test_transpose()
+    test_expand_dims()
+    test_concat()
+    test_split()
+    test_fill_and_fill_like()
diff --git a/imperative/python/test/unit/xla/functional/test_trivial.py b/imperative/python/test/unit/xla/functional/test_trivial.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f401393da746351183f6fef306616ab05531678
--- /dev/null
+++ b/imperative/python/test/unit/xla/functional/test_trivial.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+import megengine.jit as jit
+import megengine.tensor as tensor
+
+
+def test_get_var_shape():
+    def tester(shape):
+        x = tensor(np.random.randn(*shape).astype("float32"))
+
+        @jit.trace(without_host=True, use_xla=True)
+        def func(x):
+            return x.shape
+
+        mge_rst = func(x)
+        xla_rst = func(x)
+        np.testing.assert_allclose(mge_rst.numpy(), xla_rst.numpy(), atol=1e-5)
+
+    tester((2, 3, 4, 5))
+    tester((1, 2, 3))
+    tester((1,))
+
+
+if __name__ == "__main__":
+    test_get_var_shape()
diff --git a/imperative/python/test/unit/xla/test_trace.py b/imperative/python/test/unit/xla/test_trace.py
new file mode 100644
index 0000000000000000000000000000000000000000..5590cdd09ebd3d7f5ab92dab198bf9c838c94231
--- /dev/null
+++ b/imperative/python/test/unit/xla/test_trace.py
@@ -0,0 +1,60 @@
+import numpy as np
+
+import megengine as mge
+import megengine.functional as F
+import megengine.module as M
+from megengine.autodiff import GradManager
+from megengine.jit import trace
+from megengine.optimizer import Adam
+
+
+def test_xla_conv_module():
+    m = M.Conv2d(3, 3, 3)
+
+    @trace(without_host=True, use_xla=True)
+    def step(m, inp):
+        return m(inp)
+
+    inp = mge.tensor(np.random.random((3, 3, 32, 32)))
+    step(m, inp)
+
+    xla_rst = step(m, inp)
+    mge_rst = step.__wrapped__(m, inp)
+    np.testing.assert_allclose(mge_rst, xla_rst)
+
+
+def test_train():
+    def run(use_trace):
+        np.random.seed(1024)
+        mge.random.seed(233)
+        m = M.Conv2d(3, 3, 3, padding=1)
+        inp = mge.tensor(np.random.random((3, 3, 32, 32)))
+        gm = GradManager()
+        opt = Adam(m.parameters(), lr=0.1)
+        gm.attach(m.parameters())
+
+        def train_step(model, opt, inp):
+            with gm:
+                out = model(inp) + 1
+                loss = F.loss.square_loss(out, F.sin(inp))
+                gm.backward(loss)
+            opt.step().clear_grad()
+            return loss
+
+        if use_trace:
+            train_step = trace(train_step, without_host=True)
+
+        for i in range(100):
+            loss = train_step(m, opt, inp)
+        return m.weight, m.bias, opt.state_dict()["state"][0]["exp_avg"]
+
+    w0, b0, s0 = run(False)
+    w1, b1, s1 = run(True)
+    np.testing.assert_allclose(w0, w1, rtol=1e-3)
+    np.testing.assert_allclose(b0, b1, rtol=1e-3)
+    np.testing.assert_allclose(s0, s1, rtol=1e-3)
+
+
+if __name__ == "__main__":
+    test_train()
+    test_xla_conv_module()