diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu
index 6ae0e6df07cfb75b217f6a951d1e4ce0458963d6..456d541451e64a967fcb9cc9c9f9c72f5d7e0db4 100644
--- a/paddle/phi/kernels/gpu/randperm_kernel.cu
+++ b/paddle/phi/kernels/gpu/randperm_kernel.cu
@@ -28,6 +28,7 @@ namespace cub = hipcub;
 
 #include "gflags/gflags.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/empty_kernel.h"
@@ -165,4 +166,6 @@ PD_REGISTER_KERNEL(randperm,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index eaecf087f9f04b8fd25f3e363515f4c6d4f4dabf..5df2873b93ec4cfba354ca15273999adf7e262ef 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -15,7 +15,11 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    convert_uint16_to_float,
+)
 
 import paddle
 from paddle.fluid import core
@@ -40,12 +44,21 @@ def error_msg(data_np):
 
 
 def convert_dtype(dtype_str):
-    dtype_str_list = ["int32", "int64", "float32", "float64"]
+    dtype_str_list = [
+        "int32",
+        "int64",
+        "float16",
+        "float32",
+        "float64",
+        "uint16",
+    ]
     dtype_num_list = [
         core.VarDesc.VarType.INT32,
         core.VarDesc.VarType.INT64,
+        core.VarDesc.VarType.FP16,
         core.VarDesc.VarType.FP32,
         core.VarDesc.VarType.FP64,
+        core.VarDesc.VarType.BF16,
     ]
     assert dtype_str in dtype_str_list, (
         dtype_str + " should in " + str(dtype_str_list)
@@ -62,9 +75,9 @@ class TestRandpermOp(OpTest):
         self.n = 200
         self.dtype = "int64"
 
+        self.init_attrs()
         self.inputs = {}
         self.outputs = {"Out": np.zeros(self.n).astype(self.dtype)}
-        self.init_attrs()
         self.attrs = {
             "n": self.n,
             "dtype": convert_dtype(self.dtype),
@@ -103,6 +116,47 @@ class TestRandpermOpFloat64(TestRandpermOp):
         self.dtype = "float64"
 
 
+class TestRandpermFP16Op(TestRandpermOp):
+    def init_attrs(self):
+        self.dtype = "float16"
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestRandpermBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "randperm"
+        self.python_api = paddle.randperm
+        self.n = 200
+
+        self.init_attrs()
+        self.inputs = {}
+        self.outputs = {"Out": np.zeros(self.n).astype(self.np_dtype)}
+        self.attrs = {
+            "n": self.n,
+            "dtype": convert_dtype(self.dtype),
+        }
+
+        self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+        self.place = core.CUDAPlace(0)
+
+    def init_attrs(self):
+        self.dtype = "uint16"
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place_customized(self.verify_output, self.place)
+
+    def verify_output(self, outs):
+        out_np = convert_uint16_to_float(np.array(outs[0]))
+        self.assertTrue(
+            check_randperm_out(self.n, out_np), msg=error_msg(out_np)
+        )
+
+
 class TestRandpermOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 3149ca82b3f623fe85ceef7e28ed97645ae0c8d8..29446bafbbfeb4dd482301ea2f032516b8e2c813 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -65,7 +65,7 @@ class TestSplitOp(OpTest):
 
 
 # test with attr(num)
-class TestSplitOp_2(OpTest):
+class TestSplitWithNumOp(OpTest):
     def setUp(self):
         self.python_api = paddle.split
         self.public_python_api = paddle.split
@@ -74,18 +74,32 @@ class TestSplitOp_2(OpTest):
         self.prim_op_type = "prim"
         self.dtype = self.get_dtype()
         self.init_data()
-        self.inputs = {'X': self.x}
         self.attrs = {
             'axis': self.axis,
             'sections': self.sections,
             'num': self.num,
         }
-
-        out = np.split(self.x, self.indices_or_sections, self.axis)
-        self.outputs = {'Out': [('out%d' % i, out[i]) for i in range(len(out))]}
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            out = np.split(self.x, self.indices_or_sections, self.axis)
+            self.outputs = {
+                'Out': [
+                    ('out%d' % i, convert_float_to_uint16(out[i]))
+                    for i in range(len(out))
+                ]
+            }
+        else:
+            self.inputs = {'X': self.x}
+            out = np.split(self.x, self.indices_or_sections, self.axis)
+            self.outputs = {
+                'Out': [('out%d' % i, out[i]) for i in range(len(out))]
+            }
 
     def init_data(self):
-        self.x = np.random.random((4, 5, 6)).astype(self.dtype)
+        if self.dtype == np.uint16:
+            self.x = np.random.random((4, 5, 6)).astype(np.float32)
+        else:
+            self.x = np.random.random((4, 5, 6)).astype(self.dtype)
         self.axis = 2
         self.sections = []
         self.num = 3
@@ -240,28 +254,28 @@ def create_test_fp16(parent):
     @unittest.skipIf(
         not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
     )
-    class TestSplitFp16(parent):
+    class TestSplitFP16Op(parent):
         def get_dtype(self):
             return np.float16
 
-        def test_check_grad(self):
-            pass
-
-    cls_name = "{}_{}".format(parent.__name__, "Fp16")
-    TestSplitFp16.__name__ = cls_name
-    globals()[cls_name] = TestSplitFp16
+    cls_name = "{}_{}".format(parent.__name__, "FP16Op")
+    TestSplitFP16Op.__name__ = cls_name
+    globals()[cls_name] = TestSplitFP16Op
 
 
 create_test_fp16(TestSplitOp)
+create_test_fp16(TestSplitWithNumOp)
 
 # ----------------Split Bf16----------------
 
 
 def create_test_bf16(parent):
     @unittest.skipIf(
-        not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+        not core.is_compiled_with_cuda()
+        or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+        "core is not compiled with CUDA or not support bfloat16",
     )
-    class TestSplitBf16(parent):
+    class TestSplitBF16Op(parent):
         def get_dtype(self):
             return np.uint16
 
@@ -270,14 +284,16 @@ def create_test_bf16(parent):
             self.check_output_with_place(place)
 
         def test_check_grad(self):
-            pass
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(place, ['X'], 'out2')
 
-    cls_name = "{}_{}".format(parent.__name__, "Bf16")
-    TestSplitBf16.__name__ = cls_name
-    globals()[cls_name] = TestSplitBf16
+    cls_name = "{}_{}".format(parent.__name__, "BF16Op")
+    TestSplitBF16Op.__name__ = cls_name
+    globals()[cls_name] = TestSplitBF16Op
 
 
 create_test_bf16(TestSplitOp)
+create_test_bf16(TestSplitWithNumOp)
 
 
 class TestSplitAPI(unittest.TestCase):
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 7309f80401322391f89bc8057a64ddaefbd4b24a..6ce6587d44e777647a3b03db6850f08e24142c49 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1976,6 +1976,7 @@ def split(x, num_or_sections, axis=0, name=None):
             'int32',
             'int64',
             'uint8',
+            'uint16',
             'int8',
         ],
         'split',
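Illustrative usage sketch (not part of the patch): with the kernel registration above applied on a CUDA build, paddle.randperm can emit float16 and bfloat16 outputs directly on the GPU. The snippet below is a hedged example only; it assumes the Python-side dtype check for randperm already accepts these dtype strings, which this diff does not show, and the value n=200 simply mirrors the test above.

# Hedged example: GPU randperm with half-precision dtypes (assumes this patch).
import paddle

if paddle.device.is_compiled_with_cuda():
    paddle.set_device("gpu")
    # dtype strings assumed to be accepted by the Python wrapper after this patch
    perm_fp16 = paddle.randperm(200, dtype="float16")
    perm_bf16 = paddle.randperm(200, dtype="bfloat16")
    print(perm_fp16.dtype, perm_bf16.dtype)  # paddle.float16 paddle.bfloat16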
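Similarly, the 'uint16' entry added to the split dtype whitelist corresponds to bfloat16 storage, matching the new TestSplitBF16Op coverage. A minimal sketch, assuming a bfloat16-capable GPU; the shape (4, 5, 6) and the three-way split along axis 2 mirror the test data above and are not prescribed by the patch.

# Hedged example: splitting a bfloat16 tensor into 3 parts along axis 2.
import paddle

if paddle.device.is_compiled_with_cuda():
    paddle.set_device("gpu")
    # bfloat16 tensors are stored as uint16, hence the whitelist entry above
    x = paddle.cast(paddle.rand([4, 5, 6]), "bfloat16")
    out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=2)
    print(out0.shape, out1.shape, out2.shape)  # [4, 5, 2] [4, 5, 2] [4, 5, 2]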