diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index e0dfad91570ad689755e1a3e4a0a59fbedbede37..d3943e09b6d0b1276b6756759389b269603b764a 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -22,12 +22,14 @@ namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
 class ScaleXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   virtual void Compute(const framework::ExecutionContext& ctx) const {
     auto* in_var = ctx.InputVar("X");
     auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto scale = static_cast<float>(ctx.Attr<float>("scale"));
+    auto bias = static_cast<float>(ctx.Attr<float>("bias"));
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
     auto* out_var = ctx.OutputVar("Out");
     if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
@@ -46,9 +48,10 @@ class ScaleXPUKernel : public framework::OpKernel<T> {
                           in->dims().to_str().c_str(),
                           out->dims().to_str().c_str()));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r =
-        xpu::scale(dev_ctx.x_context(), in->data<T>(), out->data<T>(),
-                   in->numel(), bias_after_scale, scale, bias);
+    int r = xpu::scale(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(in->data<T>()),
+                       reinterpret_cast<XPUType*>(out->data<T>()), in->numel(),
+                       bias_after_scale, scale, bias);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU scale kernel return wrong value[%d %s]",
@@ -60,7 +63,11 @@ class ScaleXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OP_XPU_KERNEL(
-    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext,
+                        paddle::platform::float16>,
+    ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
 #endif
diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h
index 121d26e39dd8b33e7c1172a7446773cabdb97f04..0b95581c66cfc9e561becf66281e83662048137e 100644
--- a/paddle/fluid/platform/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/xpu/xpu2_op_list.h
@@ -184,6 +184,9 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::INT8, XPUPlace()),
                     pOpKernelType(vartype::FP16, XPUPlace()),
                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                              pOpKernelType(vartype::FP16, XPUPlace()),
+                              pOpKernelType(vartype::INT64, XPUPlace())})}
       // AddMore
   };
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b27c05d98a1c031269859d37306aa43a961c1402..2123569704f0bb7770f95c665aa3ded88bf1a414 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1709,6 +1709,14 @@ All parameter, weight, gradient are variables in Paddle.
m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); + m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + }); + m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + }); #endif py::class_(m, "CPUPlace", R"DOC( diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 239708cc174492c16cd443a4ac195c561c099222..33c0c24056f48fa01f5dadd4f237bf351373cfd9 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -44,86 +44,33 @@ class XPUOpTest(OpTest): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' - cls._np_rand_state = np.random.get_state() - cls._py_rand_state = random.getstate() - cls.call_once = False - cls.dtype = np.float32 - cls.outputs = {} - cls.input_shape_is_large = True - - np.random.seed(123) - random.seed(124) - - cls._use_system_allocator = _set_use_system_allocator(True) + cls.use_xpu = True + cls.use_mkldnn = False + super().setUpClass() @classmethod def tearDownClass(cls): """Restore random seeds""" - np.random.set_state(cls._np_rand_state) - random.setstate(cls._py_rand_state) - - _set_use_system_allocator(cls._use_system_allocator) def is_empty_grad_op(op_type): all_op_kernels = core._get_all_register_op_kernels() grad_op = op_type + '_grad' if grad_op in all_op_kernels.keys(): - if is_mkldnn_op_test(): - grad_op_kernels = all_op_kernels[grad_op] - for grad_op_kernel in grad_op_kernels: - if 'MKLDNN' in grad_op_kernel: - return False - else: - return False + grad_op_kernels = all_op_kernels[grad_op] + for grad_op_kernel in grad_op_kernels: + if 'XPU' in grad_op_kernel: + return False return True - def is_xpu_op_test(): - return True - - def is_mkldnn_op_test(): - return False - - if not hasattr(cls, "op_type"): - raise AssertionError( - "This test do not have op_type in class attrs, " - "please set self.__class__.op_type=the_real_op_type manually.") + if cls.dtype == np.float16: + place = paddle.XPUPlace(0) + if core.is_float16_supported(place) == False: + return + super().tearDownClass() - # case in NO_FP64_CHECK_GRAD_CASES and op in NO_FP64_CHECK_GRAD_OP_LIST should be fixed - if not hasattr(cls, "no_need_check_grad") \ - and not is_empty_grad_op(cls.op_type): - if cls.dtype is None or \ - (cls.dtype == np.float16 \ - and cls.op_type not in op_accuracy_white_list.NO_FP16_CHECK_GRAD_OP_LIST \ - and not hasattr(cls, "exist_check_grad")): - raise AssertionError("This test of %s op needs check_grad." % - cls.op_type) - - # check for op test with fp64 precision, but not check mkldnn op test for now - if cls.dtype in [np.float32, np.float64] \ - and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ - and not hasattr(cls, 'exist_fp64_check_grad') \ - and not is_xpu_op_test() \ - and not is_mkldnn_op_test() \ - and not is_rocm_op_test() \ - and not is_npu_op_test(): - raise AssertionError( - "This test of %s op needs check_grad with fp64 precision." 
-                cls.op_type)
-
-        if not cls.input_shape_is_large \
-            and cls.op_type not in check_shape_white_list.NEED_TO_FIX_OP_LIST:
-            raise AssertionError(
-                "Input's shape should be large than or equal to 100 for " +
-                cls.op_type + " Op.")
-
-    def try_call_once(self, data_type):
-        if not self.call_once:
-            self.call_once = True
-            if data_type is not None and \
-                data_type != np.float32:
-                raise AssertionError("Unsupport data type %s in xpu" %
-                                     data_type)
-            self.dtype = data_type
+    def _get_places(self):
+        places = [fluid.XPUPlace(0)]
+        return places
 
     def check_output_with_place(self,
                                 place,
@@ -133,166 +80,17 @@ class XPUOpTest(OpTest):
                                 check_dygraph=True,
                                 inplace_atol=None):
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
-        if self.dtype == np.float64 and \
-            self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
-            atol = 0
-
-        if self.is_bfloat16_op():
-            check_dygraph = False
-            if hasattr(self, 'force_fp32_output') and getattr(
-                    self, 'force_fp32_output'):
-                atol = 1e-2
-            else:
-                atol = 2
-
-        if no_check_set is not None:
-            if self.op_type not in no_check_set_white_list.no_check_set_white_list:
-                raise AssertionError(
-                    "no_check_set of op %s must be set to None." % self.op_type)
-
-        if check_dygraph:
-            dygraph_outs = self._calc_dygraph_output(
-                place, no_check_set=no_check_set)
-        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
-        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-            if out_name not in self.outputs:
-                continue
-            if no_check_set is not None and out_name in no_check_set:
-                continue
-
-            def find_imperative_actual(target_name, dygraph_outs, place):
-                with fluid.dygraph.base.guard(place=place):
-                    for name in dygraph_outs:
-                        if name == target_name:
-                            return dygraph_outs[name][0]
-                        var_list = dygraph_outs[name]
-                        for i, var in enumerate(var_list):
-                            if var.name == target_name:
-                                return dygraph_outs[name][i]
-                    self.assertTrue(False, "Found failed {} {}".format(
-                        dygraph_outs.keys(), target_name))
-
-            def find_actual(target_name, fetch_list):
-                found = [
-                    i for i, var_name in enumerate(fetch_list)
-                    if var_name == target_name
-                ]
-                self.assertTrue(
-                    len(found) == 1, "Found {} {}".format(
-                        len(found), target_name))
-                return found[0]
-
-            if out_dup:
-                sub_out = self.outputs[out_name]
-                if not isinstance(sub_out, list):
-                    raise AssertionError("sub_out type %s is not list",
-                                         type(sub_out))
-                for item in sub_out:
-                    sub_out_name, expect = item[0], item[1]
-                    if check_dygraph:
-                        imperative_actual = find_imperative_actual(
-                            sub_out_name, dygraph_outs, place)
-                        imperative_actual_t = np.array(imperative_actual.value()
-                                                       .get_tensor())
-                    idx = find_actual(sub_out_name, fetch_list)
-                    actual = outs[idx]
-                    actual_t = np.array(actual)
-                    expect_t = expect[0] \
-                        if isinstance(expect, tuple) else expect
-                    self.assertTrue(
-                        np.allclose(
-                            actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                        "Output (" + sub_out_name + ") has diff at " +
-                        str(place))
-                    if check_dygraph:
-                        self.assertTrue(
-                            np.allclose(
-                                imperative_actual_t,
-                                expect_t,
-                                atol=atol,
-                                equal_nan=equal_nan),
-                            "Output (" + sub_out_name + ") has diff at " +
-                            str(place) + " in dygraph mode")
-                    if isinstance(expect, tuple):
-                        self.assertListEqual(
-                            actual.recursive_sequence_lengths(), expect[1],
-                            "Output (" + sub_out_name +
-                            ") has different lod at " + str(place))
-                        if check_dygraph:
-                            self.assertListEqual(
-                                imperative_actual.value().get_tensor()
-                                .recursive_sequence_lengths(), expect[1],
-                                "Output (" + out_name +
-                                ") has different lod at " + str(place) +
-                                " in dygraph mode")
-            else:
-                if check_dygraph:
-                    imperative_actual = find_imperative_actual(
-                        out_name, dygraph_outs, place)
-                    imperative_actual_t = np.array(imperative_actual.value()
-                                                   .get_tensor())
-                idx = find_actual(out_name, fetch_list)
-                actual = outs[idx]
-                actual_t = np.array(actual)
-                expect = self.outputs[out_name]
-                expect_t = expect[0] if isinstance(expect, tuple) else expect
-                self.assertTrue(
-                    np.allclose(
-                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                    "Output (" + out_name + ") has diff at " + str(place) +
-                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t) + " in class " + self.__class__.__name__ + " "
-                    + str(atol) + " " + str(expect_t - actual_t))
-                if check_dygraph:
-                    if six.moves.reduce(
-                            lambda x, y: x * y, imperative_actual_t.shape,
-                            1) == 0 and six.moves.reduce(
-                                lambda x, y: x * y, expect_t.shape, 1) == 0:
-                        pass
-                    else:
-                        self.assertTrue(
-                            np.allclose(
-                                imperative_actual_t,
-                                expect_t,
-                                atol=atol,
-                                equal_nan=equal_nan),
-                            "Output (" + out_name + ") has diff at " +
-                            str(place) + "\nExpect " + str(expect_t) + "\n" +
-                            "But Got" + str(imperative_actual_t) + " in class "
-                            + self.__class__.__name__)
-                if isinstance(expect, tuple):
-                    self.assertListEqual(actual.recursive_sequence_lengths(),
-                                         expect[1], "Output (" + out_name +
-                                         ") has different lod at " + str(place))
-                    if check_dygraph:
-                        self.assertListEqual(
-                            imperative_actual.value().get_tensor()
-                            .recursive_sequence_lengths(), expect[1],
-                            "Output (" + out_name + ") has different lod at " +
-                            str(place) + " in dygraph mode")
-
-        # Note(zhiqiu): inplace_atol should be only set when op doesn't ensure
-        # computational consistency.
-        # For example, group_norm uses AtomicAdd on CUDAPlace, which do not ensure
-        # computation order when multiple threads write the same address. So the
-        # result of group_norm is non-deterministic when datatype is float.
-        # When inplace_atol is not None, the inplace check uses numpy.allclose
-        # to check inplace result instead of numpy.array_equal.
-        if inplace_atol is not None:
-            warnings.warn(
-                "inplace_atol should only be set when op doesn't ensure computational consistency, please check it!"
-            )
-        # Check inplace for given op, its grad op, its grad_grad op, etc.
-        # No effect on original OpTest
-        # Currently not support ParallelExecutor on XPUPlace.
-        if not paddle.is_compiled_with_xpu():
-            self.check_inplace_output_with_place(
-                place, no_check_set=no_check_set, inplace_atol=inplace_atol)
-
-        if check_dygraph:
-            return outs
-        else:
-            return outs
+        #xpu not support float64
+        if self.dtype == np.float64:
+            return
+        if place == None:
+            place = paddle.XPUPlace(0)
+
+        if self.dtype == np.float16:
+            if core.is_float16_supported(place) == False:
+                return
+        return super().check_output_with_place(
+            place, atol, no_check_set, equal_nan, check_dygraph, inplace_atol)
 
     def check_grad_with_place(self,
                               place,
@@ -303,8 +101,25 @@ class XPUOpTest(OpTest):
                               in_place=False,
                               max_relative_error=0.005,
                               user_defined_grads=None,
-                              check_dygraph=True):
-        place = paddle.XPUPlace(0)
+                              user_defined_grad_outputs=None,
+                              check_dygraph=True,
+                              numeric_place=None):
+        if place == None:
+            place = paddle.XPUPlace(0)
+
+        if self.dtype == np.float64:
+            return
+
+        if self.dtype == np.float16:
+            if core.is_float16_supported(place) == False:
+                return
+
+        if self.dtype == np.float16:
+            return super().check_grad_with_place(
+                place, inputs_to_check, output_names, no_grad_set,
+                numeric_grad_delta, in_place, max_relative_error,
+                user_defined_grads, user_defined_grad_outputs, check_dygraph)
+
         a1 = self.get_grad_with_place(
             place, inputs_to_check, output_names, no_grad_set=no_grad_set)
         a2 = self.get_grad_with_place(
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
index c4905a229b2e51075e5129c4b20eb80d6e30785d..9ef8cc1e02790ca19475df985ddf09ca72092781 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
@@ -28,17 +28,12 @@ paddle.enable_static()
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp(XPUOpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
     def setUp(self):
         self.op_type = "elementwise_add"
         self.init_dtype()
         self.init_input_output()
-        self.init_kernel_type()
         self.init_axis()
-        self.use_xpu = True
-
+        self.init_max_relative_error()
         self.inputs = {
             'X': OpTest.np_dtype_to_fluid_dtype(self.x),
             'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
         }
@@ -55,7 +50,9 @@ class TestElementwiseAddOp(XPUOpTest):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
             self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.006)
+                place, ['X', 'Y'],
+                'Out',
+                max_relative_error=self.max_relative_error)
 
     def test_check_grad_ingore_x(self):
         if paddle.is_compiled_with_xpu():
@@ -64,7 +61,7 @@ class TestElementwiseAddOp(XPUOpTest):
                 place, ['Y'],
                 'Out',
                 no_grad_set=set("X"),
-                max_relative_error=0.006)
+                max_relative_error=self.max_relative_error)
 
     def test_check_grad_ingore_y(self):
         if paddle.is_compiled_with_xpu():
@@ -73,7 +70,7 @@ class TestElementwiseAddOp(XPUOpTest):
                 place, ['X'],
                 'Out',
                 no_grad_set=set("Y"),
-                max_relative_error=0.006)
+                max_relative_error=self.max_relative_error)
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -86,6 +83,9 @@ class TestElementwiseAddOp(XPUOpTest):
     def init_axis(self):
         self.axis = -1
 
+    def init_max_relative_error(self):
+        self.max_relative_error = 0.006
+
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
@@ -337,5 +337,170 @@ class TestAddOp(unittest.TestCase):
         self.assertEqual((np_z == z_expected).all(), True)
 
 
+######## fp16 test
+class TestElementwiseAddFP16Op(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def init_max_relative_error(self):
+        self.max_relative_error = 0.01
+
+
+class TestElementwiseAddOp_scalarFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_scalar2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_VectorFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.random((100, )).astype(self.dtype)
+        self.y = np.random.random((100, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+
+class TestElementwiseAddOp_broadcast_0FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 100, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 100, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 100)
+
+
+class TestElementwiseAddOp_broadcast_3FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_4FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(100, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_5FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestElementwiseAddOp_broadcast_6FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_broadcast_7FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
+        self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestElementwiseAddOp_rowwise_add_0FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_rowwise_add_1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_channelwise_addFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(1, 1, 100).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_xsize_lessthan_ysize_addFP16(
+        TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 12).astype(self.dtype)
+        self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = 2
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
index f5d3ace202692ab3ddccdec253d4c2cfab653071..59646f2db413e57c36b8a4743568525bf73d7dbf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
@@ -127,45 +127,23 @@ class Generator(object):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
 
     def test_check_grad_normal(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
 
     def test_check_grad_ignore_x(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=5e-2,
-                no_grad_set=set("X"))
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=5e-2, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=5e-2,
-                no_grad_set=set('Y'))
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=5e-2, no_grad_set=set('Y'))
 
 
 class TestMatmulOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
index bbdb0984ed68aa038fed055b1be341e7f1e21991..896821552c9f7a0d50ff4d1cdbcbb8223c2875be 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 import sys
 sys.path.append("..")
+from op_test_xpu import XPUOpTest
 from op_test import OpTest
 import paddle
 import paddle.fluid.core as core
@@ -27,22 +28,27 @@ from paddle.fluid import Program, program_guard
 np.random.seed(10)
 
 
-class TestMeanOp(OpTest):
+class TestMeanOp(XPUOpTest):
     def setUp(self):
         self.op_type = "mean"
-        self.dtype = np.float64
         self.init_dtype_type()
         self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
-        self.outputs = {'Out': np.mean(self.inputs["X"])}
+        self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)}
 
     def init_dtype_type(self):
-        pass
+        self.dtype = np.float32
 
     def test_check_output(self):
-        self.check_output()
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, atol=2e-3)
 
     def test_checkout_grad(self):
-        self.check_grad(['X'], 'Out')
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
 
 
 class TestMeanOpError(unittest.TestCase):
@@ -77,5 +83,23 @@ class TestXPUMeanOp(TestMeanOp):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+class TestXPUMeanOpFp16(TestMeanOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_checkout_grad(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['X'], 'Out', max_relative_error=1.e1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
index 1f74fa5e2d6852abdadac288639274b8be1ebf6e..761e5c2243c65927c60c25a55a908829315e786d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
@@ -18,27 +18,27 @@ import unittest
 import numpy as np
 import sys
 sys.path.append("..")
-from op_test import OpTest
+from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle
+from paddle.static import Program, program_guard
 
-paddle.enable_static()
-
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUScaleOp(OpTest):
+class TestXPUScaleOp(XPUOpTest):
     def setUp(self):
         self.op_type = "scale"
-        self.dtype = np.float32
+        self.init_type()
         self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
         self.attrs = {'scale': -2.3, 'use_xpu': True}
         self.outputs = {
             'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
         }
 
+    def init_type(self):
+        self.dtype = np.float32
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -50,5 +50,63 @@ class TestXPUScaleOp(OpTest):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+# class TestXPUScaleOpInt64(TestXPUScaleOp):
+#     def init_type(self):
+#         self.dtype = np.int64
+
+
+class TestScaleFp16Op(TestXPUScaleOp):
+    def init_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=0.002)
+
+    def test_check_grad(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(place, ["X"], "Out", max_relative_error=0.05)
+
+
+class TestScaleApiStatic(unittest.TestCase):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return paddle.scale(x, scale, bias)
+
+    def test_api(self):
+        paddle.enable_static()
+        input = np.random.random([2, 25]).astype("float32")
+        main_prog = Program()
+        with program_guard(main_prog, Program()):
+            x = paddle.static.data(name="x", shape=[2, 25], dtype="float32")
+            out = self._executed_api(x, scale=2.0, bias=3.0)
+
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
+        out = exe.run(main_prog, feed={"x": input}, fetch_list=[out])
+        self.assertEqual(np.array_equal(out[0], input * 2.0 + 3.0), True)
+
+
+class TestScaleInplaceApiStatic(TestScaleApiStatic):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return x.scale_(scale, bias)
+
+
+class TestScaleApiDygraph(unittest.TestCase):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return paddle.scale(x, scale, bias)
+
+    def test_api(self):
+        paddle.disable_static()
+        input = np.random.random([2, 25]).astype("float32")
+        x = paddle.to_tensor(input)
+        out = self._executed_api(x, scale=2.0, bias=3.0)
+        self.assertEqual(np.array_equal(out.numpy(), input * 2.0 + 3.0), True)
+        paddle.enable_static()
+
+
+class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return x.scale_(scale, bias)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
index 3bafbf649e6ce4efd2c7f593d421c37898538cc1..8ae588975a56ae311aee1990c13e9837ef77e2fa 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
@@ -13,27 +13,26 @@
 # limitations under the License.
 
 from __future__ import print_function
-
-import unittest
-import numpy as np
 import sys
 sys.path.append("..")
-from op_test import OpTest
+import unittest
+import numpy as np
+from op_test_xpu import XPUOpTest
 import paddle
+from paddle import enable_static
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-import paddle
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
 
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSumOp(OpTest):
+class TestSumOp(XPUOpTest):
     def setUp(self):
         self.op_type = "sum"
-        self.use_mkldnn = False
+        self.init_kernel_type()
         self.init_kernel_type()
         x0 = np.random.random((3, 40)).astype(self.dtype)
         x1 = np.random.random((3, 40)).astype(self.dtype)
@@ -41,21 +40,147 @@ class TestXPUSumOp(OpTest):
         self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
         y = x0 + x1 + x2
         self.outputs = {'Out': y}
-        self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True}
 
     def init_kernel_type(self):
         self.dtype = np.float32
 
     def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+        self.check_output()
 
     def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['x0'], 'Out')
+        self.check_grad(['x0'], 'Out')
+
+
+#----------- test fp16 -----------
+class TestFP16SumOp(TestSumOp):
+    def init_kernel_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        # if core.is_float16_supported(place):
+        self.check_output_with_place(place, atol=2e-2)
+
+    # FIXME: Because of the precision fp16, max_relative_error
+    # should be 0.15 here.
+    def test_check_grad(self):
+        place = core.XPUPlace(0)
+        # if core.is_float16_supported(place):
+        self.check_grad_with_place(
+            place, ['x0'], 'Out', max_relative_error=0.15)
+
+
+def create_test_sum_fp16_class(parent):
+    class TestSumFp16Case(parent):
+        def init_kernel_type(self):
+            self.dtype = np.float16
+
+        def test_w_is_selected_rows(self):
+            place = core.XPUPlace(0)
+            # if core.is_float16_supported(place):
+            for inplace in [True, False]:
+                self.check_with_place(place, inplace)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "SumFp16Test")
+    TestSumFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestSumFp16Case
+
+
+class API_Test_Add_n(unittest.TestCase):
+    def test_api(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input0 = fluid.layers.fill_constant(
+                shape=[2, 3], dtype='int64', value=5)
+            input1 = fluid.layers.fill_constant(
+                shape=[2, 3], dtype='int64', value=3)
+            expected_result = np.empty((2, 3))
+            expected_result.fill(8)
+            sum_value = paddle.add_n([input0, input1])
+            exe = fluid.Executor(fluid.XPUPlace(0))
+            result = exe.run(fetch_list=[sum_value])
+
+            self.assertEqual((result == expected_result).all(), True)
+
+        with fluid.dygraph.guard():
+            input0 = paddle.ones(shape=[2, 3], dtype='float32')
+            expected_result = np.empty((2, 3))
+            expected_result.fill(2)
+            sum_value = paddle.add_n([input0, input0])
+
+            self.assertEqual((sum_value.numpy() == expected_result).all(), True)
+
+
+class TestRaiseSumError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            fluid.layers.sum([11, 22])
+
+        self.assertRaises(TypeError, test_type)
+
+        def test_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            data2 = fluid.data(name="input2", shape=[10], dtype="int8")
+            fluid.layers.sum([data1, data2])
+
+        self.assertRaises(TypeError, test_dtype)
+
+        def test_dtype1():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            fluid.layers.sum(data1)
+
+        self.assertRaises(TypeError, test_dtype1)
+
+
+class TestRaiseSumsError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            fluid.layers.sums([11, 22])
+
+        self.assertRaises(TypeError, test_type)
+
+        def test_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            data2 = fluid.data(name="input2", shape=[10], dtype="int8")
+            fluid.layers.sums([data1, data2])
+
+        self.assertRaises(TypeError, test_dtype)
+
+        def test_dtype1():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            fluid.layers.sums(data1)
+
+        self.assertRaises(TypeError, test_dtype1)
+
+        def test_out_type():
+            data1 = fluid.data(name="input1", shape=[10], dtype="float32")
+            data2 = fluid.data(name="input2", shape=[10], dtype="float32")
+            fluid.layers.sums([data1, data2], out=[10])
+
+        self.assertRaises(TypeError, test_out_type)
+
+        def test_out_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="float32")
+            data2 = fluid.data(name="input2", shape=[10], dtype="float32")
+            out = fluid.data(name="out", shape=[10], dtype="int8")
+            fluid.layers.sums([data1, data2], out=out)
+
+        self.assertRaises(TypeError, test_out_dtype)
+
+
+class TestSumOpError(unittest.TestCase):
+    def test_errors(self):
+        def test_empty_list_input():
+            with fluid.dygraph.guard():
+                fluid.core.ops.sum([])
+
+        def test_list_of_none_input():
+            with fluid.dygraph.guard():
+                fluid.core.ops.sum([None])
+
+        self.assertRaises(Exception, test_empty_list_input)
+        self.assertRaises(Exception, test_list_of_none_input)
 
 
 if __name__ == "__main__":
+    enable_static()
     unittest.main()