diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 83411a68f0847df1f382fb055e24f6298f52abf9..e83bdef327891ad72668e608bb731ded7478b50a 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220331")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220402")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index e92e160c7ae3be6ce9dfc5b3264aa9064d211ea6..fb4c9937611e733593acf3a768e99dab8cf03b1b 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -117,12 +117,14 @@ endif()
 cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
 
 set(BRPC_DEPS "")
-if(WITH_PSLIB OR WITH_PSCORE)
-  if(NOT WITH_HETERPS)
-    set(BRPC_DEPS brpc ssl crypto)
-  endif()
+if(WITH_PSCORE)
+  set(BRPC_DEPS brpc ssl crypto)
+endif()
+if(WITH_PSLIB)
   if(WITH_PSLIB_BRPC)
     set(BRPC_DEPS pslib_brpc)
+  elseif(NOT WITH_HETERPS)
+    set(BRPC_DEPS brpc ssl crypto)
   endif()
 endif()
 
diff --git a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
index 5624312d9a728496e57321c71be75fd8063b884d..6897213c91a34350c10a405329c2ffff4325c495 100644
--- a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 #include <string>
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
 namespace operators {
@@ -33,6 +34,13 @@ class MomentumOpXPUKernel : public framework::OpKernel<T> {
     velocity_out->mutable_data<T>(ctx.GetPlace());
     auto* lr = learning_rate->data<float>();
 
+    auto regularization_method = ctx.Attr<std::string>("regularization_method");
+    auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
+    if (regularization_method != "l2_decay") {
+      // only support l2_decay
+      regularization_coeff = 0.0f;
+    }
+
     auto* grad_var = ctx.InputVar("Grad");
     PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
                       platform::errors::PermissionDenied(
@@ -44,28 +52,16 @@ class MomentumOpXPUKernel : public framework::OpKernel<T> {
     auto grad = ctx.Input<framework::Tensor>("Grad");
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    // int momentum(Context* ctx, const T* param, const T* velocity, const T*
+    // grad, T* param_out, T* velocity_out, int len, const float* lr, int
+    // use_nesterov, float mu, float l2_weight_decay);
     int r = xpu::momentum(dev_ctx.x_context(), param->data<float>(),
                           velocity->data<float>(), grad->data<float>(),
                           param_out->data<float>(), velocity_out->data<float>(),
-                          param_out->numel(), lr, use_nesterov, mu);
-    if (r == xpu::Error_t::INVALID_PARAM) {
-      PADDLE_ENFORCE_EQ(
-          r, xpu::Error_t::SUCCESS,
-          platform::errors::InvalidArgument(
-              "XPU kernel error of MomentumOp, error message: INVALID_PARAM, "
-              "please check your input & output."));
-    } else if (r == xpu::Error_t::RUNTIME_ERROR) {
-      PADDLE_ENFORCE_EQ(
-          r, xpu::Error_t::SUCCESS,
-          platform::errors::Unavailable(
-              "XPU kernel error of MomentumOp, error message: RUNTIME_ERROR, "
-              "please check whether Baidu Kunlun card is properly installed."));
-    } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
-      PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                        platform::errors::ResourceExhausted(
-                            "XPU kernel error of MomentumOp, error message: "
-                            "NO_ENOUGH_WORKSPACE, XPU has no enough memory."));
-    }
+                          param_out->numel(), lr, use_nesterov, mu,
+                          regularization_coeff);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "momentum");
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
index ccee79e8cd77aeedbc23f5d3b188d6d655f90ec3..f7c1f0041e8050011bd814d7ab6a3dcd32dae913 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,52 +17,150 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import sys
-import os
 sys.path.append("..")
-from op_test import OpTest
-import paddle
-from paddle.fluid import core
-from paddle.fluid.op import Operator
 
-
-class TestMomentumOp1(OpTest):
-    def setUp(self):
-        self.op_type = "momentum"
-        self.dtype = np.float32
-        self.init_dtype()
+import paddle
+import paddle.fluid.core as core
 
-        param = np.random.random((123, 321)).astype(self.dtype)
-        grad = np.random.random((123, 321)).astype(self.dtype)
-        velocity = np.zeros((123, 321)).astype(self.dtype)
-        learning_rate = np.array([0.001]).astype(self.dtype)
-        mu = 0.0001
-        use_nesterov = False
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Velocity': velocity,
-            'LearningRate': learning_rate
-        }
+paddle.enable_static()
 
-        self.attrs = {'mu': mu}
 
+def calculate_momentum_by_numpy(param, grad, mu, velocity, use_nesterov,
+                                learning_rate, regularization_method,
+                                regularization_coeff):
+    if regularization_method == "l2_decay":
+        grad = grad + regularization_coeff * param
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - (grad + velocity_out * mu) * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+    else:
         velocity_out = mu * velocity + grad
         if use_nesterov:
             param_out = param - grad * learning_rate - \
                         velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
+    return param_out, velocity_out
+
+
+class XPUTestMomentumOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'momentum'
+        self.use_dynamic_create_class = False
+
+    class TestMomentumOPBase(XPUOpTest):
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.xpu_version = core.get_xpu_device_version(0)
+            self.init_dtype()
+            self.set_case()
+
+        def set_case(self):
+            self.op_type = 'momentum'
+            self.dtype = self.in_type
+            self.init_config()
+
+            self.param = np.random.uniform(-1, 1,
+                                           self.input_shape).astype(self.dtype)
+            self.grad = np.random.uniform(-1, 1,
+                                          self.input_shape).astype(self.dtype)
+            self.velocity = np.random.uniform(
+                -1, 1, self.input_shape).astype(self.dtype)
+
+            param_out, velocity_out = calculate_momentum_by_numpy(
+                param=self.param,
+                grad=self.grad,
+                mu=self.mu,
+                velocity=self.velocity,
+                use_nesterov=self.use_nesterov,
+                learning_rate=self.learning_rate,
+                regularization_method=self.regularization_method,
+                regularization_coeff=self.regularization_coeff)
+            self.inputs = {
+                'Param': self.param,
+                'Grad': self.grad,
+                'Velocity': self.velocity,
+                'LearningRate': self.learning_rate,
+            }
+            self.attrs = {
+                'use_xpu': True,
+                'mu': self.mu,
+                'use_nesterov': self.use_nesterov,
+                'regularization_method': self.regularization_method,
+                'regularization_coeff': self.regularization_coeff
+            }
+            self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+        def init_dtype(self):
+            self.dtype = np.float32
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def init_config(self):
+            self.input_shape = [864]
+            self.learning_rate = np.array([0.001]).astype(self.dtype)
+            self.mu = 0.0001
+            self.use_nesterov = False
+            self.regularization_method = None
+            self.regularization_coeff = 0
+
+    class XPUTestMomentum1(TestMomentumOPBase):
+        def init_config(self):
+            self.input_shape = [2, 768]
+            self.learning_rate = np.array([0.002]).astype(self.dtype)
+            self.mu = 0.001
+            self.use_nesterov = False
+            self.regularization_method = None
+            self.regularization_coeff = 0
+
+    class XPUTestMomentum2(TestMomentumOPBase):
+        def init_config(self):
+            self.input_shape = [3, 8, 4096]
+            self.learning_rate = np.array([0.005]).astype(self.dtype)
+            self.mu = 0.002
+            self.use_nesterov = True
+            self.regularization_method = None
+            self.regularization_coeff = 0
 
-        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+    class XPUTestMomentum3(TestMomentumOPBase):
+        def init_config(self):
+            self.input_shape = [1024]
+            self.learning_rate = np.array([0.01]).astype(self.dtype)
+            self.mu = 0.0001
+            self.use_nesterov = False
+            if self.xpu_version != core.XPUVersion.XPU1:
+                self.regularization_method = "l2_decay"
+                self.regularization_coeff = 0.005
+            else:
+                # regularization not supported on XPU1
+                self.regularization_method = None
+                self.regularization_coeff = 0
 
-    def init_dtype(self):
-        pass
+    class XPUTestMomentum4(TestMomentumOPBase):
+        def init_config(self):
+            self.input_shape = [2, 2, 255]
+            self.learning_rate = np.array([0.0005]).astype(self.dtype)
+            self.mu = 0.005
+            self.use_nesterov = True
+            if self.xpu_version != core.XPUVersion.XPU1:
+                self.regularization_method = "l2_decay"
+                self.regularization_coeff = 0.005
+            else:
+                # regularization not supported on XPU1
+                self.regularization_method = None
+                self.regularization_coeff = 0
 
-    def test_check_output_with_place(self):
-        self.check_output_with_place(paddle.XPUPlace(0))
 
+support_types = get_xpu_op_support_types('momentum')
+for stype in support_types:
+    create_test_class(globals(), XPUTestMomentumOP, stype)
 
 if __name__ == "__main__":
-    paddle.enable_static()
     unittest.main()