Unverified commit 533c649f authored by: H houj04, committed by: GitHub

momentum support l2decay for xpu. test=kunlun (#41325)

* momentum support l2decay for xpu. test=kunlun

* fix include file. test=kunlun

* fix cmake for device_worker. test=kunlun
Parent 56e72b20
......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220331")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220402")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -117,12 +117,14 @@ endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
set(BRPC_DEPS "")
if(WITH_PSLIB OR WITH_PSCORE)
if(NOT WITH_HETERPS)
set(BRPC_DEPS brpc ssl crypto)
endif()
if(WITH_PSCORE)
set(BRPC_DEPS brpc ssl crypto)
endif()
if(WITH_PSLIB)
if(WITH_PSLIB_BRPC)
set(BRPC_DEPS pslib_brpc)
elseif(NOT WITH_HETERPS)
set(BRPC_DEPS brpc ssl crypto)
endif()
endif()
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
......@@ -33,6 +34,13 @@ class MomentumOpXPUKernel : public framework::OpKernel<T> {
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* lr = learning_rate->data<T>();
auto regularization_method = ctx.Attr<std::string>("regularization_method");
auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
if (regularization_method != "l2_decay") {
// only support l2_decay
regularization_coeff = 0.0f;
}
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::PermissionDenied(
......@@ -44,28 +52,16 @@ class MomentumOpXPUKernel : public framework::OpKernel<T> {
auto grad = ctx.Input<framework::Tensor>("Grad");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// int momentum(Context* ctx, const T* param, const T* velocity, const T*
// grad, T* param_out, T* velocity_out, int len, const float* lr, int
// use_nesterov, float mu, float l2_weight_decay);
int r = xpu::momentum(dev_ctx.x_context(), param->data<float>(),
velocity->data<float>(), grad->data<float>(),
param_out->data<float>(), velocity_out->data<float>(),
param_out->numel(), lr, use_nesterov, mu);
if (r == xpu::Error_t::INVALID_PARAM) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::InvalidArgument(
"XPU kernel error of MomentumOp, error message: INVALID_PARAM, "
"please check your input & output."));
} else if (r == xpu::Error_t::RUNTIME_ERROR) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::Unavailable(
"XPU kernel error of MomentumOp, error message: RUNTIME_ERROR, "
"please check whether Baidu Kunlun card is properly installed."));
} else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::ResourceExhausted(
"XPU kernel error of MomentumOp, error message: "
"NO_ENOUGH_WORKSPACE, XPU has no enough memory."));
}
param_out->numel(), lr, use_nesterov, mu,
regularization_coeff);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "momentum");
}
};
} // namespace operators
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,52 +17,150 @@ from __future__ import print_function
import unittest
import numpy as np
import sys
import os
sys.path.append("..")
from op_test import OpTest
import paddle
from paddle.fluid import core
from paddle.fluid.op import Operator
class TestMomentumOp1(OpTest):
def setUp(self):
self.op_type = "momentum"
self.dtype = np.float32
self.init_dtype()
import paddle
import paddle.fluid.core as core
param = np.random.random((123, 321)).astype(self.dtype)
grad = np.random.random((123, 321)).astype(self.dtype)
velocity = np.zeros((123, 321)).astype(self.dtype)
learning_rate = np.array([0.001]).astype(self.dtype)
mu = 0.0001
use_nesterov = False
from op_test import OpTest
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
self.inputs = {
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
}
paddle.enable_static()
self.attrs = {'mu': mu}
def calculate_momentum_by_numpy(param, grad, mu, velocity, use_nesterov,
                                learning_rate, regularization_method,
                                regularization_coeff):
    """Reference (numpy) implementation of the momentum optimizer update.

    Mirrors the momentum op: optionally folds L2 weight decay into the
    gradient, advances the velocity, then applies either the plain or the
    Nesterov parameter update. Returns (param_out, velocity_out).
    """
    uses_l2_decay = regularization_method == "l2_decay"
    # With l2_decay the effective gradient carries the decay term; otherwise
    # the raw gradient is used unchanged.
    effective_grad = (
        grad + regularization_coeff * param if uses_l2_decay else grad)
    velocity_out = mu * velocity + effective_grad
    if not use_nesterov:
        param_out = param - learning_rate * velocity_out
    elif uses_l2_decay:
        # Keep the exact expression the l2_decay branch historically used so
        # floating-point results are bit-compatible.
        param_out = param - (effective_grad + velocity_out * mu) * learning_rate
    else:
        param_out = param - effective_grad * learning_rate - \
            velocity_out * mu * learning_rate
    return param_out, velocity_out
# Wrapper that the XPU test framework expands into one concrete test class
# per supported dtype (see create_test_class at module level).
# NOTE(review): indentation was lost in this capture and a few stale diff
# remnants are interleaved below (flagged inline) — reconcile against the
# upstream file before relying on this text.
class XPUTestMomentumOP(XPUOpTestWrapper):
def __init__(self):
# Op under test; used by the framework to look up supported dtypes.
self.op_name = 'momentum'
self.use_dynamic_create_class = False
# Base OpTest: builds random inputs, computes the numpy reference output,
# and checks the XPU kernel against it.
class TestMomentumOPBase(XPUOpTest):
def setUp(self):
self.place = paddle.XPUPlace(0)
# XPU generation matters: l2_decay is only exercised on XPU2+ below.
self.xpu_version = core.get_xpu_device_version(0)
self.init_dtype()
self.set_case()
def set_case(self):
self.op_type = 'momentum'
# in_type is injected per-dtype by create_test_class.
self.dtype = self.in_type
self.init_config()
self.param = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
self.grad = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
self.velocity = np.random.uniform(
-1, 1, self.input_shape).astype(self.dtype)
# Reference result from the pure-numpy implementation above.
param_out, velocity_out = calculate_momentum_by_numpy(
param=self.param,
grad=self.grad,
mu=self.mu,
velocity=self.velocity,
use_nesterov=self.use_nesterov,
learning_rate=self.learning_rate,
regularization_method=self.regularization_method,
regularization_coeff=self.regularization_coeff)
self.inputs = {
'Param': self.param,
'Grad': self.grad,
'Velocity': self.velocity,
'LearningRate': self.learning_rate,
}
self.attrs = {
'use_xpu': True,
'mu': self.mu,
'use_nesterov': self.use_nesterov,
'regularization_method': self.regularization_method,
'regularization_coeff': self.regularization_coeff
}
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place)
def init_config(self):
# Default case: 1-D input, plain momentum, no regularization.
self.input_shape = [864]
self.learning_rate = np.array([0.001]).astype(self.dtype)
self.mu = 0.0001
self.use_nesterov = False
self.regularization_method = None
self.regularization_coeff = 0
# 2-D input, plain momentum.
class XPUTestMomentum1(TestMomentumOPBase):
def init_config(self):
self.input_shape = [2, 768]
self.learning_rate = np.array([0.002]).astype(self.dtype)
self.mu = 0.001
self.use_nesterov = False
self.regularization_method = None
self.regularization_coeff = 0
# 3-D input with Nesterov momentum.
class XPUTestMomentum2(TestMomentumOPBase):
def init_config(self):
self.input_shape = [3, 8, 4096]
self.learning_rate = np.array([0.005]).astype(self.dtype)
self.mu = 0.002
self.use_nesterov = True
self.regularization_method = None
self.regularization_coeff = 0
# NOTE(review): the next line looks like a stale diff remnant (outputs are
# assembled in set_case, not here) — verify against the upstream file.
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
# l2_decay regularization, guarded by XPU generation.
class XPUTestMomentum3(TestMomentumOPBase):
def init_config(self):
self.input_shape = [1024]
self.learning_rate = np.array([0.01]).astype(self.dtype)
self.mu = 0.0001
self.use_nesterov = False
if self.xpu_version != core.XPUVersion.XPU1:
self.regularization_method = "l2_decay"
self.regularization_coeff = 0.005
else:
# regularization not supported on XPU1
self.regularization_method = None
self.regularization_coeff = 0
# NOTE(review): this no-op init_dtype appears to be a stale diff remnant
# (the base class already sets the dtype) — verify against upstream.
def init_dtype(self):
pass
# Nesterov momentum combined with l2_decay, guarded by XPU generation.
class XPUTestMomentum4(TestMomentumOPBase):
def init_config(self):
self.input_shape = [2, 2, 255]
self.learning_rate = np.array([0.0005]).astype(self.dtype)
self.mu = 0.005
self.use_nesterov = True
if self.xpu_version != core.XPUVersion.XPU1:
self.regularization_method = "l2_decay"
self.regularization_coeff = 0.005
else:
# regularization not supported on XPU1
self.regularization_method = None
self.regularization_coeff = 0
# NOTE(review): duplicate check method below looks like a stale diff
# remnant (base class already defines test_check_output) — verify upstream.
def test_check_output_with_place(self):
self.check_output_with_place(paddle.XPUPlace(0))
# Expand the wrapper into one concrete unittest class per dtype the XPU
# 'momentum' op supports, registering them in this module's globals.
support_types = get_xpu_op_support_types('momentum')
for stype in support_types:
create_test_class(globals(), XPUTestMomentumOP, stype)
if __name__ == "__main__":
# OpTest-based tests run under static-graph mode.
paddle.enable_static()
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register