/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/kernels/lamb_kernel.h"

namespace paddle {
namespace operators {

class LambOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

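  // The kernel type is keyed off the data type of "Param", so e.g. an FP16
  // parameter selects the FP16 LAMB kernel regardless of the dtypes of the
  // auxiliary inputs.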
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type =
        OperatorWithKernel::IndicateVarDataType(ctx, "Param");
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
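
  // Skip the data transform for Beta1Pow and Beta2Pow by reporting them as
  // already matching the expected kernel type: they are small accumulators
  // that may stay on CPU even when the op itself runs on GPU.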
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name,
      const framework::Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "Beta1Pow" || var_name == "Beta2Pow") {
      return expected_kernel_type;
    } else {
      return framework::OpKernelType(
          expected_kernel_type.data_type_, tensor.place(), tensor.layout());
    }
  }
};

class LambOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Param",
             "(LoDTensor, default LoDTensor<float>) "
             "Input parameter that has to be updated.");
    AddInput("Grad",
             "(LoDTensor, default LoDTensor<float>) "
             "Input gradient of the parameter.");
    AddInput("LearningRate", "(Tensor) Learning rate.");
    AddInput("Moment1", "(Tensor) Input first moment.");
    AddInput("Moment2", "(Tensor) Input second moment.");
    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator.");
    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator.");
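    // With multi_precision enabled, MasterParam is expected to hold an FP32
    // copy of an FP16 Param so the update can be accumulated in FP32.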
    AddInput("MasterParam",
             "(LoDTensor, default LoDTensor<float>) "
             "Input master parameter that has to be updated.")
        .AsDispensable();
    AddInput(
        "SkipUpdate",
        "(Tensor) Input tensor to determine whether to update the parameter.")
        .AsDispensable();

    AddOutput("ParamOut", "(Tensor) Output parameter.");
    AddOutput("Moment1Out", "(Tensor) Output first moment.");
    AddOutput("Moment2Out", "(Tensor) Output second moment.");
    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator")
        .AsDispensable();
    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator")
        .AsDispensable();
    AddOutput("MasterParamOut", "(Tensor) Output master parameter.")
        .AsDispensable();
    AddAttr<float>("weight_decay", "(float) Weight decay rate.");
    AddAttr<float>("beta1",
                   "(float, default 0.9) The exponential decay rate for the "
                   "1st moment estimates.")
        .SetDefault(0.9);
    AddAttr<float>("beta2",
                   "(float, default 0.999) The exponential decay rate for the "
                   "2nd moment estimates.")
        .SetDefault(0.999);
    AddAttr<float>("epsilon",
                   "(float, default 1.0e-6) "
                   "Constant for numerical stability.")
        .SetDefault(1.0e-6f);
    AddAttr<bool>(
        "multi_precision",
        "(bool, default false) Whether to enable multi-precision mode.")
        .SetDefault(false);

    AddComment(R"DOC(
LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

LAMB Optimizer is designed to scale up the batch size of training without losing
accuracy. It supports adaptive element-wise updating and accurate layer-wise
correction. For more information, please refer to https://arxiv.org/abs/1904.00962.

The updating of parameters follows:

$$
m_t &= \beta_1 m_{t - 1} + (1 - \beta_1) g_t \\

v_t &= \beta_2 v_{t - 1} + (1 - \beta_2) g_t^2 \\

m_t &= \frac{m_t}{1 - \beta_1^t} \\

v_t &= \frac{v_t}{1 - \beta_2^t} \\

r_t &= \frac{m_t}{\sqrt{v_t} + \epsilon} \\

w_t &= w_{t - 1} - \eta_t \frac{\left \| w_{t - 1} \right \|}{\left \| r_t + \lambda w_{t - 1} \right \|} (r_t + \lambda w_{t - 1})
$$

where $m$ is the 1st moment, $v$ the 2nd moment, $\eta$ the
learning rate, and $\lambda$ the weight decay rate.
)DOC");
  }
};
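
// For reference, one LAMB step for a single layer, mirroring the formulas in
// the comment above. This is an illustrative sketch only (the names below are
// hypothetical); the actual element-wise computation lives in phi::LambKernel:
//
//   m = beta1 * m + (1 - beta1) * g;                  // update 1st moment
//   v = beta2 * v + (1 - beta2) * g * g;              // update 2nd moment
//   m_hat = m / (1 - pow(beta1, t));                  // bias correction
//   v_hat = v / (1 - pow(beta2, t));
//   r = m_hat / (sqrt(v_hat) + epsilon) + weight_decay * w;
//   trust_ratio = l2_norm(w) / l2_norm(r);            // layer-wise scalars
//   w = w - learning_rate * trust_ratio * r;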

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
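// Shape and dtype inference is shared with the phi kernel via
// phi::LambInferMeta; LAMB is an optimizer op, so no gradient op is
// registered (hence the EmptyGradOpMaker entries below).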
DECLARE_INFER_SHAPE_FUNCTOR(lamb,
                            LambInferMetaFunctor,
                            PD_INFER_META(phi::LambInferMeta));
REGISTER_OPERATOR(
    lamb,
    ops::LambOp,
    ops::LambOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    LambInferMetaFunctor);

/* ==========================  register checkpoint ===========================*/
REGISTER_OP_VERSION(lamb).AddCheckpoint(
    R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC",
    paddle::framework::compatible::OpVersionDesc()
        .NewInput("Beta1PowOut",
                  "The Output beta1 power accumulator. 'Beta1PowOut' is "
                  "dispensable.")
        .NewInput("Beta2PowOut",
                  "The Output beta2 power accumulator. 'Beta2PowOut' is "
                  "dispensable."));