Unverified · Commit 23032590 · authored by wangzhen38, committed by GitHub

[BUG Fixs] adadelta lr support (#49732)

Parent: 17fec4e9
@@ -39,6 +39,7 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
     AddInput("AvgSquaredUpdate",
              "(Tensor) Input average of squared parameter updates");
+    AddInput("LearningRate", "(Tensor) Learning rate");
     AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
     AddOutput("ParamOut", "(Tensor) Output parameter");
......
@@ -220,7 +220,12 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}},
     {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}},
     {"adadelta",
-     {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}},
+     {"Param",
+      "Grad",
+      "AvgSquaredGrad",
+      "AvgSquaredUpdate",
+      "LearningRate",
+      "MasterParam"}},
     {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}},
     {"nce",
      {"Input",
......
@@ -11,7 +11,7 @@
     backward : abs_grad

 - op : adadelta_
-  args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor master_param, float rho, float epsilon, bool multi_precision)
+  args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision)
   output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out)
   infer_meta :
     func : AdadeltaInferMeta
......
@@ -40,6 +40,7 @@ void AdadeltaInferMeta(const MetaTensor& param,
                        const MetaTensor& grad,
                        const MetaTensor& avg_squared_grad,
                        const MetaTensor& avg_squared_update,
+                       const MetaTensor& learning_rate,
                        const MetaTensor& master_param,
                        float rho,
                        float epsilon,
@@ -48,6 +49,11 @@ void AdadeltaInferMeta(const MetaTensor& param,
                        MetaTensor* avg_squared_grad_out,
                        MetaTensor* avg_squared_update_out,
                        MetaTensor* master_param_out) {
+  auto lr_dims = learning_rate.dims();
+  PADDLE_ENFORCE_EQ(
+      phi::product(lr_dims),
+      1,
+      phi::errors::InvalidArgument("LearningRate should have one element"));
   auto param_dims = param.dims();
   PADDLE_ENFORCE_EQ(
       param_dims,
......
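The new InferMeta check above requires the LearningRate input to hold exactly one element. As a hedged illustration only (the dict fragment below is hypothetical, mirroring the feed the updated unit tests use later in this commit), a valid feed looks like:

```python
import numpy as np

# Hypothetical feed fragment: the InferMeta check rejects any LearningRate
# tensor whose number of elements is not exactly 1.
learning_rate = 1.0
inputs = {
    "LearningRate": np.array([learning_rate]).astype("float32"),  # shape (1,)
}
assert inputs["LearningRate"].size == 1
```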
@@ -43,6 +43,7 @@ void AdadeltaInferMeta(const MetaTensor& param,
                        const MetaTensor& grad,
                        const MetaTensor& avg_squared_grad,
                        const MetaTensor& avg_squared_update,
+                       const MetaTensor& learning_rate,
                        const MetaTensor& master_param,
                        float rho,
                        float epsilon,
......
@@ -24,6 +24,7 @@ void AdadeltaKernel(const Context& dev_ctx,
                     const DenseTensor& grad,
                     const DenseTensor& avg_squared_grad,
                     const DenseTensor& avg_squared_update,
+                    const DenseTensor& learning_rate,
                     const paddle::optional<DenseTensor>& master_param,
                     float rho,
                     float epsilon,
......
@@ -13,11 +13,14 @@
 // limitations under the License.

 #pragma once

+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/kernels/adadelta_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"

 namespace phi {
@@ -27,6 +30,7 @@ void AdadeltaKernel(const Context& dev_ctx,
                     const DenseTensor& grad,
                     const DenseTensor& avg_squared_grad,
                     const DenseTensor& avg_squared_update,
+                    const DenseTensor& learning_rate,
                     const paddle::optional<DenseTensor>& master_param,
                     float rho,
                     float epsilon,
@@ -56,29 +60,36 @@ void AdadeltaKernel(const Context& dev_ctx,
   auto eigen_avg_squared_update_out =
       EigenVector<MPDType>::Flatten(*avg_squared_update_out);
   auto& place = *dev_ctx.eigen_device();
   auto eigen_grad_cast = eigen_grad.template cast<MPDType>();
   eigen_avg_squared_grad_out.device(place) =
       rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad_cast.square();
-  auto update = -((eigen_avg_squared_update + epsilon_) /
-                  (eigen_avg_squared_grad_out + epsilon_))
-                     .sqrt() *
-                eigen_grad_cast;
-  eigen_avg_squared_update_out.device(place) =
-      rho_ * eigen_avg_squared_update + (1 - rho_) * update.square();
+  auto update =
+      -(((eigen_avg_squared_update + epsilon_).sqrt()) /
+        ((eigen_avg_squared_grad_out + epsilon_).sqrt()) * eigen_grad_cast);
+  Eigen::DSizes<int, 1> m_dsize(avg_squared_update_out->numel());
+  if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) {
+    auto* lr = learning_rate.data<T>();
+    eigen_param_out.device(place) =
+        eigen_param + lr[0] * update.template cast<T>();
+  } else {
+    auto lr = EigenVector<MPDType>::Flatten(learning_rate);
     if (multi_precision) {
       auto eigen_master_param_out =
           EigenVector<MPDType>::Flatten(*master_param_outs);
       auto eigen_master_param = EigenVector<MPDType>::Flatten(*master_param);
-      eigen_master_param_out.device(place) = eigen_master_param + update;
-      eigen_param_out.device(place) =
-          (eigen_param.template cast<MPDType>() + update).template cast<T>();
+      eigen_master_param_out.device(place) =
+          eigen_master_param + lr.broadcast(m_dsize) * update;
+      eigen_param_out.device(place) = (eigen_param.template cast<MPDType>() +
+                                       lr.broadcast(m_dsize) * update)
+                                          .template cast<T>();
     } else {
-      eigen_param_out.device(place) = eigen_param + update.template cast<T>();
+      eigen_param_out.device(place) =
+          eigen_param + (lr.broadcast(m_dsize) * update).template cast<T>();
     }
+  }
+  eigen_avg_squared_update_out.device(place) =
+      rho_ * eigen_avg_squared_update + (1 - rho_) * update.square();
 }
 }  // namespace phi
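For reference, the update rule the revised kernel implements can be written as below. This is a sketch of the math in the Eigen expressions above, not text from the patch: ρ is rho, ε is epsilon, g_t is the gradient, and η is the newly wired learning rate (the old kernel applied the update with an implicit η = 1).

```latex
\[
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta x_t &= -\,\frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
\theta_{t+1} &= \theta_t + \eta\, \Delta x_t \\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2
\end{aligned}
\]
```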
@@ -25,6 +25,7 @@ void AdadeltaKernel(const Context& dev_ctx,
                     const DenseTensor& grad,
                     const DenseTensor& avg_squared_grad,
                     const DenseTensor& avg_squared_update,
+                    const DenseTensor& learning_rate,
                     const paddle::optional<DenseTensor>& master_param,
                     float rho,
                     float epsilon,
......
@@ -18,9 +18,13 @@ namespace phi {

 KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) {
   if (ctx.IsDenseTensorInput("Grad")) {
-    return KernelSignature(
-        "adadelta",
-        {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"},
+    return KernelSignature("adadelta",
+                           {"Param",
+                            "Grad",
+                            "AvgSquaredGrad",
+                            "AvgSquaredUpdate",
+                            "LearningRate",
+                            "MasterParam"},
                            {"rho", "epsilon", "multi_precision"},
                            {"ParamOut",
                             "AvgSquaredGradOut",
......
@@ -3215,6 +3215,7 @@ class AdadeltaOptimizer(Optimizer):
                 param_and_grad[1],
                 avg_squared_grad_acc,
                 avg_squared_update_acc,
+                self._create_param_lr(param_and_grad),
                 master_weight,
                 self._rho,
                 self._epsilon,
@@ -3227,6 +3228,7 @@ class AdadeltaOptimizer(Optimizer):
                 "Grad": param_and_grad[1],
                 "AvgSquaredGrad": avg_squared_grad_acc,
                 "AvgSquaredUpdate": avg_squared_update_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
             }
             outputs = {
                 "ParamOut": param_and_grad[0],
......
@@ -26,6 +26,7 @@ def adadelta_wrapper(
     Grad,
     AvgSquaredGrad,
     AvgSquaredUpdate,
+    LearningRate,
     master_weight=None,
     rho=0.95,
     epsilon=1e-6,
@@ -35,12 +36,13 @@ def adadelta_wrapper(
         Grad,
         AvgSquaredGrad,
         AvgSquaredUpdate,
+        LearningRate,
         None,
         rho,
         epsilon,
         False,
     )
-    return Param, AvgSquaredGrad, AvgSquaredUpdate
+    return Param, AvgSquaredGrad, AvgSquaredUpdate, LearningRate


 class TestAdadeltaOp1(OpTest):
@@ -58,11 +60,13 @@ class TestAdadeltaOp1(OpTest):
         rho = 0.95
         epsilon = 1e-6
+        learning_rate = 1.0
         self.inputs = {
             'Param': param,
             'Grad': grad,
             'AvgSquaredGrad': avg_squared_grad,
             'AvgSquaredUpdate': avg_squared_update,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
         }

         self.attrs = {'rho': rho, 'epsilon': epsilon}
@@ -113,12 +117,13 @@ class TestAdadeltaOp2(OpTest):
         epsilon = 1e-6
         self.attrs = {'rho': rho, 'epsilon': epsilon}
+        learning_rate = 1.0
         self.inputs = {
             'Param': param,
             'Grad': grad,
             'AvgSquaredGrad': avg_squared_grad,
             'AvgSquaredUpdate': avg_squared_update,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
         }

         avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * np.square(
......
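The expected-output computation in the test is truncated above. As a hedged reference only (the helper name `adadelta_reference` is hypothetical and not part of the patch), a NumPy sketch of the values such a test would check against, including the new learning-rate scaling, is:

```python
import numpy as np

# Minimal NumPy sketch of the Adadelta math implemented by the kernel above,
# with the learning rate applied to the parameter step.
def adadelta_reference(param, grad, avg_sq_grad, avg_sq_update,
                       learning_rate=1.0, rho=0.95, epsilon=1e-6):
    avg_sq_grad_out = rho * avg_sq_grad + (1 - rho) * np.square(grad)
    update = -np.sqrt((avg_sq_update + epsilon) / (avg_sq_grad_out + epsilon)) * grad
    param_out = param + learning_rate * update
    avg_sq_update_out = rho * avg_sq_update + (1 - rho) * np.square(update)
    return param_out, avg_sq_grad_out, avg_sq_update_out
```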
@@ -197,6 +197,7 @@ class Adadelta(Optimizer):
                 param_and_grad[1],
                 avg_squared_grad_acc,
                 avg_squared_update_acc,
+                self._create_param_lr(param_and_grad),
                 master_weight,
                 self._rho,
                 self._epsilon,
@@ -213,6 +214,7 @@ class Adadelta(Optimizer):
                 "Grad": param_and_grad[1],
                 "AvgSquaredGrad": avg_squared_grad_acc,
                 "AvgSquaredUpdate": avg_squared_update_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
             }
             outputs = {
                 "ParamOut": param_and_grad[0],
......
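With the Python wrappers above forwarding the learning rate to the op, the user-facing API is unchanged. A minimal usage sketch (the Linear model, toy data, and hyperparameter values are assumptions for illustration, not part of the patch):

```python
import paddle

# Hypothetical toy model; any trainable model works the same way.
model = paddle.nn.Linear(10, 3)
opt = paddle.optimizer.Adadelta(
    learning_rate=0.01,   # now fed to the C++ op as the LearningRate tensor
    rho=0.95,
    epsilon=1e-6,
    parameters=model.parameters(),
)

x = paddle.randn([4, 10])
loss = model(x).mean()
loss.backward()
opt.step()        # applies the Adadelta update scaled by learning_rate
opt.clear_grad()
```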