Unverified commit a8a2b7f4, authored by niuliling123, committed by GitHub

Add multiprecision for adadelta op (#50131)

Parent a1006b2b
@@ -39,12 +39,17 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
    AddInput("AvgSquaredUpdate",
             "(Tensor) Input average of squared parameter updates");
    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
    AddOutput("ParamOut", "(Tensor) Output parameter");
    AddOutput("AvgSquaredGradOut",
              "(Tensor) Output average of squared gradient");
    AddOutput("AvgSquaredUpdateOut",
              "(Tensor) Output average of squared parameter updates");
    AddOutput("MasterParamOut",
              "The updated FP32 master weight for AMP. "
              "It shares memory with Input(MasterParam).")
        .AsDispensable();
    AddAttr<float>("rho",
                   "(float, default 0.95) Exponential decay rate "
@@ -54,6 +59,10 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(float, default 1.0e-6) Constant for "
                   "numerical stability")
        .SetDefault(1.0e-6f);
    AddAttr<bool>("multi_precision",
                  "(bool, default false) "
                  "Whether to use multi-precision during weight updating.")
        .SetDefault(false);
    AddComment(R"DOC(
Adadelta Optimizer.
......
@@ -206,6 +206,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}},
    {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}},
    {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}},
    {"adadelta",
     {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}},
    {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}},
    {"nce",
     {"Input",
@@ -311,6 +313,11 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
      "SavedMean",
      "SavedVariance",
      "ReserveSpace"}},
    {"adadelta",
     {"ParamOut",
      "AvgSquaredGradOut",
      "AvgSquaredUpdateOut",
      "MasterParamOut"}},
    {"unique", {"Out", "Index", "Indices", "Counts"}},
    {"unique_consecutive", {"Out", "Index", "Counts"}},
    {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
@@ -400,7 +407,11 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
      "MeanGradOut",
      "MasterParamOut"}},
    {"ftrl", {"ParamOut", "SquaredAccumOut", "LinearAccumOut"}},
    {"adadelta",
     {"ParamOut",
      "AvgSquaredGradOut",
      "AvgSquaredUpdateOut",
      "MasterParamOut"}},
    {"adagrad", {"ParamOut", "MomentOut", "MasterParamOut"}},
    {"adamax", {"ParamOut", "MomentOut", "InfNormOut"}},
    {"dpsgd", {"ParamOut"}},
......
@@ -20,13 +20,15 @@
  data_type : x

- op : adadelta_
  args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor master_param, float rho, float epsilon, bool multi_precision)
  output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out)
  infer_meta :
    func : AdadeltaInferMeta
  kernel :
    func : adadelta
    data_type : param
  optional : master_param
  inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out)

- op : adagrad_
  args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon, bool multi_precision)
......
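For orientation, the new signature corresponds to the eager-mode call that the Python optimizers further down emit via `_C_ops.adadelta_`. A minimal sketch, assuming dygraph mode and a float32 parameter; tensor names are illustrative, and `master_param` may be `None` when `multi_precision` is false:

```python
# Illustrative sketch of the new functional signature (not part of the diff).
import paddle
from paddle import _C_ops

param = paddle.randn([4])
grad = paddle.randn([4])
avg_squared_grad = paddle.zeros([4])
avg_squared_update = paddle.zeros([4])

# param and both accumulators are updated in place (see the inplace mapping
# above); master_param is None because multi_precision is False here.
_C_ops.adadelta_(
    param, grad, avg_squared_grad, avg_squared_update,
    None, 0.95, 1.0e-6, False
)
```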
@@ -38,11 +38,14 @@ void AdadeltaInferMeta(const MetaTensor& param,
                       const MetaTensor& grad,
                       const MetaTensor& avg_squared_grad,
                       const MetaTensor& avg_squared_update,
                       const MetaTensor& master_param,
                       float rho,
                       float epsilon,
                       bool multi_precision,
                       MetaTensor* param_out,
                       MetaTensor* avg_squared_grad_out,
                       MetaTensor* avg_squared_update_out,
                       MetaTensor* master_param_out) {
  auto param_dims = param.dims();
  PADDLE_ENFORCE_EQ(
      param_dims,
......
@@ -43,11 +43,14 @@ void AdadeltaInferMeta(const MetaTensor& param,
                       const MetaTensor& grad,
                       const MetaTensor& avg_squared_grad,
                       const MetaTensor& avg_squared_update,
                       const MetaTensor& master_param,
                       float rho,
                       float epsilon,
                       bool multi_precision,
                       MetaTensor* param_out,
                       MetaTensor* avg_squared_grad_out,
                       MetaTensor* avg_squared_update_out,
                       MetaTensor* master_param_outs);

void AdagradInferMeta(const MetaTensor& param,
                      const MetaTensor& grad,
......
@@ -24,10 +24,13 @@ void AdadeltaKernel(const Context& dev_ctx,
                    const DenseTensor& grad,
                    const DenseTensor& avg_squared_grad,
                    const DenseTensor& avg_squared_update,
                    const paddle::optional<DenseTensor>& master_param,
                    float rho,
                    float epsilon,
                    bool multi_precision,
                    DenseTensor* param_out,
                    DenseTensor* avg_squared_grad_out,
                    DenseTensor* avg_squared_update_out,
                    DenseTensor* master_param_outs);

}  // namespace phi
@@ -18,5 +18,10 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h"

PD_REGISTER_KERNEL(adadelta,
                   GPU,
                   ALL_LAYOUT,
                   phi::AdadeltaKernel,
                   float,
                   double,
                   phi::dtype::float16) {}
@@ -14,6 +14,7 @@
#pragma once

#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/adadelta_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -26,40 +27,58 @@ void AdadeltaKernel(const Context& dev_ctx,
                    const DenseTensor& grad,
                    const DenseTensor& avg_squared_grad,
                    const DenseTensor& avg_squared_update,
                    const paddle::optional<DenseTensor>& master_param,
                    float rho,
                    float epsilon,
                    bool multi_precision,
                    DenseTensor* param_out,
                    DenseTensor* avg_squared_grad_out,
                    DenseTensor* avg_squared_update_out,
                    DenseTensor* master_param_outs) {
  using MPDType = typename phi::dtype::template MPTypeTrait<T>::Type;
  dev_ctx.template Alloc<T>(param_out);
  dev_ctx.template Alloc<MPDType>(avg_squared_grad_out);
  dev_ctx.template Alloc<MPDType>(avg_squared_update_out);

  MPDType rho_ = static_cast<MPDType>(rho);
  MPDType epsilon_ = static_cast<MPDType>(epsilon);

  auto eigen_param = EigenVector<T>::Flatten(param);
  auto eigen_grad = EigenVector<T>::Flatten(grad);
  // Squared gradient accumulator
  auto eigen_avg_squared_grad = EigenVector<MPDType>::Flatten(avg_squared_grad);
  // Squared updates accumulator
  auto eigen_avg_squared_update =
      EigenVector<MPDType>::Flatten(avg_squared_update);
  auto eigen_param_out = EigenVector<T>::Flatten(*param_out);
  auto eigen_avg_squared_grad_out =
      EigenVector<MPDType>::Flatten(*avg_squared_grad_out);
  auto eigen_avg_squared_update_out =
      EigenVector<MPDType>::Flatten(*avg_squared_update_out);
  auto& place = *dev_ctx.eigen_device();

  auto eigen_grad_cast = eigen_grad.template cast<MPDType>();

  eigen_avg_squared_grad_out.device(place) =
      rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad_cast.square();
  auto update = -((eigen_avg_squared_update + epsilon_) /
                  (eigen_avg_squared_grad_out + epsilon_))
                     .sqrt() *
                eigen_grad_cast;
  eigen_avg_squared_update_out.device(place) =
      rho_ * eigen_avg_squared_update + (1 - rho_) * update.square();

  if (multi_precision) {
    auto eigen_master_param_out =
        EigenVector<MPDType>::Flatten(*master_param_outs);
    auto eigen_master_param = EigenVector<MPDType>::Flatten(*master_param);

    eigen_master_param_out.device(place) = eigen_master_param + update;
    eigen_param_out.device(place) =
        (eigen_param.template cast<MPDType>() + update).template cast<T>();
  } else {
    eigen_param_out.device(place) = eigen_param + update.template cast<T>();
  }
}

}  // namespace phi
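For reference, the Eigen expressions above implement the standard Adadelta update; with `multi_precision` enabled and `T = float16`, the accumulators, `rho`, `epsilon`, and the update itself stay in `MPDType` (float32), and only the parameter write-back is cast to `T`. A sketch of the update rule (standard Adadelta, stated here for orientation rather than quoted from the patch):

```latex
E[g^2]_t = \rho\, E[g^2]_{t-1} + (1 - \rho)\, g_t^2
\Delta\theta_t = -\sqrt{\frac{E[\Delta\theta^2]_{t-1} + \epsilon}{E[g^2]_t + \epsilon}}\; g_t
E[\Delta\theta^2]_t = \rho\, E[\Delta\theta^2]_{t-1} + (1 - \rho)\, \Delta\theta_t^2
\theta_{t+1} = \theta_t + \Delta\theta_t
```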
@@ -25,11 +25,14 @@ void AdadeltaKernel(const Context& dev_ctx,
                    const DenseTensor& grad,
                    const DenseTensor& avg_squared_grad,
                    const DenseTensor& avg_squared_update,
                    const paddle::optional<DenseTensor>& master_param,
                    float rho,
                    float epsilon,
                    bool multi_precision,
                    DenseTensor* param_out,
                    DenseTensor* avg_squared_grad_out,
                    DenseTensor* avg_squared_update_out,
                    DenseTensor* master_param_outs) {
  dev_ctx.template Alloc<T>(param_out);
  dev_ctx.template Alloc<T>(avg_squared_grad_out);
  dev_ctx.template Alloc<T>(avg_squared_update_out);
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("Grad")) {
    return KernelSignature(
        "adadelta",
        {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"},
        {"rho", "epsilon", "multi_precision"},
        {"ParamOut",
         "AvgSquaredGradOut",
         "AvgSquaredUpdateOut",
         "MasterParamOut"});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(adadelta, phi::AdadeltaOpArgumentMapping);
@@ -3181,14 +3181,87 @@ class AdadeltaOptimizer(Optimizer):
            name=name,
        )
        self.type = "adadelta"
        self._multi_precision = False
        self._master_weights = {}
        self._epsilon = epsilon
        self._rho = rho

    def _create_master_weight(self, param):
        if param.name in self._master_weights:
            var = self._master_weights[param.name]
        else:
            assert isinstance(self.helper, LayerHelper)

            var_name = param.name + '_fp32_master'
            var_name = unique_name.generate(var_name)
            var = paddle.static.create_global_var(
                name=var_name,
                shape=param.shape,
                value=0,
                dtype='float32',
                persistable=True,
            )
            block = self.helper.startup_program.global_block()
            block.append_op(
                type="cast",
                inputs={"X": [param]},
                outputs={"Out": [var]},
                attrs={
                    "in_dtype": param.dtype,
                    "out_dtype": core.VarDesc.VarType.FP32,
                },
            )
            self._master_weights[param.name] = var
        return var

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
        find_master = (
            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        )
        target_param = (
            self._master_weights[param.name] if find_master else param
        )
        target_name = target_param.name
        if (
            name not in self._accumulators
            or target_name not in self._accumulators[name]
        ):
            raise Exception(
                "Accumulator {} does not exist for parameter {}".format(
                    name, target_name
                )
            )
        return self._accumulators[name][target_name]

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        for p in parameters:
            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
                self._add_accumulator(
                    self._avg_squared_update_acc_str, master_p
                )
                continue
            if (
                p.dtype == core.VarDesc.VarType.FP16
                and not self._multi_precision
            ):
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using the multi_precision=True option of the Adadelta optimizer."
                )
            self._add_accumulator(self._avg_squared_grad_acc_str, p)
            self._add_accumulator(self._avg_squared_update_acc_str, p)
@@ -3202,6 +3275,15 @@ class AdadeltaOptimizer(Optimizer):
        avg_squared_update_acc = self._get_accumulator(
            self._avg_squared_update_acc_str, param_and_grad[0]
        )
        find_master = (
            self._multi_precision
            and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        if in_dygraph_mode():
            _C_ops.adadelta_(
@@ -3209,25 +3291,38 @@ class AdadeltaOptimizer(Optimizer):
                param_and_grad[1],
                avg_squared_grad_acc,
                avg_squared_update_acc,
                master_weight,
                self._rho,
                self._epsilon,
                find_master,
            )
        else:
            # Create the adadelta optimizer op
            inputs = {
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "AvgSquaredGrad": avg_squared_grad_acc,
                "AvgSquaredUpdate": avg_squared_update_acc,
            }
            outputs = {
                "ParamOut": param_and_grad[0],
                "AvgSquaredGradOut": avg_squared_grad_acc,
                "AvgSquaredUpdateOut": avg_squared_update_acc,
            }
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight
            adadelta_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs={
                    "epsilon": self._epsilon,
                    "rho": self._rho,
                    "multi_precision": find_master,
                },
                stop_gradient=True,
            )
......
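The legacy-API path added above is exercised by `TestAdadeltaMultiPrecision1_0` in the test diff below; condensed to its core, the flow looks roughly like this sketch (assuming a CUDA build and dygraph mode; not part of the diff):

```python
# Condensed sketch of the legacy fluid optimizer with FP32 master weights.
import paddle

paddle.set_device('gpu')
model = paddle.nn.Linear(2, 2)
opt = paddle.fluid.optimizer.Adadelta(
    learning_rate=0.001, parameter_list=model.parameters()
)
opt._multi_precision = True  # internal flag; FP16 params get FP32 masters

model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
with paddle.amp.auto_cast(level='O2'):
    loss = paddle.mean(model(paddle.randn((2, 2))))
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(opt, scaled)
opt.clear_gradients()
```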
@@ -203,5 +203,281 @@ class TestAdadeltaV2Group(TestAdadeltaV2):
        adam.clear_gradients()


class TestAdadeltaOpMultiPrecison(unittest.TestCase):
    def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False):
        import paddle

        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        input = paddle.randn((5, 5))
        model = paddle.nn.Linear(5, 5)
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=0.01,
            parameters=model.parameters(),
            weight_decay=0.1,
        )
        optimizer._multi_precision = use_amp

        for idx in range(2):
            if place == 'gpu' and use_amp:
                model = paddle.amp.decorate(models=model, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

            if place == 'gpu' and use_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

        paddle.enable_static()

    def _get_places(self):
        import paddle

        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        return places

    def test_main(self):
        for place in self._get_places():
            use_amp_list = [True, False]
            for use_amp in use_amp_list:
                self._test_adadelta_op_dygraph_place_amp(place, use_amp)


class TestAdadeltaMultiPrecision2_0(unittest.TestCase):
    def dygraph_adadelta_mp(self, mp, use_amp):
        paddle.disable_static()
        paddle.seed(100)
        paddle.set_device('gpu')
        input = paddle.randn((2, 2))
        model = paddle.nn.Linear(2, 2)
        optimizer = paddle.optimizer.Adadelta(
            0.5, parameters=model.parameters()
        )
        optimizer._multi_precision = mp
        if use_amp:
            model = paddle.amp.decorate(models=model, level='O2')
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

        for idx in range(5):
            if use_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.minimize(optimizer, scaled)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

        return output, model.parameters()

    def static_adadelta_mp(self, mp, use_amp):
        paddle.enable_static()
        paddle.seed(100)
        np.random.seed(100)
        exe = paddle.static.Executor('gpu')
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.optimizer.Adadelta(0.1)
        optimizer._multi_precision = mp

        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float16'
                )
            else:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float32'
                )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)

        if use_amp:
            optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
            x = np.random.random(size=(2, 2)).astype('float16')
        else:
            x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for idx in range(5):
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
            out.append(loss_data)
        return out

    def test_main(self):
        if not paddle.is_compiled_with_cuda():
            return
        "Test dygraph mode"
        output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True)
        output2_dy, params2_dy = self.dygraph_adadelta_mp(
            use_amp=False, mp=False
        )
        np.testing.assert_allclose(
            output1_dy.astype('float32').numpy(),
            output2_dy.astype('float32').numpy(),
            rtol=1e-05,
            atol=0.1,
        )
        for idx in range(len(params1_dy)):
            np.testing.assert_allclose(
                params1_dy[idx].astype('float32').numpy(),
                params2_dy[idx].astype('float32').numpy(),
                rtol=1e-05,
                atol=0.1,
            )
        "Test static mode"
        output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
        output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
        for idx in range(len(output1_st)):
            np.testing.assert_allclose(
                output1_st[idx].astype('float32'),
                output2_st[idx].astype('float32'),
                rtol=1e-05,
                atol=0.1,
            )


class TestAdadeltaMultiPrecision1_0(unittest.TestCase):
    def dygraph_adadelta_mp(self, use_amp, mp):
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device('gpu')
        input = paddle.randn((2, 2))
        model = paddle.nn.Linear(2, 2)
        optimizer = paddle.fluid.optimizer.Adadelta(
            learning_rate=0.001,
            parameter_list=model.parameters(),
        )
        optimizer._multi_precision = mp
        if use_amp:
            model = paddle.amp.decorate(models=model, level='O2')
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

        for idx in range(5):
            if use_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.minimize(optimizer, scaled)
                optimizer.clear_gradients()
            else:
                output = model(input)
                loss = paddle.mean(output)
                optimizer.minimize(loss)
                optimizer.clear_gradients()

        return output, model.parameters()

    def static_adadelta_mp(self, use_amp, mp):
        paddle.enable_static()
        paddle.seed(100)
        np.random.seed(100)
        exe = paddle.static.Executor('gpu')
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001)
        optimizer._multi_precision = mp

        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float16'
                )
            else:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float32'
                )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)

        if use_amp:
            optimizer.amp_init(place='gpu', scope=paddle.static.global_scope())
            x = np.random.random(size=(2, 2)).astype('float16')
        else:
            x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for idx in range(5):
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
            out.append(loss_data)
        return out

    def test_main(self):
        if not paddle.is_compiled_with_cuda():
            return
        "Test dygraph mode"
        output1_dy, params1_dy = self.dygraph_adadelta_mp(use_amp=True, mp=True)
        output2_dy, params2_dy = self.dygraph_adadelta_mp(
            use_amp=False, mp=False
        )
        np.testing.assert_allclose(
            output1_dy.astype('float32').numpy(),
            output2_dy.astype('float32').numpy(),
            rtol=1e-05,
            atol=0.1,
        )
        for idx in range(len(params1_dy)):
            np.testing.assert_allclose(
                params1_dy[idx].astype('float32').numpy(),
                params2_dy[idx].astype('float32').numpy(),
                rtol=1e-05,
                atol=0.1,
            )
        "Test static mode"
        output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
        output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
        for idx in range(len(output1_st)):
            np.testing.assert_allclose(
                output1_st[idx].astype('float32'),
                output2_st[idx].astype('float32'),
                rtol=1e-05,
                atol=0.1,
            )


if __name__ == "__main__":
    unittest.main()
@@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import paddle

from paddle import _C_ops

from ..fluid import core, framework, unique_name
from ..fluid.dygraph import no_grad
from ..fluid.layer_helper import LayerHelper
from ..framework import in_dygraph_mode
from .optimizer import Optimizer
@@ -130,6 +134,8 @@ class Adadelta(Optimizer):
            grad_clip=grad_clip,
            name=name,
        )
        self._multi_precision = False
        self._master_weights = {}
        self.type = "adadelta"
        self._epsilon = epsilon
        self._rho = rho
@@ -138,6 +144,62 @@ class Adadelta(Optimizer):
            'rho': rho,
        }

    def _create_master_weight(self, param):
        if param.name in self._master_weights:
            var = self._master_weights[param.name]
        else:
            assert isinstance(self.helper, LayerHelper)

            var_name = param.name + "_fp32_master"
            var_name = unique_name.generate(var_name)
            var = paddle.static.create_global_var(
                name=var_name,
                shape=param.shape,
                value=0,
                dtype='float32',
                persistable=True,
            )
            block = self.helper.startup_program.global_block()
            block.append_op(
                type="cast",
                inputs={"X": [param]},
                outputs={"Out": [var]},
                attrs={
                    "in_dtype": param.dtype,
                    "out_dtype": core.VarDesc.VarType.FP32,
                },
            )
            self._master_weights[param.name] = var
        return var

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
        find_master = (
            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        )
        target_param = (
            self._master_weights[param.name] if find_master else param
        )
        target_name = target_param.name
        if (
            name not in self._accumulators
            or target_name not in self._accumulators[name]
        ):
            raise Exception(
                "Accumulator {} does not exist for parameter {}".format(
                    name, target_name
                )
            )
        return self._accumulators[name][target_name]

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")
@@ -145,6 +207,21 @@ class Adadelta(Optimizer):
            parameters = parameters.get('params')

        for p in parameters:
            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
                self._add_accumulator(
                    self._avg_squared_update_acc_str, master_p
                )
                continue
            if (
                p.dtype == core.VarDesc.VarType.FP16
                and not self._multi_precision
            ):
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using the multi_precision=True option of the Adadelta optimizer."
                )
            self._add_accumulator(self._avg_squared_grad_acc_str, p)
            self._add_accumulator(self._avg_squared_update_acc_str, p)
@@ -158,6 +235,15 @@ class Adadelta(Optimizer):
        avg_squared_update_acc = self._get_accumulator(
            self._avg_squared_update_acc_str, param_and_grad[0]
        )
        find_master = (
            self._multi_precision
            and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        if in_dygraph_mode():
            with no_grad():
@@ -166,8 +252,10 @@ class Adadelta(Optimizer):
                    param_and_grad[1],
                    avg_squared_grad_acc,
                    avg_squared_update_acc,
                    master_weight,
                    self._rho,
                    self._epsilon,
                    find_master,
                )
            return None
        else:
@@ -175,20 +263,29 @@ class Adadelta(Optimizer):
                raise TypeError("block is not instance of framework.Block.")

            # Create the adadelta optimizer op
            inputs = {
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "AvgSquaredGrad": avg_squared_grad_acc,
                "AvgSquaredUpdate": avg_squared_update_acc,
            }
            outputs = {
                "ParamOut": param_and_grad[0],
                "AvgSquaredGradOut": avg_squared_grad_acc,
                "AvgSquaredUpdateOut": avg_squared_update_acc,
            }
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight
            adadelta_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs={
                    "epsilon": self._epsilon,
                    "rho": self._rho,
                    "multi_precision": find_master,
                },
                stop_gradient=True,
            )
......
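Taken together with the tests earlier in this diff, the intended usage pattern for the public API looks roughly like the sketch below: decorate the model for AMP level O2 and set the optimizer's internal `_multi_precision` flag (it is not a public constructor argument here), so FP16 parameters get FP32 master weights and FP32 accumulators. A condensed sketch, assuming a CUDA build of Paddle:

```python
# Condensed sketch, not part of the diff; mirrors the new multi-precision tests.
import paddle

paddle.set_device('gpu')
model = paddle.nn.Linear(2, 2)
opt = paddle.optimizer.Adadelta(
    learning_rate=0.1, parameters=model.parameters()
)
opt._multi_precision = True  # internal flag enabling FP32 master weights

model = paddle.amp.decorate(models=model, level='O2')  # casts params to FP16
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.randn((2, 2))
with paddle.amp.auto_cast(level='O2'):
    loss = paddle.mean(model(x))
scaler.scale(loss).backward()
scaler.step(opt)   # unscales grads and runs the multi-precision adadelta op
scaler.update()
opt.clear_grad()
```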