提交 8ac7687e 编写于 作者: W WangXi 提交者: gongweibao

Fix dgc accuracy by mv regularization to local (#21278)

上级 b9f8ae84
......@@ -29,6 +29,9 @@ class DGCOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of DGCop should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasInput("Param"), true,
platform::errors::NotFound("Input(Param) of DGCop is not found."));
PADDLE_ENFORCE(ctx->HasInput("current_step"),
"Input(current_step) of DGCop should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
......@@ -66,6 +69,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("U", "(Tensor) U velocity tensor of DGC");
AddInput("V", "(Tensor) V velocity tensor of DGC");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("Param", "(Tensor) Input parameter");
AddInput("current_step", "(Tensor) Current step.");
AddInput("nranks", "(Tensor) nranks.");
......@@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, 0.0)"
"The period when begin k_select.");
AddAttr<float>("regular_coeff",
"(float, 0.0)"
"The coeff of regularization, weight decay parameter")
.SetDefault(0.0);
AddAttr<int>("regular_type",
"(int, 0)"
"The type of regularization, {0:None, 1:L1Decay, 2:L2Decay")
.SetDefault(0);
AddComment(R"DOC(
Original paper is https://arxiv.org/abs/1712.01887
......
......@@ -43,6 +43,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
auto v = ctx.Input<framework::Tensor>("V");
auto g = ctx.Input<framework::Tensor>("Grad");
auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
// attrs
float m = ctx.Attr<float>("m");
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
......@@ -55,6 +57,39 @@ class DGCOpKernel : public framework::OpKernel<T> {
const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
// regularization
auto p = ctx.Input<framework::Tensor>("Param");
float regular_coeff = ctx.Attr<float>("regular_coeff");
int regular_type = ctx.Attr<int>("regular_type");
auto p_e = framework::EigenVector<T>::Flatten(*p);
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto grad_out_e = framework::EigenVector<T>::Flatten(*grad_out);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
// NOTE: In Paddle, the loss has already been divided by nranks. Because
// dgc_op runs before allreduce, the local regular_coeff would need to be
// divided by nranks too. However, we now multiply grad by nranks in
// dgc_op, so regular_coeff does not need to be divided by nranks, which
// avoids precision loss. The coeff is often 1e-4; with nranks=32,
// coeff/nranks would be 3.125e-6, and the numerical accuracy of
// coeff/nranks would be too low.
PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2, true,
platform::errors::InvalidArgument(
"DGC only support one of None|L1Decay|L2Decay "
"Regularization for now."));
if (regular_type == 0) {
grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e;
} else if (regular_type == 1) {
// L1Decay. grad = grad + coeff * sign(param)
grad_out_e.device(eigen_ctx) =
(1.0 * nranks) * g_e + regular_coeff * p_e.sign();
} else if (regular_type == 2) {
// L2Decay. grad = grad + coeff * param
grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e + regular_coeff * p_e;
}
// current step
auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
const float* current_step = current_step_tensor->data<float>();
......@@ -91,19 +126,17 @@ class DGCOpKernel : public framework::OpKernel<T> {
// FIXME(gongwb): use cublas.
auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
auto u_e = framework::EigenVector<T>::Flatten(*u);
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
if (static_cast<int>(*current_step) ==
static_cast<int>(rampup_begin_step)) {
// calc local momentum from global momentum
u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
}
// calc local momentum from global momentum
// NOTE: If grad is not multiplied by nranks, the code below needs to be added.
// if (static_cast<int>(*current_step) ==
// static_cast<int>(rampup_begin_step)) {
// u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
// }
if (use_nesterov) {
// u = m * (u + g)
u_out_e.device(eigen_ctx) = m * (u_e + g_e);
u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e);
// v = u + v + g
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
......@@ -113,7 +146,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
ctx, g, v, 0, AddFunctor<T>(), v_out);
} else {
// u = m * u + g
u_out_e.device(eigen_ctx) = m * u_e + g_e;
u_out_e.device(eigen_ctx) = m * u_e + grad_out_e;
// v = u + v
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
......@@ -138,7 +171,6 @@ class DGCOpKernel : public framework::OpKernel<T> {
LOG(FATAL) << "v_out numel:" << v_out->numel();
}
auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
math::SetConstant<DeviceContext, T> tset;
tset(dev_ctx, grad_out, static_cast<T>(0));
}
......
......@@ -27,13 +27,20 @@ class DGCMomentumOp : public MomentumOp {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
"current_step should be set.");
PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
platform::errors::NotFound(
"Input(nranks) of DGCMomentumOp is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
platform::errors::NotFound(
"Output(Grad_out) of DGCMomentumOp is not found."));
return MomentumOp::InferShape(ctx);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "current_step") {
if (var_name == "current_step" || var_name == "nranks") {
VLOG(10) << "var_name:" << var_name << " need not to transform";
return expected_kernel_type;
}
......@@ -47,6 +54,10 @@ class DGCMomentumOpMaker : public MomentumOpMaker {
public:
void Make() override {
AddInput("current_step", "(Tensor) Current step.");
AddInput("nranks", "(Tensor) The number of trainers.");
AddOutput("Grad_out", "(Tensor) Output grad gradient");
AddAttr<float>("rampup_begin_step",
"(float, -1.0)"
"The period when begin DGC.")
......
......@@ -38,6 +38,26 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
auto current_step_tensor = context.Input<framework::Tensor>("current_step");
auto* current_step = current_step_tensor->data<T>();
// nranks
auto nranks_tensor = context.Input<framework::Tensor>("nranks");
const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
PADDLE_ENFORCE_GT(
nranks, 1,
platform::errors::InvalidArgument(
"DGC is not useful when num_trainers <= 1, but now nranks=%d",
nranks));
const framework::Tensor* g = context.Input<framework::Tensor>("Grad");
framework::Tensor* g_out = context.Output<framework::Tensor>("Grad_out");
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto g_out_e = framework::EigenVector<T>::Flatten(*g_out);
auto& dev_ctx = context.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
// NOTE: dgc_op multiplies grad by nranks, so we need to divide by nranks here.
g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e;
VLOG(10) << "current_step:" << *current_step
<< ", rampup_begin_step:" << rampup_begin_step;
......
......@@ -966,6 +966,22 @@ class DGCMomentumOptimizer(Optimizer):
self._clip_norm = local_grad_clip_norm / (num_trainers *
num_trainers)
self._get_dgc_regularization_param()
def _get_dgc_regularization_param(self):
    """Derive the dgc op's regularization attributes from ``self.regularization``.

    Sets two attributes on ``self``:

    * ``regular_coeff`` -- the regularizer's ``_regularization_coeff``, or
      0.0 when no regularizer is configured.
    * ``regular_type`` -- 0 for no regularization, 1 for ``L1Decay``,
      2 for ``L2Decay``.

    Raises:
        AssertionError: if ``self.regularization`` is set but is neither an
            ``L1Decay`` nor an ``L2Decay`` instance.
    """
    self.regular_coeff = 0.0
    self.regular_type = 0

    if self.regularization is not None:
        self.regular_coeff = self.regularization._regularization_coeff
        from .regularizer import L1Decay, L2Decay
        if isinstance(self.regularization, L1Decay):
            self.regular_type = 1
        elif isinstance(self.regularization, L2Decay):
            self.regular_type = 2
        else:
            # Raise explicitly instead of `assert False`: a bare assert is
            # removed under `python -O`, which would leave regular_type at 0
            # while regular_coeff is already set. The exception type is
            # unchanged for callers.
            raise AssertionError(
                'regularization must be None|L1Decay|L2Decay')
def _is_use_dgc(self, param_var, grad_var):
var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
if var_numel < 16384 or \
......@@ -997,7 +1013,11 @@ class DGCMomentumOptimizer(Optimizer):
type = "momentum"
else:
type = "dgc_momentum"
inputs.update({"current_step": self._global_step_var})
inputs.update({
"current_step": self._global_step_var,
"nranks": self._nranks_var
})
outputs.update({'Grad_out': param_and_grad[1]})
attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})
# create the dgc momentum optimize op
......@@ -1160,12 +1180,14 @@ class DGCMomentumOptimizer(Optimizer):
encoded_var, gather_var):
block = framework.default_main_program().global_block()
op_maker = core.op_proto_and_checker_maker
dgc_op = block.append_op(
type="dgc",
inputs={
"U": u_var,
"V": v_var,
"Grad": clip_var,
"Param": param_var,
"current_step": self._global_step_var,
"nranks": self._nranks_var,
},
......@@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer):
"use_nesterov": self._use_nesterov,
"rampup_begin_step": float(self._rampup_begin_step),
"rampup_step": float(self._rampup_step),
"regular_coeff": float(self.regular_coeff),
"regular_type": int(self.regular_type),
},
stop_gradient=True)
......@@ -1191,6 +1215,37 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
def apply_gradients(self, params_grads):
    """Append clip, regularization and optimization ops for ``params_grads``.

    The (param, grad) pairs are split by ``self._is_use_dgc``. Only the
    pairs that do not use DGC receive gradient-clip and regularization ops
    here; per the original comment, DGC clip and regularization are done
    locally. Both groups are then merged, ordered by parameter name, and
    handed to ``self._create_optimization_pass``.

    Args:
        params_grads: iterable of (param, grad) pairs.

    Returns:
        The optimize ops from ``self._create_optimization_pass``, with the
        distributed lookup-table optimize op appended when there is one.
    """
    by_param_name = lambda pair: pair[0].name

    ordered_pairs = sorted(params_grads, key=by_param_name)
    ordered_pairs, table_param_and_grad, table_optimize_op = \
        self._process_distribute_lookuptable(ordered_pairs)

    plain_pairs, dgc_pairs = [], []
    for param, grad in ordered_pairs:
        bucket = dgc_pairs if self._is_use_dgc(param, grad) else plain_pairs
        bucket.append((param, grad))

    # DGC clip and regularization happen locally, so only the non-DGC pairs
    # get clip ops and (if configured) regularization ops here.
    plain_pairs = append_gradient_clip_ops(plain_pairs)
    plain_pairs = append_regularization_ops(plain_pairs, self.regularization)

    merged_pairs = sorted(plain_pairs + dgc_pairs, key=by_param_name)
    optimize_ops = self._create_optimization_pass(merged_pairs)

    if table_optimize_op is not None:
        optimize_ops.append(table_optimize_op)
        merged_pairs.append(table_param_and_grad)

    return optimize_ops
class LarsMomentumOptimizer(Optimizer):
"""
......
......@@ -34,16 +34,19 @@ class TestDGCMomentumOp1(unittest.TestCase):
self.op_type = "dgc_momentum"
self.dtype = np.float32
nranks_val = 2
param = np.random.random((123, 321)).astype(self.dtype)
grad = np.random.random((123, 321)).astype(self.dtype)
velocity = np.zeros((123, 321)).astype(self.dtype)
learning_rate = np.array([0.001]).astype(self.dtype)
current_step = np.full((1), step).astype("float32")
nranks = np.full((1), nranks_val).astype("float32")
mu = 0.0001
use_nesterov = False
rampup_begin_step = 10.0
# get tensor
self.param_name, self.param_tensor = self.get_tensor('Param', param)
self.grad_name, self.grad_tensor = self.get_tensor('Grad', grad)
self.velocity_name, self.velocity_tensor = self.get_tensor('Velocity',
......@@ -52,6 +55,8 @@ class TestDGCMomentumOp1(unittest.TestCase):
'LearningRate', learning_rate)
self.current_step_name, self.current_step_tensor = self.get_tensor(
'current_step', current_step, core.CPUPlace())
self.nranks_name, self.nranks_tensor = self.get_tensor('nranks', nranks,
core.CPUPlace())
self.kwargs = {
# inputs
......@@ -60,6 +65,7 @@ class TestDGCMomentumOp1(unittest.TestCase):
'Velocity': self.velocity_name,
'LearningRate': self.learning_rate_name,
'current_step': self.current_step_name,
'nranks': self.nranks_name,
# attrs
'mu': mu,
......@@ -68,17 +74,18 @@ class TestDGCMomentumOp1(unittest.TestCase):
# outputs
'ParamOut': self.param_name,
'VelocityOut': self.velocity_name
'VelocityOut': self.velocity_name,
'Grad_out': self.grad_name,
}
velocity_out = mu * velocity + grad
velocity_out = mu * velocity + grad / nranks
if use_nesterov:
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
sgd_out = param - learning_rate * grad
sgd_out = param - learning_rate * grad / nranks
self.outputs = {
'ParamOut': param_out,
......
......@@ -44,6 +44,9 @@ class TestDGCOp(unittest.TestCase):
self.grad_name = "Grad"
self.grad = np.random.random(size).astype("float32")
self.param_name = "Param"
self.param = np.random.random(size).astype("float32")
self.current_step_name = "current_step"
self.current_step = np.full((1), 0.0).astype("float32")
......@@ -66,6 +69,9 @@ class TestDGCOp(unittest.TestCase):
self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
self.grad_tensor.set(self.grad, place)
self.param_tensor = self.scope.var(self.param_name).get_tensor()
self.param_tensor.set(self.param, place)
self.current_step_tensor = self.scope.var(
self.current_step_name).get_tensor()
self.current_step_tensor.set(self.current_step, core.CPUPlace())
......@@ -96,6 +102,7 @@ class TestDGCOp(unittest.TestCase):
'U': self.u_name,
'V': self.v_name,
'Grad': self.grad_name,
'Param': self.param_name,
'current_step': self.current_step_name,
'nranks': self.nranks_name,
......@@ -113,6 +120,8 @@ class TestDGCOp(unittest.TestCase):
'use_nesterov': True,
'rampup_begin_step': float(0.0),
'rampup_step': float(10.0),
'regular_coeff': float(1e-4),
'regular_type': int(2),
}
dgc_op = Operator('dgc', **kwargs)
......
......@@ -18,6 +18,7 @@ import unittest
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
import paddle.compat as cpt
from paddle.fluid.backward import append_backward
from paddle.fluid.transpiler.details import program_to_code
......@@ -31,7 +32,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
def get_velocity_str(self):
return self._u_velocity_acc_str
def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"):
def check_dgc_momentum_optimizer(self,
dims=[5, 10, 8],
name="momentum",
regularization=None):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
......@@ -58,8 +62,12 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
learning_rate = 0.01
dgc_momentum_optimizer = self.MockDGCMomentum(
learning_rate=learning_rate, momentum=0.2, rampup_begin_step=0)
learning_rate=learning_rate,
momentum=0.2,
rampup_begin_step=0,
regularization=regularization)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
......@@ -96,12 +104,15 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
program_to_code(program, fout=f)
def test_momentum_without_dgc(self):
self.check_dgc_momentum_optimizer()
self.check_dgc_momentum_optimizer(
regularization=regularizer.L1Decay(1e-4))
def test_momentum_with_dgc(self):
# 16 * 1024 = 16384, use dgc momentum
self.check_dgc_momentum_optimizer(
dims=[16, 1024, 8], name="dgc_momentum")
dims=[16, 1024, 8],
name="dgc_momentum",
regularization=regularizer.L2Decay(1e-4))
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册