Commit 6ce49eea authored by WangXi, committed by gongweibao

Fix dgc accuracy by mv regularization to local, test=release/1.6 (#21390)

Parent 06545fcf
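With this change, the regularization passed to DGCMomentumOptimizer is applied locally: for DGC-handled gradients, weight decay is folded into the dgc op before allreduce, while the remaining gradients still go through the usual clip/regularization ops (see apply_gradients below). A minimal construction sketch, not part of this patch, with illustrative hyper-parameters:

import paddle.fluid as fluid

# L2Decay maps to regular_type=2 and its coeff to regular_coeff on the dgc op;
# L1Decay maps to regular_type=1, and None to regular_type=0 (no decay).
optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.01,
    momentum=0.9,
    rampup_begin_step=0,
    regularization=fluid.regularizer.L2Decay(1e-4))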
@@ -29,6 +29,9 @@ class DGCOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of DGCop should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasInput("Param"), true,
platform::errors::NotFound("Input(Param) of DGCop is not found."));
PADDLE_ENFORCE(ctx->HasInput("current_step"),
"Input(current_step) of DGCop should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
@@ -66,6 +69,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("U", "(Tensor) U velocity tensor of DGC");
AddInput("V", "(Tensor) V velocity tensor of DGC");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("Param", "(Tensor) Input parameter");
AddInput("current_step", "(Tensor) Current step."); AddInput("current_step", "(Tensor) Current step.");
AddInput("nranks", "(Tensor) nranks."); AddInput("nranks", "(Tensor) nranks.");
...@@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -99,6 +103,16 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, 0.0)" "(float, 0.0)"
"The period when begin k_select."); "The period when begin k_select.");
AddAttr<float>("regular_coeff",
"(float, 0.0)"
"The coeff of regularization, weight decay parameter")
.SetDefault(0.0);
AddAttr<int>("regular_type",
"(int, 0)"
"The type of regularization, {0:None, 1:L1Decay, 2:L2Decay")
.SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
Original paper is https://arxiv.org/abs/1712.01887 Original paper is https://arxiv.org/abs/1712.01887
......
@@ -43,6 +43,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
auto v = ctx.Input<framework::Tensor>("V");
auto g = ctx.Input<framework::Tensor>("Grad");
auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
// attrs
float m = ctx.Attr<float>("m");
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
@@ -55,6 +57,39 @@ class DGCOpKernel : public framework::OpKernel<T> {
const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
// regularization
auto p = ctx.Input<framework::Tensor>("Param");
float regular_coeff = ctx.Attr<float>("regular_coeff");
int regular_type = ctx.Attr<int>("regular_type");
auto p_e = framework::EigenVector<T>::Flatten(*p);
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto grad_out_e = framework::EigenVector<T>::Flatten(*grad_out);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
// NOTE. In Paddle the loss has already been divided by nranks. Since dgc_op
// runs before allreduce, the local regular_coeff would also have to be divided
// by nranks. Instead, the gradient is multiplied by nranks in dgc_op, so
// regular_coeff does not need to be divided by nranks, which avoids precision
// loss: coeff is typically about 1e-4, and with nranks=32, coeff/nranks would
// be 3.125e-6, whose numerical accuracy is too low.
PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2, true,
platform::errors::InvalidArgument(
"DGC only support one of None|L1Decay|L2Decay "
"Regularization for now."));
if (regular_type == 0) {
grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e;
} else if (regular_type == 1) {
// L1Decay. grad = grad + coeff * sign(param)
grad_out_e.device(eigen_ctx) =
(1.0 * nranks) * g_e + regular_coeff * p_e.sign();
} else if (regular_type == 2) {
// L2Decay. grad = grad + coeff * param
grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e + regular_coeff * p_e;
}
// current step
auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
const float* current_step = current_step_tensor->data<float>();
@@ -91,19 +126,17 @@ class DGCOpKernel : public framework::OpKernel<T> {
// FIXME(gongwb): use cublas.
auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
auto u_e = framework::EigenVector<T>::Flatten(*u);
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
// calc local momentum from global momentum
if (static_cast<int>(*current_step) ==
    static_cast<int>(rampup_begin_step)) {
  u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
}
// NOTE. If grad were not multiplied by nranks, the code below would need to
// be added back.
// if (static_cast<int>(*current_step) ==
//     static_cast<int>(rampup_begin_step)) {
//   u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
// }
if (use_nesterov) {
// u = m * (u + g)
u_out_e.device(eigen_ctx) = m * (u_e + g_e);
u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e);
// v = u + v + g
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
@@ -113,7 +146,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
ctx, g, v, 0, AddFunctor<T>(), v_out);
} else {
// u = m * u + g
u_out_e.device(eigen_ctx) = m * u_e + g_e;
u_out_e.device(eigen_ctx) = m * u_e + grad_out_e;
// v = u + v
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
@@ -138,7 +171,6 @@ class DGCOpKernel : public framework::OpKernel<T> {
LOG(FATAL) << "v_out numel:" << v_out->numel();
}
auto grad_out = ctx.Output<framework::Tensor>("Grad_out");
math::SetConstant<DeviceContext, T> tset;
tset(dev_ctx, grad_out, static_cast<T>(0));
}
......
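For reference, the gradient the kernel above writes to Grad_out can be summarized as follows. This NumPy sketch is illustrative only (the helper name is not from the patch) and mirrors the three regular_type branches:

import numpy as np

def dgc_regularized_grad(grad, param, nranks, regular_type, regular_coeff):
    # grad is scaled by nranks here and divided back in dgc_momentum, so
    # regular_coeff itself never has to be divided by nranks.
    g = nranks * grad
    if regular_type == 1:    # L1Decay: grad = grad + coeff * sign(param)
        return g + regular_coeff * np.sign(param)
    if regular_type == 2:    # L2Decay: grad = grad + coeff * param
        return g + regular_coeff * param
    return g                 # regular_type == 0: no regularization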
@@ -27,13 +27,20 @@ class DGCMomentumOp : public MomentumOp {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
"current_step should be set.");
PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
platform::errors::NotFound(
"Input(nranks) of DGCMomentumOp is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Grad_out"), true,
platform::errors::NotFound(
"Output(Grad_out) of DGCMomentumOp is not found."));
return MomentumOp::InferShape(ctx);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "current_step") {
if (var_name == "current_step" || var_name == "nranks") {
VLOG(10) << "var_name:" << var_name << " need not to transform";
return expected_kernel_type;
}
@@ -47,6 +54,10 @@ class DGCMomentumOpMaker : public MomentumOpMaker {
public:
void Make() override {
AddInput("current_step", "(Tensor) Current step.");
AddInput("nranks", "(Tensor) The number of trainers.");
AddOutput("Grad_out", "(Tensor) Output grad gradient");
AddAttr<float>("rampup_begin_step", AddAttr<float>("rampup_begin_step",
"(float, -1.0)" "(float, -1.0)"
"The period when begin DGC.") "The period when begin DGC.")
......
@@ -38,6 +38,26 @@ class DGCMomentumKernel : public framework::OpKernel<T> {
auto current_step_tensor = context.Input<framework::Tensor>("current_step");
auto* current_step = current_step_tensor->data<T>();
// nranks
auto nranks_tensor = context.Input<framework::Tensor>("nranks");
const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
PADDLE_ENFORCE_GT(
nranks, 1,
platform::errors::InvalidArgument(
"DGC is not useful when num_trainers <= 1, but now nranks=%d",
nranks));
const framework::Tensor* g = context.Input<framework::Tensor>("Grad");
framework::Tensor* g_out = context.Output<framework::Tensor>("Grad_out");
auto g_e = framework::EigenVector<T>::Flatten(*g);
auto g_out_e = framework::EigenVector<T>::Flatten(*g_out);
auto& dev_ctx = context.template device_context<DeviceContext>();
auto& eigen_ctx = *dev_ctx.eigen_device();
// NOTE. In dgc_op the gradient is multiplied by nranks, so it must be divided by nranks here.
g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e;
VLOG(10) << "current_step:" << *current_step
<< ", rampup_begin_step:" << rampup_begin_step;
......
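The dgc_momentum kernel above undoes that scaling: the gradient arriving from the dgc op was multiplied by nranks, so it is divided by nranks before the update. A rough NumPy reference of the non-Nesterov path checked by the unit test further below (the helper name is illustrative):

import numpy as np

def dgc_momentum_step(param, grad, velocity, learning_rate, mu, nranks):
    g = grad / nranks                       # undo the nranks scaling from dgc_op
    velocity_out = mu * velocity + g        # momentum accumulation
    param_out = param - learning_rate * velocity_out
    sgd_out = param - learning_rate * g     # plain SGD path before rampup_begin_step
    return param_out, velocity_out, sgd_out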
@@ -966,6 +966,22 @@ class DGCMomentumOptimizer(Optimizer):
self._clip_norm = local_grad_clip_norm / (num_trainers *
num_trainers)
self._get_dgc_regularization_param()
def _get_dgc_regularization_param(self):
self.regular_coeff = 0.0
self.regular_type = 0
if self.regularization is not None:
self.regular_coeff = self.regularization._regularization_coeff
from .regularizer import L1Decay, L2Decay
if isinstance(self.regularization, L1Decay):
self.regular_type = 1
elif isinstance(self.regularization, L2Decay):
self.regular_type = 2
else:
assert False, 'regularization must be None|L1Decay|L2Decay'
def _is_use_dgc(self, param_var, grad_var):
var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
if var_numel < 16384 or \
@@ -997,7 +1013,11 @@ class DGCMomentumOptimizer(Optimizer):
type = "momentum"
else:
type = "dgc_momentum"
inputs.update({"current_step": self._global_step_var})
inputs.update({
"current_step": self._global_step_var,
"nranks": self._nranks_var
})
outputs.update({'Grad_out': param_and_grad[1]})
attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})
# create the dgc momentum optimize op
@@ -1160,12 +1180,14 @@ class DGCMomentumOptimizer(Optimizer):
encoded_var, gather_var):
block = framework.default_main_program().global_block()
op_maker = core.op_proto_and_checker_maker
dgc_op = block.append_op(
type="dgc",
inputs={
"U": u_var,
"V": v_var,
"Grad": clip_var,
"Param": param_var,
"current_step": self._global_step_var, "current_step": self._global_step_var,
"nranks": self._nranks_var, "nranks": self._nranks_var,
}, },
...@@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer): ...@@ -1183,6 +1205,8 @@ class DGCMomentumOptimizer(Optimizer):
"use_nesterov": self._use_nesterov, "use_nesterov": self._use_nesterov,
"rampup_begin_step": float(self._rampup_begin_step), "rampup_begin_step": float(self._rampup_begin_step),
"rampup_step": float(self._rampup_step), "rampup_step": float(self._rampup_step),
"regular_coeff": float(self.regular_coeff),
"regular_type": int(self.regular_type),
},
stop_gradient=True)
@@ -1191,6 +1215,37 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
def apply_gradients(self, params_grads):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
not_dgc_params_grads = []
dgc_params_grads = []
for param, grad in params_grads:
if not self._is_use_dgc(param, grad):
not_dgc_params_grads.append((param, grad))
else:
dgc_params_grads.append((param, grad))
# DGC clip and regularization in local
not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
# Add regularization if any
not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
params_grads = not_dgc_params_grads + dgc_params_grads
params_grads = sorted(params_grads, key=lambda x: x[0].name)
optimize_ops = self._create_optimization_pass(params_grads)
if table_optimize_op is not None:
optimize_ops.append(table_optimize_op)
params_grads.append(table_param_and_grad)
return optimize_ops
class LarsMomentumOptimizer(Optimizer):
"""
......
@@ -34,16 +34,19 @@ class TestDGCMomentumOp1(unittest.TestCase):
self.op_type = "dgc_momentum"
self.dtype = np.float32
nranks_val = 2
param = np.random.random((123, 321)).astype(self.dtype)
grad = np.random.random((123, 321)).astype(self.dtype)
velocity = np.zeros((123, 321)).astype(self.dtype)
learning_rate = np.array([0.001]).astype(self.dtype)
current_step = np.full((1), step).astype("float32")
nranks = np.full((1), nranks_val).astype("float32")
mu = 0.0001
use_nesterov = False
rampup_begin_step = 10.0
# get tensor
self.param_name, self.param_tensor = self.get_tensor('Param', param)
self.grad_name, self.grad_tensor = self.get_tensor('Grad', grad)
self.velocity_name, self.velocity_tensor = self.get_tensor('Velocity',
@@ -52,6 +55,8 @@ class TestDGCMomentumOp1(unittest.TestCase):
'LearningRate', learning_rate)
self.current_step_name, self.current_step_tensor = self.get_tensor(
'current_step', current_step, core.CPUPlace())
self.nranks_name, self.nranks_tensor = self.get_tensor('nranks', nranks,
core.CPUPlace())
self.kwargs = {
# inputs
@@ -60,6 +65,7 @@ class TestDGCMomentumOp1(unittest.TestCase):
'Velocity': self.velocity_name,
'LearningRate': self.learning_rate_name,
'current_step': self.current_step_name,
'nranks': self.nranks_name,
# attrs
'mu': mu,
@@ -68,17 +74,18 @@ class TestDGCMomentumOp1(unittest.TestCase):
# outputs
'ParamOut': self.param_name,
'VelocityOut': self.velocity_name,
'Grad_out': self.grad_name,
}
velocity_out = mu * velocity + grad
velocity_out = mu * velocity + grad / nranks
if use_nesterov:
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
sgd_out = param - learning_rate * grad
sgd_out = param - learning_rate * grad / nranks
self.outputs = {
'ParamOut': param_out,
......
@@ -44,6 +44,9 @@ class TestDGCOp(unittest.TestCase):
self.grad_name = "Grad"
self.grad = np.random.random(size).astype("float32")
self.param_name = "Param"
self.param = np.random.random(size).astype("float32")
self.current_step_name = "current_step"
self.current_step = np.full((1), 0.0).astype("float32")
@@ -66,6 +69,9 @@ class TestDGCOp(unittest.TestCase):
self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
self.grad_tensor.set(self.grad, place)
self.param_tensor = self.scope.var(self.param_name).get_tensor()
self.param_tensor.set(self.param, place)
self.current_step_tensor = self.scope.var(
self.current_step_name).get_tensor()
self.current_step_tensor.set(self.current_step, core.CPUPlace())
@@ -96,6 +102,7 @@ class TestDGCOp(unittest.TestCase):
'U': self.u_name,
'V': self.v_name,
'Grad': self.grad_name,
'Param': self.param_name,
'current_step': self.current_step_name,
'nranks': self.nranks_name,
@@ -113,6 +120,8 @@ class TestDGCOp(unittest.TestCase):
'use_nesterov': True,
'rampup_begin_step': float(0.0),
'rampup_step': float(10.0),
'regular_coeff': float(1e-4),
'regular_type': int(2),
}
dgc_op = Operator('dgc', **kwargs)
......
@@ -18,6 +18,7 @@ import unittest
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
import paddle.compat as cpt
from paddle.fluid.backward import append_backward
from paddle.fluid.transpiler.details import program_to_code
@@ -31,7 +32,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
def get_velocity_str(self):
return self._u_velocity_acc_str
def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"):
def check_dgc_momentum_optimizer(self,
dims=[5, 10, 8],
name="momentum",
regularization=None):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
@@ -58,8 +62,12 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
learning_rate = 0.01
dgc_momentum_optimizer = self.MockDGCMomentum(
learning_rate=learning_rate, momentum=0.2, rampup_begin_step=0)
learning_rate=learning_rate,
momentum=0.2,
rampup_begin_step=0,
regularization=regularization)
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
@@ -96,12 +104,15 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
program_to_code(program, fout=f)
def test_momentum_without_dgc(self):
self.check_dgc_momentum_optimizer()
self.check_dgc_momentum_optimizer(
regularization=regularizer.L1Decay(1e-4))
def test_momentum_with_dgc(self):
# 16 * 1024 = 16384, use dgc momentum
self.check_dgc_momentum_optimizer(
dims=[16, 1024, 8], name="dgc_momentum")
dims=[16, 1024, 8],
name="dgc_momentum",
regularization=regularizer.L2Decay(1e-4))
if __name__ == '__main__':
......