Unverified · Commit 5d039f40 authored by JZ-LIANG, committed by GitHub

Modified the implementation of the LARS optimizer (#26733)

Add LARS to the fleet meta optimizer.
Parent: a1b640bc
@@ -52,6 +52,8 @@ message DGCConfig {
 message LarsConfig {
   optional float lars_coeff = 1 [ default = 0.001 ];
   optional float lars_weight_decay = 2 [ default = 0.0005 ];
+  optional float epsilon = 3 [ default = 0.0 ];
+  repeated string exclude_from_weight_decay = 4;
 }

 message LambConfig {
...
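The two new proto fields surface in Python as keys of `strategy.lars_configs`. A minimal sketch of that mapping, based on the updated unit test at the end of this diff:

```python
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.lars = True
strategy.lars_configs = {
    "lars_coeff": 0.001,          # LarsConfig.lars_coeff
    "lars_weight_decay": 0.0005,  # LarsConfig.lars_weight_decay
    "epsilon": 0,                 # new: LarsConfig.epsilon
    "exclude_from_weight_decay": ["batch_norm", ".b"],  # new: LarsConfig.exclude_from_weight_decay
}
```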
@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("lars_weight_decay",
                    "(float, default 0.0005) LARS weight decay")
         .SetDefault(0.0005);
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0) epsilon to avoid Division by Zero.")
+        .SetDefault(0.0);
     AddComment(R"DOC(
 Lars Momentum Optimizer.
...
@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                    const T* learning_rate, const T mu,
                                    const int64_t num, const T lars_coeff,
                                    const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out) {
+                                   const T* g_norm, T* p_out, T* v_out,
+                                   const T epsilon) {
   T lr = learning_rate[0];
   T local_lr = learning_rate[0];
   CUDA_KERNEL_LOOP(i, num) {
-    if (p_norm[0] > 0 && g_norm[0] > 0) {
+    if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
       local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0]);
+                 (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
     }
     T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
     v_out[i] = v_new;
     p_out[i] = p[i] - v_new;
@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     T lars_coeff = ctx.Attr<float>("lars_coeff");
     T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+    T epsilon = ctx.Attr<float>("epsilon");

     auto* p = param->data<T>();
     auto* v = velocity->data<T>();
@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     eg_norm.device(*place) = eigen_g.square().sum().sqrt();
     MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
         p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out);
+        p_norm_data, g_norm_data, p_out, v_out, epsilon);
   }
 };
...
@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     T lars_coeff = ctx.Attr<float>("lars_coeff");
     T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+    T epsilon = ctx.Attr<float>("epsilon");

     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     ep_norm = p.square().sum().sqrt();
     eg_norm = g.square().sum().sqrt();
     T local_lr = lr[0];
-    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
+    if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
       local_lr = lr[0] * lars_coeff * ep_norm(0) /
-                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
+                 (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
     }
     v_out = v * mu + local_lr * (g + lars_weight_decay * p);
     p_out = p - v_out;
...
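Taken together, the CPU and CUDA kernels above implement the following per-parameter update. A minimal NumPy sketch of that rule, for illustration only (hypothetical variable names, not the Paddle implementation):

```python
import numpy as np

def lars_momentum_step(p, g, v, lr, mu, lars_coeff, lars_weight_decay, epsilon):
    """One LARS momentum update, mirroring the kernel logic in this diff."""
    p_norm = np.linalg.norm(p)
    g_norm = np.linalg.norm(g)
    local_lr = lr
    # This commit adds the `lars_weight_decay > 0` guard and the epsilon term.
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        local_lr = lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    v_new = mu * v + local_lr * (g + lars_weight_decay * p)
    p_new = p - v_new
    return p_new, v_new

# Example: one step on a random parameter tensor.
rng = np.random.default_rng(0)
p = rng.standard_normal(8).astype(np.float32)
g = rng.standard_normal(8).astype(np.float32)
v = np.zeros_like(p)
p, v = lars_momentum_step(p, g, v, lr=0.01, mu=0.9,
                          lars_coeff=0.001, lars_weight_decay=0.0005,
                          epsilon=1e-8)
```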
@@ -91,6 +91,10 @@ class LambOptimizer(MetaOptimizerBase):
         return self.lamb_opt.backward(loss, startup_program, parameter_list,
                                       no_grad_set, callbacks)

+    # the following function will be used by AMP if both LAMB and AMP are turned on together.
+    def apply_gradients(self, params_grads):
+        return self.lamb_opt.apply_gradients(params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
...
@@ -44,13 +44,16 @@ class LarsOptimizer(MetaOptimizerBase):
             parameter_list=opt._parameter_list,
             regularization=opt.regularization,
             grad_clip=opt._grad_clip,
-            name=opt._name)
+            name=opt._name,
+            exclude_from_weight_decay=configs['exclude_from_weight_decay'],
+            epsilon=configs['epsilon'])

     def _can_apply(self):
         if self.user_defined_strategy.lars:
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn(
-                    "lars need the inner optimizer to be Momentum optimizer.")
+                    "lars need the inner optimizer to be Momentum optimizer but got {}.".
+                    format(self.inner_opt.type))
                 return False
             return True
         return False
@@ -75,6 +78,10 @@ class LarsOptimizer(MetaOptimizerBase):
         return self.lars_opt.backward(loss, startup_program, parameter_list,
                                       no_grad_set, callbacks)

+    # the following function will be used by AMP if both LARS and AMP are turned on together.
+    def apply_gradients(self, params_grads):
+        return self.lars_opt.apply_gradients(params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
...
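The `apply_gradients` forwarding above is what lets AMP drive the wrapped LARS optimizer when both features are enabled. A hedged sketch of combining them, condensed from `test_lars_apply_with_amp` at the end of this diff (only a subset of `amp_configs` is shown, and `avg_cost` is assumed to be a loss built elsewhere):

```python
import paddle
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

strategy = fleet.DistributedStrategy()
strategy.amp = True
strategy.amp_configs = {"init_loss_scaling": 32768,
                        "use_dynamic_loss_scaling": True}
strategy.lars = True
strategy.lars_configs = {"lars_coeff": 0.001, "lars_weight_decay": 0.0005,
                         "epsilon": 0, "exclude_from_weight_decay": [".b"]}

# The inner optimizer must be Momentum, otherwise _can_apply() rejects LARS.
inner = paddle.fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(inner, strategy=strategy)
# optimizer.minimize(avg_cost)  # emits 'lars_momentum' plus AMP ops such as 'cast'
```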
@@ -1604,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer):
        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}

-       & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
+       & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)

        & param = param - velocity

@@ -1628,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer):
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
+        exclude_from_weight_decay (list[str], optional): Name strings of the layers to be excluded from LARS weight decay. Default is None.
+        epsilon (float, optional): Epsilon to avoid division by zero when calculating the local lr. Default is 0.

     Examples:
         .. code-block:: python

@@ -1659,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer):
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
-                 name=None):
+                 name=None,
+                 exclude_from_weight_decay=None,
+                 epsilon=0):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
@@ -1672,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer):
         self._momentum = momentum
         self._lars_coeff = float(lars_coeff)
         self._lars_weight_decay = float(lars_weight_decay)
+        self._epsilon = float(epsilon)
+        if exclude_from_weight_decay is None:
+            self._exclude_from_weight_decay = []
+        else:
+            self._exclude_from_weight_decay = exclude_from_weight_decay

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -1682,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer):
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        _lars_weight_decay = self._lars_weight_decay
+        param_name = param_and_grad[0].name
+        if len(self._exclude_from_weight_decay) > 0:
+            for name in self._exclude_from_weight_decay:
+                if name in param_name:
+                    _lars_weight_decay = 0.0
+                    break
+
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
         # create the momentum optimize op
@@ -1700,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer):
             attrs={
                 "mu": self._momentum,
                 "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": self._lars_weight_decay
+                "lars_weight_decay": _lars_weight_decay,
+                "epsilon": self._epsilon
             },
             stop_gradient=True)
...
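For orientation, a hedged, self-contained usage sketch of the updated `LarsMomentumOptimizer` API. The toy network, the random data, and the parameter name `fc_0.b_0` passed to `exclude_from_weight_decay` are assumptions for illustration (not taken from this commit), and the snippet assumes a static-graph Paddle build from the same era as this change:

```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)  # typically creates fc_0.w_0 / fc_0.b_0
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(input=pred, label=y))

optimizer = fluid.optimizer.LarsMomentumOptimizer(
    learning_rate=0.001,
    momentum=0.9,
    lars_weight_decay=0.0005,
    exclude_from_weight_decay=["fc_0.b_0"],  # substring match disables weight decay for the bias
    epsilon=1e-8)
optimizer.minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
feed = {"x": np.random.rand(4, 13).astype('float32'),
        "y": np.random.rand(4, 1).astype('float32')}
exe.run(fluid.default_main_program(), feed=feed, fetch_list=[loss])
```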
@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker

 class TestFleetLambMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"

     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
@@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)

-        ops_with_bias = [
+        ops_without_wd = [
             op for op in avg_cost.block.ops
             if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
         ]
-        for op in ops_with_bias:
+        for op in ops_without_wd:
             self.assertEqual(op.attr('weight_decay'), 0)

+    def test_lamb_apply_with_amp(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.amp = True
+        strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "decr_every_n_nan_or_inf": 2,
+            "incr_every_n_steps": 1000,
+            "incr_ratio": 2.0,
+            "use_dynamic_loss_scaling": True,
+            "decr_ratio": 0.5,
+            "custom_white_list": ['softmax'],
+            "custom_black_list": ['tanh'],
+        }
+        strategy.lamb = True
+        strategy.lamb_configs = {
+            'lamb_weight_decay': 0.01,
+            'exclude_from_weight_decay': [],
+        }
+
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lamb', ops)
+        self.assertIn('cast', ops)
+        self.assertIn('isfinite', ops)
+

 if __name__ == "__main__":
     unittest.main()
@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker

 class TestFleetLarsMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"

     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
@@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
             strategy.lars_configs = {
                 "lars_coeff": 0.001,
                 "lars_weight_decay": 0.0005,
+                "epsilon": 0,
+                "exclude_from_weight_decay": ["batch_norm", ".b"],
             }

         return avg_cost, strategy
@@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('lars_momentum', ops)

+    def test_lars_exclude_fn(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops_without_wd = [
+            op for op in avg_cost.block.ops
+            if op.type == 'lars_momentum' and ("batch_norm" in op.attr(
+                'op_role_var')[0] or ".b" in op.attr('op_role_var')[0])
+        ]
+        for op in ops_without_wd:
+            self.assertEqual(op.attr('lars_weight_decay'), 0)
+
+    def test_lars_apply_with_amp(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.amp = True
+        strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "decr_every_n_nan_or_inf": 2,
+            "incr_every_n_steps": 1000,
+            "incr_ratio": 2.0,
+            "use_dynamic_loss_scaling": True,
+            "decr_ratio": 0.5,
+            "custom_white_list": ['softmax'],
+            "custom_black_list": ['tanh'],
+        }
+        strategy.lars = True
+        strategy.lars_configs = {
+            "lars_coeff": 0.001,
+            "lars_weight_decay": 0.0005,
+            "epsilon": 0,
+            "exclude_from_weight_decay": ["batch_norm", ".b"],
+        }
+
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lars_momentum', ops)
+        self.assertIn('cast', ops)
+        self.assertIn('isfinite', ops)
+

 if __name__ == "__main__":
     unittest.main()