未验证 提交 d8d73ff3 编写于 作者: Q Qiyang Min 提交者: GitHub

Merge pull request #15584 from velconia/imperative_lr_scheduler

Support imperative learning rate scheduler
...@@ -32,6 +32,9 @@ from .profiler import * ...@@ -32,6 +32,9 @@ from .profiler import *
from . import checkpoint from . import checkpoint
from .checkpoint import * from .checkpoint import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
...@@ -39,3 +42,4 @@ __all__ += nn.__all__ ...@@ -39,3 +42,4 @@ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__ __all__ += profiler.__all__
__all__ += checkpoint.__all__ __all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
from .. import unique_name
__all__ = [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
class LearningRateDecay(object):
    """
    Base class of learning rate decay.

    An instance is callable. Each call evaluates ``step()`` for the current
    ``step_num``, wraps a plain Python number in a learning-rate variable,
    and then advances ``step_num`` by ``step_size``.

    Args:
        begin: Initial value of the step counter.
        step: Amount added to the step counter after every call.
        dtype: dtype of the variables built by ``create_lr_var``.
    """

    def __init__(self, begin=0, step=1, dtype='float32'):
        self.step_num = begin
        self.step_size = step
        self.dtype = dtype

    def __call__(self):
        """Return the learning rate for the current step, then advance."""
        lr = self.step()
        # A subclass may return a bare Python number instead of a variable.
        # Integers are wrapped too, so callers always get a variable back
        # from a numeric schedule (previously only ``float`` was wrapped).
        if isinstance(lr, (int, float)):
            lr = self.create_lr_var(lr)
        self.step_num += self.step_size
        return lr

    def create_lr_var(self, lr):
        """Create a persistable global variable of shape [1] holding ``lr``.

        Args:
            lr: A value convertible with ``float()``.

        Returns:
            The variable returned by ``layers.create_global_var``.
        """
        # Imported at call time rather than at module import.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .. import layers
        lr = layers.create_global_var(
            name=unique_name.generate("learning_rate"),
            shape=[1],
            value=float(lr),
            dtype=self.dtype,
            persistable=True)
        return lr

    def step(self):
        """Compute the learning rate for ``self.step_num``.

        Subclasses must override this.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError()
class PiecewiseDecay(LearningRateDecay):
    """
    Piecewise-constant learning rate.

    One learning-rate variable is created per entry of ``values`` at
    construction time. ``step()`` returns the variable paired with the first
    boundary that is greater than the current step, or the last variable
    when no boundary is greater.

    Args:
        boundaries: Sequence of step thresholds.
        values: Sequence of learning-rate values, one per interval.
        begin: Initial step counter. Defaults to 0, matching the other
            schedulers in this module.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created learning-rate variables.
    """

    def __init__(self, boundaries, values, begin=0, step=1, dtype='float32'):
        super(PiecewiseDecay, self).__init__(begin, step, dtype)
        self.boundaries = boundaries
        self.values = values
        # Build every candidate variable once, so step() only selects one.
        self.vars = [self.create_lr_var(value) for value in values]

    def step(self):
        """Return the pre-built variable for the current step."""
        for i, boundary in enumerate(self.boundaries):
            if self.step_num < boundary:
                return self.vars[i]
        # Past every boundary: use the final value.
        return self.vars[-1]
class NaturalExpDecay(LearningRateDecay):
    """
    Natural exponential decay.

    Computes ``learning_rate * exp(-decay_rate * (step_num / decay_steps))``.
    With ``staircase=True`` the quotient is floored first.

    Args:
        learning_rate: Initial learning rate.
        decay_steps: Divisor applied to the step counter.
        decay_rate: Decay coefficient.
        staircase: Whether to floor ``step_num / decay_steps``.
        begin: Initial step counter.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 decay_rate,
                 staircase=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(NaturalExpDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def step(self):
        """Return the decayed learning rate for the current step."""
        from .. import layers
        # float() forces true division. This module does not enable
        # `from __future__ import division`, so int / int would floor under
        # Python 2 and silently turn the smooth schedule into a staircase.
        div_res = self.create_lr_var(
            float(self.step_num) / self.decay_steps)
        if self.staircase:
            div_res = layers.floor(div_res)
        decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate *
                                                     div_res)
        return decayed_lr
class ExponentialDecay(LearningRateDecay):
    """
    Exponential decay.

    Computes ``learning_rate * decay_rate ** (step_num / decay_steps)``.
    With ``staircase=True`` the quotient is floored first.

    Args:
        learning_rate: Initial learning rate.
        decay_steps: Divisor applied to the step counter.
        decay_rate: Base of the exponential.
        staircase: Whether to floor ``step_num / decay_steps``.
        begin: Initial step counter.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 decay_rate,
                 staircase=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(ExponentialDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def step(self):
        """Return the decayed learning rate for the current step."""
        from .. import layers
        # float() forces true division. This module does not enable
        # `from __future__ import division`, so int / int would floor under
        # Python 2 and silently turn the smooth schedule into a staircase.
        div_res = self.create_lr_var(
            float(self.step_num) / self.decay_steps)
        if self.staircase:
            div_res = layers.floor(div_res)
        decayed_lr = self.learning_rate * (self.decay_rate**div_res)
        return decayed_lr
class InverseTimeDecay(LearningRateDecay):
    """
    Inverse time decay.

    Computes
    ``learning_rate / (1 + decay_rate * (step_num / decay_steps))``.
    With ``staircase=True`` the quotient is floored first.

    Args:
        learning_rate: Initial learning rate.
        decay_steps: Divisor applied to the step counter.
        decay_rate: Decay coefficient.
        staircase: Whether to floor ``step_num / decay_steps``.
        begin: Initial step counter.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 decay_rate,
                 staircase=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(InverseTimeDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def step(self):
        """Return the decayed learning rate for the current step."""
        from .. import layers
        # float() forces true division. This module does not enable
        # `from __future__ import division`, so int / int would floor under
        # Python 2 and silently turn the smooth schedule into a staircase.
        div_res = self.create_lr_var(
            float(self.step_num) / self.decay_steps)
        if self.staircase:
            div_res = layers.floor(div_res)
        decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res)
        return decayed_lr
class PolynomialDecay(LearningRateDecay):
    """
    Polynomial decay from ``learning_rate`` towards ``end_learning_rate``.

    Without ``cycle`` the step counter is clipped at ``decay_steps``. With
    ``cycle`` the horizon is ``decay_steps`` multiplied by
    ``ceil(step_num / decay_steps)`` (taken as 1 at step 0).

    Args:
        learning_rate: Initial learning rate.
        decay_steps: Length of one decay horizon, in steps.
        end_learning_rate: Learning rate reached at the end of the horizon.
        power: Exponent of the polynomial.
        cycle: Whether to extend the horizon instead of clipping the step.
        begin: Initial step counter.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 end_learning_rate=0.0001,
                 power=1.0,
                 cycle=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(PolynomialDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle

    def step(self):
        """Return the decayed learning rate for the current step."""
        from .. import layers
        cur_step = self.step_num
        horizon = self.decay_steps
        if not self.cycle:
            # Clip the counter so the rate stays at end_learning_rate once
            # the horizon has been reached.
            if cur_step < self.decay_steps:
                clipped = cur_step
            else:
                clipped = self.decay_steps
            cur_step = self.create_lr_var(clipped)
        else:
            ratio = cur_step / float(self.decay_steps)
            n_cycles = layers.ceil(self.create_lr_var(ratio))
            # ceil(0) would give a zero-length horizon at the first step.
            if cur_step == 0:
                n_cycles = self.create_lr_var(1.0)
            horizon = self.decay_steps * n_cycles

        lr_span = self.learning_rate - self.end_learning_rate
        remaining = 1 - cur_step / horizon
        return lr_span * (remaining**self.power) + self.end_learning_rate
class CosineDecay(LearningRateDecay):
    """
    Cosine decay evaluated once per epoch.

    Computes ``learning_rate * 0.5 * (cos(epoch * pi / epochs) + 1)`` where
    ``epoch = floor(step_num / step_each_epoch)``.

    Args:
        learning_rate: Initial learning rate.
        step_each_epoch: Number of steps in one epoch.
        epochs: Total number of epochs.
        begin: Initial step counter.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self,
                 learning_rate,
                 step_each_epoch,
                 epochs,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(CosineDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.step_each_epoch = step_each_epoch
        self.epochs = epochs

    def step(self):
        """Return the decayed learning rate for the current step."""
        from .. import layers
        epoch_ratio = self.step_num / self.step_each_epoch
        cur_epoch = layers.floor(self.create_lr_var(epoch_ratio))
        half_lr = self.learning_rate * 0.5
        cos_term = layers.cos(cur_epoch * math.pi / self.epochs)
        return half_lr * (cos_term + 1)
class NoamDecay(LearningRateDecay):
    """
    Noam decay.

    Computes
    ``d_model**-0.5 * min(step_num**-0.5, warmup_steps**-1.5 * step_num)``.

    Args:
        d_model: Model dimension used to scale the rate.
        warmup_steps: Number of warm-up steps.
        begin: Initial step counter. Defaults to 1 here.
        step: Increment applied to the step counter after each call.
        dtype: dtype of the created variables.
    """

    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
        super(NoamDecay, self).__init__(begin, step, dtype)
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def step(self):
        """Return the learning rate for the current step."""
        from .. import layers
        inv_sqrt_step = self.create_lr_var(self.step_num**-0.5)
        warmup_term = self.create_lr_var(
            (self.warmup_steps**-1.5) * self.step_num)
        scale = self.d_model**-0.5
        return scale * layers.elementwise_min(inv_sqrt_step, warmup_term)
...@@ -22,13 +22,16 @@ strategy according to this module. ...@@ -22,13 +22,16 @@ strategy according to this module.
from __future__ import print_function from __future__ import print_function
import math
from . import control_flow from . import control_flow
from . import nn from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter, unique_name, name_scope from ..framework import default_main_program, Parameter, unique_name, name_scope
import math from ..dygraph import base as imperative_base
from ..dygraph import learning_rate_scheduler as imperate_lr
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
...@@ -66,13 +69,17 @@ def noam_decay(d_model, warmup_steps): ...@@ -66,13 +69,17 @@ def noam_decay(d_model, warmup_steps):
The decayed learning rate. The decayed learning rate.
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1) if imperative_base.enabled():
decay = imperate_lr.NoamDecay(d_model, warmup_steps)
return decay
else:
global_step = _decay_step_counter(1)
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
return lr_value return lr_value
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -112,14 +119,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -112,14 +119,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.ExponentialDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res) decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -141,14 +153,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -141,14 +153,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
The decayed learning rate The decayed learning rate
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.NaturalExpDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -187,15 +204,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -187,15 +204,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.InverseTimeDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
...@@ -227,27 +249,33 @@ def polynomial_decay(learning_rate, ...@@ -227,27 +249,33 @@ def polynomial_decay(learning_rate,
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.PolynomialDecay(learning_rate, decay_steps,
if cycle: end_learning_rate, power, cycle)
div_res = ops.ceil(global_step / decay_steps) return decay
zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch:
with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else: else:
decay_steps_var = tensor.fill_constant( global_step = _decay_step_counter()
shape=[1], dtype='float32', value=float(decay_steps))
global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) if cycle:
div_res = ops.ceil(global_step / decay_steps)
zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch:
with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = nn.elementwise_min(
x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
...@@ -279,34 +307,38 @@ def piecewise_decay(boundaries, values): ...@@ -279,34 +307,38 @@ def piecewise_decay(boundaries, values):
if len(values) - len(boundaries) != 1: if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1") raise ValueError("len(values) - len(boundaries) should be 1")
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.PiecewiseDecay(boundaries, values, 0)
return decay
else:
global_step = _decay_step_counter()
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
persistable=True, persistable=True,
name="learning_rate") name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(boundaries[i]), value=float(values[len(values) - 1]))
force_cpu=True) with switch.default():
value_var = tensor.fill_constant( tensor.assign(last_value_var, lr)
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr
def cosine_decay(learning_rate, step_each_epoch, epochs): def cosine_decay(learning_rate, step_each_epoch, epochs):
...@@ -336,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): ...@@ -336,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
learning_rate = base_lr, step_each_epoch=10000, epochs=120) learning_rate = base_lr, step_each_epoch=10000, epochs=120)
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,
epochs)
return decay
else:
global_step = _decay_step_counter()
cur_epoch = ops.floor(global_step / step_each_epoch) cur_epoch = ops.floor(global_step / step_each_epoch)
decayed_lr = learning_rate * 0.5 * ( decayed_lr = learning_rate * 0.5 * (
ops.cos(cur_epoch * math.pi / epochs) + 1) ops.cos(cur_epoch * math.pi / epochs) + 1)
return decayed_lr return decayed_lr
def append_LARS(params_grads, learning_rate, weight_decay): def append_LARS(params_grads, learning_rate, weight_decay):
...@@ -363,6 +400,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -363,6 +400,9 @@ def append_LARS(params_grads, learning_rate, weight_decay):
/ (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
""" """
assert not imperative_base.enabled(
), "append_LARS is NOT supported in dygraph mode now"
def _balanced_weight(param_norm, grad_norm): def _balanced_weight(param_norm, grad_norm):
if weight_decay == 1.0: if weight_decay == 1.0:
return grad_norm + param_norm return grad_norm + param_norm
......
...@@ -30,6 +30,8 @@ from .initializer import Constant ...@@ -30,6 +30,8 @@ from .initializer import Constant
from .layer_helper import LayerHelper from .layer_helper import LayerHelper
from .layers import ops from .layers import ops
from .regularizer import append_regularization_ops from .regularizer import append_regularization_ops
from .dygraph import base as imperative_base
from .dygraph.learning_rate_scheduler import LearningRateDecay
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layers import tensor from paddle.fluid.layers import tensor
from functools import reduce from functools import reduce
...@@ -53,9 +55,19 @@ class Optimizer(object): ...@@ -53,9 +55,19 @@ class Optimizer(object):
""" """
def __init__(self, learning_rate, regularization=None, name=None): def __init__(self, learning_rate, regularization=None, name=None):
if not isinstance(learning_rate, float) and \ if framework._in_dygraph_mode():
not isinstance(learning_rate, framework.Variable): if not isinstance(learning_rate, float) and \
raise TypeError("learning rate should be float or Variable") not isinstance(learning_rate, LearningRateDecay):
raise TypeError(
"learning rate should be float or LearningRateDecay, got %s here"
% type(learning_rate))
else:
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
raise TypeError(
"learning rate should be float or Variable, got %s here" %
type(learning_rate))
self._name = name self._name = name
self.regularization = regularization self.regularization = regularization
self._learning_rate = learning_rate self._learning_rate = learning_rate
...@@ -79,24 +91,49 @@ class Optimizer(object): ...@@ -79,24 +91,49 @@ class Optimizer(object):
return self._opti_name_list return self._opti_name_list
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
lr = self._global_learning_rate() if imperative_base.enabled():
# create learning rate Variable
if isinstance(self._learning_rate, float):
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable): if isinstance(lr, framework.Variable):
return return
else: else:
if not isinstance(self._learning_rate, float): self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
# get learning rate Variable from LearningRateDecay
elif isinstance(self._learning_rate, LearningRateDecay):
self._learning_rate_map[framework.default_main_program(
)] = self._learning_rate()
else:
raise TypeError( raise TypeError(
"learning rate variable is create outside optimizer," "optimizer's learning rate must be float or LearningRateDecay"
"can not create new learning rate variable for new program") )
else:
lr = self._global_learning_rate()
# create learning rate in the current main program if isinstance(lr, framework.Variable):
self._learning_rate_map[framework.default_main_program( return
)] = layers.create_global_var( else:
name=unique_name.generate("learning_rate"), if not isinstance(self._learning_rate, float):
shape=[1], raise TypeError(
value=float(self._learning_rate), "learning rate variable is create outside optimizer,"
dtype='float32' if self._dtype is None else self._dtype, "can not create new learning rate variable for new program"
persistable=True) )
# create learning rate in the current main program
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
def _global_learning_rate(self, program=None): def _global_learning_rate(self, program=None):
""" """
...@@ -605,10 +642,10 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -605,10 +642,10 @@ class DGCMomentumOptimizer(MomentumOptimizer):
DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
This optimizer will do two things: This optimizer will do two things:
1. Compress the gradient by get TopK import value from tensor \ 1. Compress the gradient by get TopK import value from tensor \
and use it for allreduce to reduce network bandwidth. and use it for allreduce to reduce network bandwidth.
2. Call momentum to optimize on the cost. 2. Call momentum to optimize on the cost.
Args: Args:
......
...@@ -78,7 +78,7 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) ...@@ -78,7 +78,7 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
...@@ -89,7 +89,7 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) ...@@ -89,7 +89,7 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
FLAGS_cudnn_deterministic=1) FLAGS_cudnn_deterministic=1)
py_test_modules(test_imperative_optimizer MODULES test_imperative_optimizer ENVS py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
FLAGS_cudnn_deterministic=1) FLAGS_cudnn_deterministic=1)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.dygraph.Layer):
    """A Conv2D layer followed by a Pool2D layer.

    Args:
        name_scope: Name scope passed to the base ``Layer``.
        num_channels, num_filters, filter_size: Convolution shape arguments.
        pool_size, pool_stride, pool_padding, pool_type, global_pooling:
            Arguments forwarded to ``Pool2D``.
        conv_stride, conv_padding, conv_dilation, conv_groups:
            Arguments forwarded to ``Conv2D``.
        act: Accepted but not used by this layer (see note in ``__init__``).
        use_cudnn: Forwarded to both sub-layers.
        param_attr, bias_attr: Forwarded to ``Conv2D``.
    """

    def __init__(self,
                 name_scope,
                 num_channels,
                 num_filters,
                 filter_size,
                 pool_size,
                 pool_stride,
                 pool_padding=0,
                 pool_type='max',
                 global_pooling=False,
                 conv_stride=1,
                 conv_padding=0,
                 conv_dilation=1,
                 conv_groups=1,
                 act=None,
                 use_cudnn=False,
                 param_attr=None,
                 bias_attr=None):
        super(SimpleImgConvPool, self).__init__(name_scope)

        # NOTE(review): `act` is accepted but never forwarded, so callers
        # passing act="relu" get no activation. Left unchanged here because
        # forwarding it would alter the numerics this test compares.
        self._conv2d = Conv2D(
            self.full_name(),
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=conv_stride,
            padding=conv_padding,
            dilation=conv_dilation,
            groups=conv_groups,
            # Forward the caller's attributes instead of hard-coded None.
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn)

        self._pool2d = Pool2D(
            self.full_name(),
            pool_size=pool_size,
            pool_type=pool_type,
            pool_stride=pool_stride,
            pool_padding=pool_padding,
            global_pooling=global_pooling,
            use_cudnn=use_cudnn)

    def forward(self, inputs):
        """Apply the convolution and then the pooling to ``inputs``."""
        x = self._conv2d(inputs)
        x = self._pool2d(x)
        return x
class MNIST(fluid.dygraph.Layer):
    """Two conv-pool stages followed by a 10-way softmax FC layer."""

    def __init__(self, name_scope):
        super(MNIST, self).__init__(name_scope)

        self._simple_img_conv_pool_1 = SimpleImgConvPool(
            self.full_name(), 1, 20, 5, 2, 2, act="relu")
        self._simple_img_conv_pool_2 = SimpleImgConvPool(
            self.full_name(), 20, 50, 5, 2, 2, act="relu")

        # Standard deviation of the FC weight initializer, derived from the
        # flattened size used below and the number of output classes.
        flat_features = 50 * 4 * 4
        num_classes = 10
        init_std = (2.0 / (flat_features**2 * num_classes))**0.5

        fc_param_attr = fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=init_std))
        self._fc = FC(self.full_name(),
                      num_classes,
                      param_attr=fc_param_attr,
                      act="softmax")

    def forward(self, inputs):
        """Run ``inputs`` through both conv-pool stages and the FC layer."""
        hidden = self._simple_img_conv_pool_1(inputs)
        hidden = self._simple_img_conv_pool_2(hidden)
        return self._fc(hidden)
class TestImperativeMnist(unittest.TestCase):
    """Compares dygraph training of ``MNIST`` with static-graph training."""

    def test_mnist_float32(self):
        """Train one epoch in both modes with the same seed and compare the
        initial parameters, the last loss and the final parameters."""
        seed = 90
        epoch_num = 1
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mnist = MNIST("mnist")
            sgd = SGDOptimizer(learning_rate=1e-3)
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            dy_param_init_value = {}
            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader()):
                    dy_x_data = np.array(
                        [x[0].reshape(1, 28, 28)
                         for x in data]).astype('float32')
                    y_data = np.array(
                        [x[1] for x in data]).astype('int64').reshape(128, 1)

                    img = to_variable(dy_x_data)
                    label = to_variable(y_data)
                    label._stop_gradient = True

                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)

                    dy_out = avg_loss._numpy()

                    # Snapshot the parameters before the first update.
                    if epoch == 0 and batch_id == 0:
                        for param in mnist.parameters():
                            dy_param_init_value[param.name] = param._numpy()

                    avg_loss._backward()
                    sgd.minimize(avg_loss)
                    mnist.clear_gradients()

                    dy_param_value = {}
                    for param in mnist.parameters():
                        dy_param_value[param.name] = param._numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            mnist = MNIST("mnist")
            sgd = SGDOptimizer(learning_rate=1e-3)
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            img = fluid.layers.data(
                name='pixel', shape=[1, 28, 28], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            cost = mnist(img)
            loss = fluid.layers.cross_entropy(cost, label)
            avg_loss = fluid.layers.mean(loss)
            sgd.minimize(avg_loss)

            # initialize params and fetch them
            static_param_name_list = [
                param.name for param in mnist.parameters()
            ]

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            static_param_init_value = dict(zip(static_param_name_list, out))

            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader()):
                    static_x_data = np.array(
                        [x[0].reshape(1, 28, 28)
                         for x in data]).astype('float32')
                    y_data = np.array(
                        [x[1] for x in data]).astype('int64').reshape([128, 1])

                    fetch_list = [avg_loss.name]
                    fetch_list.extend(static_param_name_list)
                    out = exe.run(
                        fluid.default_main_program(),
                        feed={"pixel": static_x_data,
                              "label": y_data},
                        fetch_list=fetch_list)

                    # out[0] is the loss; the rest follow the fetch order of
                    # static_param_name_list.
                    static_out = out[0]
                    static_param_value = dict(
                        zip(static_param_name_list, out[1:]))

        # Compare the last input batches element-wise. The previous check
        # compared `x.all()` of each array, i.e. two scalars, and could not
        # detect differing inputs.
        self.assertTrue(np.allclose(dy_x_data, static_x_data))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))

        self.assertTrue(np.allclose(static_out, dy_out))

        for key, value in six.iteritems(static_param_value):
            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
if __name__ == '__main__':
unittest.main()
...@@ -22,131 +22,71 @@ import six ...@@ -22,131 +22,71 @@ import six
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.dygraph.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, def __init__(self, name_scope, param_attr=None, bias_attr=None):
name_scope, super(MLP, self).__init__(name_scope)
num_channels,
num_filters,
filter_size,
pool_size,
pool_stride,
pool_padding=0,
pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
act=None,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=None,
bias_attr=None,
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn)
def forward(self, inputs): self._fc1 = FC(self.full_name(), 10)
x = self._conv2d(inputs) self._fc2 = FC(self.full_name(), 10)
x = self._pool2d(x)
return x
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool( def forward(self, inputs):
self.full_name(), 1, 20, 5, 2, 2, act="relu") y = self._fc1(inputs)
y = self._fc2(y)
self._simple_img_conv_pool_2 = SimpleImgConvPool( return y
self.full_name(), 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs): class TestImperativeOptimizerBase(unittest.TestCase):
x = self._simple_img_conv_pool_1(inputs) def setUp(self):
x = self._simple_img_conv_pool_2(x) self.batch_num = 20
x = self._fc(x)
return x
def get_optimizer(self):
raise NotImplementedError()
class TestDygraphMnist(unittest.TestCase): def _check_mlp(self):
def test_mnist_float32(self):
seed = 90 seed = 90
epoch_num = 1
with fluid.dygraph.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist") mlp = MLP('mlp')
sgd = SGDOptimizer(learning_rate=1e-3) optimizer = self.get_optimizer()
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True) paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
dy_param_init_value = {} dy_param_init_value = {}
for epoch in range(epoch_num): for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(train_reader()): if batch_id >= self.batch_num:
dy_x_data = np.array( break
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32') dy_x_data = np.array(
y_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
[x[1] for x in data]).astype('int64').reshape(128, 1) y_data = np.array([x[1] for x in data]).astype('int64').reshape(
128, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data) img = to_variable(dy_x_data)
label._stop_gradient = True label = to_variable(y_data)
label._stop_gradient = True
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label) cost = mlp(img)
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss._numpy()
dy_out = avg_loss._numpy()
if batch_id == 0:
if epoch == 0 and batch_id == 0: for param in mlp.parameters():
for param in mnist.parameters(): dy_param_init_value[param.name] = param._numpy()
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
avg_loss._backward() optimizer.minimize(avg_loss)
sgd.minimize(avg_loss) mlp.clear_gradients()
mnist.clear_gradients() dy_param_value = {}
for param in mlp.parameters():
dy_param_value = {} dy_param_value[param.name] = param._numpy()
for param in mnist.parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope(): with new_program_scope():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
...@@ -155,23 +95,22 @@ class TestDygraphMnist(unittest.TestCase): ...@@ -155,23 +95,22 @@ class TestDygraphMnist(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist") mlp = MLP('mlp')
sgd = SGDOptimizer(learning_rate=1e-3) optimizer = self.get_optimizer()
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True) paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data( img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32') name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = mnist(img) cost = mlp(img)
loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.reduce_mean(cost)
avg_loss = fluid.layers.mean(loss) optimizer.minimize(avg_loss)
sgd.minimize(avg_loss)
# initialize params and fetch them # initialize params and fetch them
static_param_init_value = {} static_param_init_value = {}
static_param_name_list = [] static_param_name_list = []
for param in mnist.parameters(): for param in mlp.parameters():
static_param_name_list.append(param.name) static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(), out = exe.run(fluid.default_startup_program(),
...@@ -180,29 +119,26 @@ class TestDygraphMnist(unittest.TestCase): ...@@ -180,29 +119,26 @@ class TestDygraphMnist(unittest.TestCase):
for i in range(len(static_param_name_list)): for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i] static_param_init_value[static_param_name_list[i]] = out[i]
for epoch in range(epoch_num): for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(train_reader()): if batch_id >= self.batch_num:
static_x_data = np.array( break
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32') static_x_data = np.array(
y_data = np.array( [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
[x[1] for x in data]).astype('int64').reshape([128, 1]) y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[128, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list) fetch_list = [avg_loss.name]
out = exe.run( fetch_list.extend(static_param_name_list)
fluid.default_main_program(), out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data, feed={"pixel": static_x_data,
"label": y_data}, "label": y_data},
fetch_list=fetch_list) fetch_list=fetch_list)
static_param_value = {} static_param_value = {}
static_out = out[0] static_out = out[0]
for i in range(1, len(out)): for i in range(1, len(out)):
static_param_value[static_param_name_list[i - 1]] = out[ static_param_value[static_param_name_list[i - 1]] = out[i]
i]
self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
for key, value in six.iteritems(static_param_init_value): for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key])) self.assertTrue(np.allclose(value, dy_param_init_value[key]))
...@@ -210,7 +146,92 @@ class TestDygraphMnist(unittest.TestCase): ...@@ -210,7 +146,92 @@ class TestDygraphMnist(unittest.TestCase):
self.assertTrue(np.allclose(static_out, dy_out)) self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value): for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) self.assertTrue(np.allclose(value, dy_param_value[key]))
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by a piecewise-constant
    learning rate schedule."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.piecewise_decay``.

        The schedule uses boundaries 3, 6 and 9 and one more value than
        there are boundaries; the values start at 0.1 and each following
        one is scaled by a further factor of 0.1.
        """
        boundaries = [3, 6, 9]
        values = [0.1 * (0.1**i) for i in range(len(boundaries) + 1)]
        schedule = fluid.layers.piecewise_decay(
            boundaries=boundaries, values=values)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by a natural exponential
    learning rate schedule."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.natural_exp_decay`` (base rate 0.1, decay rate 0.5,
        10000 decay steps, staircase mode enabled).
        """
        schedule = fluid.layers.natural_exp_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by an exponential
    learning rate schedule."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.exponential_decay`` (base rate 0.1, decay rate 0.5,
        10000 decay steps, staircase mode enabled).
        """
        schedule = fluid.layers.exponential_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with Adam driven by an inverse-time
    learning rate schedule."""

    def get_optimizer(self):
        """Return an Adam optimizer whose learning rate follows
        ``fluid.layers.inverse_time_decay`` (base rate 0.1, decay rate 0.5,
        10000 decay steps, staircase mode enabled).
        """
        schedule = fluid.layers.inverse_time_decay(
            learning_rate=0.1,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True)
        return Adam(learning_rate=schedule)

    def test_adam(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by a polynomial learning
    rate schedule, once with cycling enabled and once without."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.polynomial_decay`` (base rate 0.1, 5 decay steps).

        The ``cycle`` argument is read from ``self.cycle``, which each test
        method assigns before running the check.
        """
        schedule = fluid.layers.polynomial_decay(
            learning_rate=0.1, decay_steps=5, cycle=self.cycle)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd_cycle(self):
        # Cycling variant: the flag must be set before the check runs.
        self.cycle = True
        self._check_mlp()

    def test_sgd(self):
        # Non-cycling variant.
        self.cycle = False
        self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by a cosine learning rate
    schedule."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.cosine_decay`` (base rate 0.1, 10000 steps per
        epoch, 120 epochs).
        """
        schedule = fluid.layers.cosine_decay(
            learning_rate=0.1, step_each_epoch=10000, epochs=120)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
    """Runs the shared MLP check with SGD driven by a Noam learning rate
    schedule."""

    def get_optimizer(self):
        """Return an SGD optimizer whose learning rate follows
        ``fluid.layers.noam_decay`` (``d_model`` of 512, 8000 warmup steps).
        """
        schedule = fluid.layers.noam_decay(d_model=512, warmup_steps=8000)
        return SGDOptimizer(learning_rate=schedule)

    def test_sgd(self):
        # Delegates to the check inherited from the base class.
        self._check_mlp()
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册