Unverified commit eeda90d6, authored by MRXLT and committed by GitHub

[WIP] update optimizer for 2.0 (#26288)

refine Optimizer/Adam/Adamax/RMSProp && add AdamW

* bug fix

* update comment

* unify arguments place; notest

* fix ut, test=develop

* bug fix

* fix conflicts, test=develop

* add examples code

* bug fix

* fix comments

* fix sample code

* add sample code for Optimizer

* add adamax ut, test=develop

* fix rmsprop ut, test=develop

* add ut for optimizer.py and adamw.py

* remove TestAdamOptimizerBetaVariable

* update api && add ut

* update doc && fix ut

* add ut
Co-authored-by: mapingshuo <mps2012@yeah.net>
Parent e2b82e04
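Reviewer note: the core of the 2.0 API change can be read off the new tests below. Optimizers now live under paddle.optimizer, take parameters= instead of parameter_list=, accept weight_decay=, and expose step()/clear_gradients() in dygraph. A minimal sketch mirroring those tests (the input shape, layer size, and hyperparameters are illustrative only):

import numpy as np
import paddle

paddle.disable_static()                                  # dygraph (imperative) mode
x = paddle.to_tensor(np.arange(26).reshape(2, 13).astype("float32"))
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01,
                             parameters=linear.parameters(),  # 2.0: parameters, not parameter_list
                             weight_decay=0.01)
loss = paddle.mean(linear(x))
loss.backward()
adam.step()               # apply one optimizer update
adam.clear_gradients()    # reset gradients for the next iteration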
......@@ -40,6 +40,7 @@ from paddle.fluid.layers import tensor
from functools import reduce
from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
import paddle
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
......@@ -3690,7 +3691,8 @@ class PipelineOptimizer(object):
def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
if framework.in_dygraph_mode():
raise Exception("In dygraph, don't support PipelineOptimizer.")
if not isinstance(optimizer, Optimizer):
if not isinstance(optimizer, Optimizer) and not isinstance(
optimizer, paddle.optimizer.Optimizer):
raise ValueError("The 'optimizer' parameter for "
"PipelineOptimizer must be an instance of "
"Optimizer, but the given type is {}.".format(
......
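Reviewer note: the hunk above relaxes PipelineOptimizer's type check so that both optimizer base classes are accepted. A construction-only sketch of what now passes (assumes static-graph mode; building a full pipeline program is out of scope here):

import paddle
import paddle.fluid as fluid

paddle.enable_static()  # PipelineOptimizer is static-graph only

# 2.0-style optimizer: accepted after this change
sgd_v2 = paddle.optimizer.SGD(learning_rate=0.01)
pipe_v2 = fluid.optimizer.PipelineOptimizer(sgd_v2, num_microbatches=2)

# legacy fluid optimizer: still accepted
sgd_v1 = fluid.optimizer.SGD(learning_rate=0.01)
pipe_v1 = fluid.optimizer.PipelineOptimizer(sgd_v1, num_microbatches=2)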
......@@ -20,6 +20,7 @@ from op_test import OpTest
from paddle.fluid import core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle
class TestAdamOp1(OpTest):
......@@ -401,11 +402,11 @@ class TestAdamOpBetaVariable(OpTest):
self.check_output()
class TestAdamOptimizerBetaVariable(unittest.TestCase):
def test_adam_optimizer(self):
def test_with_place(place, shape):
class TestAdamOpV2(unittest.TestCase):
def test_adam_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
......@@ -415,32 +416,97 @@ class TestAdamOptimizerBetaVariable(unittest.TestCase):
loss = fluid.layers.reduce_mean(conv)
beta1 = fluid.layers.create_global_var(
shape=[1],
value=0.85,
dtype='float32',
persistable=True)
shape=[1], value=0.85, dtype='float32', persistable=True)
beta2 = fluid.layers.create_global_var(
shape=[1],
value=0.95,
dtype='float32',
persistable=True)
opt = fluid.optimizer.Adam(
learning_rate=1e-5, beta1=beta1, beta2=beta2)
shape=[1], value=0.95, dtype='float32', persistable=True)
betas = [beta1, beta2]
opt = paddle.optimizer.Adam(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)
exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog,
feed={"data": data_np},
fetch_list=[loss])
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None
shape = [2, 3, 8, 8]
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
test_with_place(place, shape)
def test_adam_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.Adam(
learning_rate=0.01, parameters=linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
def test_adam_op_with_state_dict(self):
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding([10, 10])
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
# learning_rate is a LearningRateDecay
learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
weight_decay=fluid.regularizer.L2Decay(0.001),
parameters=emb.parameters())
lr = adam.get_lr()
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
# learning_rate is a Tensor
with self.assertRaises(TypeError):
learning_rate = np.array([0.01]).astype("float32")
learning_rate = paddle.to_tensor(learning_rate)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate, parameters=emb.parameters())
params = adam.get_opti_var_name_list()
assert (params is not None)
def test_adam_with_grad_clip(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(
0.1, parameters=linear.parameters(), grad_clip=clip)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
def test_adam_op_with_set_lr(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
lr = 0.01
adam.set_lr(lr)
cur_lr = adam.get_lr()
assert (lr == cur_lr)
lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32')
adam.set_lr(lr_var)
cur_lr = adam.get_lr()
assert (np.float32(lr) == cur_lr)
with self.assertRaises(TypeError):
lr = int(1)
adam.set_lr(lr)
if __name__ == "__main__":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
class TestAdamaxAPI(unittest.TestCase):
def test_adamax_api_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.Adamax(
learning_rate=0.01,
parameters=linear.parameters(),
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
def test_adamax_api(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = paddle.mean(conv)
beta1 = 0.85
beta2 = 0.95
opt = paddle.optimizer.Adamax(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)
exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None
if __name__ == "__main__":
unittest.main()
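Reviewer note: for context, the new paddle.optimizer.Adamax exercised above is the infinity-norm variant of Adam (Kingma & Ba). A NumPy sketch of the standard update rule, using the test's hyperparameters as illustrative defaults (the kernel's exact epsilon placement and bias correction may differ slightly):

import numpy as np

def adamax_step(param, grad, m, u, t, lr=1e-5, beta1=0.85, beta2=0.95, eps=1e-8):
    """One textbook Adamax update; a sketch, not the exact Paddle kernel."""
    m = beta1 * m + (1 - beta1) * grad        # first-moment estimate
    u = np.maximum(beta2 * u, np.abs(grad))   # exponentially weighted infinity norm
    lr_t = lr / (1 - beta1 ** t)              # bias-correct the learning rate
    param = param - lr_t * m / (u + eps)
    return param, m, u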
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import numpy as np
import paddle.fluid as fluid
class TestAdamWOp(unittest.TestCase):
def test_adamw_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.AdamW(
learning_rate=0.01,
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()
def test_adamw_op_coverage(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.AdamW(
learning_rate=0.0,
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
assert (adam.__str__() is not None)
def test_adamw_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = paddle.mean(conv)
beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True)
beta2 = fluid.layers.create_global_var(
shape=[1], value=0.95, dtype='float32', persistable=True)
betas = [beta1, beta2]
opt = paddle.optimizer.AdamW(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)
exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None
if __name__ == "__main__":
unittest.main()
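Reviewer note: the new paddle.optimizer.AdamW tested above implements decoupled weight decay (Loshchilov & Hutter): the decay is applied to the parameters directly rather than folded into the gradient, and apply_decay_param_fun selects which parameters decay. A NumPy sketch of one decoupled step (the op's exact ordering of decay and update may differ):

import numpy as np

def adamw_step(param, grad, m, v, t, lr=1e-5, beta1=0.85, beta2=0.95,
               eps=1e-8, weight_decay=0.01, decay_this_param=True):
    """Adam update plus decoupled weight decay; illustrative only."""
    if decay_this_param:                           # role of apply_decay_param_fun
        param = param * (1.0 - lr * weight_decay)  # shrink weights directly, not via the gradient
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v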
......@@ -55,7 +55,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -100,7 +100,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -55,7 +55,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -47,7 +47,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -165,7 +165,7 @@ class TestPSPassWithBow(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)
......
......@@ -51,7 +51,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
"custom_black_list": ['tanh'],
}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -60,7 +60,8 @@ class TestFleetDGCOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -72,7 +73,7 @@ class TestFleetDGCOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -87,7 +88,8 @@ class TestFleetDGCOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -44,7 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 2, "avg": True}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -58,7 +58,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -118,10 +118,129 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.nccl_comm_num = 2
strategy.sync_nccl_allreduce = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
exe.run(paddle.fluid.default_startup_program())
import numpy as np
def gen_data():
return {
"x": np.random.random(size=(128, 32)).astype('float32'),
"y": np.random.randint(
2, size=(128, 1)).astype('int64')
}
for i in range(10):
cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name])
print("cost of step[{}] = {}".format(i, cost_val))
proc_a = launch_func(node_func, node_a)
proc_a.start()
proc_b = launch_func(node_func, node_b)
proc_b.start()
proc_a.join()
proc_b.join()
def test_graph_execution_optimizer_not_apply_v2(self):
node_a = {
"PADDLE_TRAINER_ID": "0",
"PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
"http_proxy": "",
"https_proxy": ""
}
node_b = {
"PADDLE_TRAINER_ID": "1",
"PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004",
"http_proxy": "",
"https_proxy": ""
}
def node_func():
import paddle.distributed.fleet as fleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
proc_a = launch_func(node_func, node_a)
proc_a.start()
proc_b = launch_func(node_func, node_b)
proc_b.start()
proc_a.join()
proc_b.join()
def test_graph_execution_optimizer(self):
node_a = {
"PADDLE_TRAINER_ID": "0",
"PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
"http_proxy": "",
"https_proxy": ""
}
node_b = {
"PADDLE_TRAINER_ID": "1",
"PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002",
"PADDLE_TRAINERS_NUM": "2",
"PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
"http_proxy": "",
"https_proxy": ""
}
def node_func():
import paddle.distributed.fleet as fleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.nccl_comm_num = 2
strategy.sync_nccl_allreduce = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
exe.run(paddle.fluid.default_startup_program())
......
......@@ -60,7 +60,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.nccl_comm_num = 2
strategy.sync_nccl_allreduce = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -62,7 +62,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -75,7 +75,8 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.1, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -88,7 +89,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
strategy.lamb_configs = {
'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': ['.b_0'],
......
......@@ -62,7 +62,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -75,7 +76,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
startup_prog = fluid.Program()
train_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -46,7 +46,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
config['k_steps'] = 1
strategy.localsgd_configs = config
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -53,7 +53,7 @@ class TestFleetMetaOptimizer(unittest.TestCase):
strategy.pipeline = True
strategy.pipeline_configs = {'micro_batch': 2}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -45,7 +45,7 @@ class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
strategy.recompute = True
strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
import numpy as np
import six
import itertools
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer
from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
# Note(wangzhongpu)
# Dygraph does not support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, or RecomputeOptimizer.
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._fc1 = Linear(784, 10)
self._fc2 = Linear(10, 10)
def forward(self, inputs):
y = self._fc1(inputs)
y = self._fc2(y)
return y
class TestImperativeOptimizerBase(unittest.TestCase):
def setUp(self):
self.batch_num = 20
def get_optimizer_dygraph(self, parameter_list):
raise NotImplementedError()
def get_optimizer(self):
raise NotImplementedError()
def reader_decorator(self, reader):
def _reader_imple():
for item in reader():
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label
return _reader_imple
def _check_exception(self, exception_message, place=None):
seed = 90
batch_size = 128
if place == None:
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
with fluid.dygraph.guard(place):
try:
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
except Exception as e:
assert str(e) == exception_message
def _check_mlp(self, place=None):
seed = 90
batch_size = 128
if place == None:
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
paddle.batch(
self.reader_decorator(paddle.dataset.mnist.train()),
batch_size=batch_size,
drop_last=True),
places=fluid.CPUPlace())
dy_param_init_value = {}
for batch_id, data in enumerate(batch_py_reader()):
if batch_id >= self.batch_num:
break
img = data[0]
label = data[1]
label.stop_gradient = True
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
if batch_id == 0:
for param in mlp.parameters():
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
optimizer.minimize(avg_loss)
mlp.clear_gradients()
dy_param_value = {}
for param in mlp.parameters():
dy_param_value[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
if place == None:
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
mlp = MLP()
optimizer = self.get_optimizer()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.layers.reshape(img, shape=[batch_size, 784])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
optimizer.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
for param in mlp.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for batch_id, data in enumerate(train_reader()):
if batch_id >= self.batch_num:
break
static_x_data = np.array(
[x[0].reshape(1, 28, 28) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[128, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_out = out[0]
for i in range(1, len(out)):
static_param_value[static_param_name_list[i - 1]] = out[i]
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key]))
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_adam(self):
self._check_mlp()
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
return optimizer
def test_sgd_cycle(self):
self.cycle = True
self._check_mlp()
def test_sgd(self):
self.cycle = False
self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestOptimizerLearningRate(unittest.TestCase):
def test_constant_lr(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())
self.assertTrue(
np.allclose(
adam.get_lr(), 0.001, rtol=1e-06, atol=0.0))
for i in range(10):
adam.minimize(loss)
lr = adam.get_lr()
self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0))
def test_lr_decay(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
adam = paddle.optimizer.Adam(
fluid.dygraph.PiecewiseDecay(bd, value, 0),
parameters=linear.parameters())
self.assertTrue(
np.allclose(
adam.get_lr(), 0.2, rtol=1e-06, atol=0.0))
ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
for i in range(12):
adam.minimize(loss)
lr = adam.get_lr()
self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
def test_lr_decay_natural_exp(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
base_lr = 1.0
adam = paddle.optimizer.Adam(
fluid.dygraph.NaturalExpDecay(
learning_rate=base_lr,
decay_steps=3,
decay_rate=0.5,
staircase=True),
parameters=linear.parameters())
self.assertTrue(
np.allclose(
adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)]
for i in range(5):
adam.minimize(loss)
lr = adam.get_lr()
self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
def test_set_lr(self):
with fluid.dygraph.guard():
a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = fluid.dygraph.nn.Linear(10, 10)
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
for i in range(5):
adam.set_lr(lr_list[i])
adam.minimize(loss)
lr = adam.get_lr()
self.assertTrue(
np.allclose(
lr, lr_list[i], rtol=1e-06, atol=0.0))
lr_var = fluid.layers.create_global_var(
shape=[1], value=0.7, dtype='float32')
adam.set_lr(lr_var)
adam.minimize(loss)
lr = adam.get_lr()
self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0))
with self.assertRaises(RuntimeError):
adam = paddle.optimizer.Adam(
fluid.dygraph.NaturalExpDecay(
learning_rate=0.1,
decay_steps=3,
decay_rate=0.5,
staircase=True),
parameters=linear.parameters())
adam.set_lr(0.01)
class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = MomentumOptimizer(
learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
return optimizer
def test_momentum(self):
self._check_mlp()
class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = LarsMomentumOptimizer(
learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
return optimizer
def test_larsmomentum(self):
self._check_mlp()
class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = AdagradOptimizer(
learning_rate=0.2, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = AdagradOptimizer(learning_rate=0.2)
return optimizer
def test_adagrad(self):
self._check_mlp()
class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = AdamaxOptimizer(
learning_rate=0.2, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = AdamaxOptimizer(learning_rate=0.2)
return optimizer
def test_adamax(self):
self._check_mlp()
class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = DpsgdOptimizer(
learning_rate=0.01,
clip=10.0,
batch_size=16.0,
sigma=1.0,
parameter_list=parameter_list)
optimizer._seed = 100
return optimizer
def get_optimizer(self):
optimizer = DpsgdOptimizer(
learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
optimizer._seed = 100
return optimizer
def test_dpsgd(self):
self._check_mlp(place=fluid.CPUPlace())
class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = DecayedAdagradOptimizer(
learning_rate=0.2, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = DecayedAdagradOptimizer(learning_rate=0.2)
return optimizer
def test_decayadagrad(self):
self._check_mlp()
class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = AdadeltaOptimizer(
learning_rate=0.0003,
epsilon=1.0e-6,
rho=0.95,
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = AdadeltaOptimizer(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
return optimizer
def test_adadelta(self):
self._check_mlp()
class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = RMSPropOptimizer(
learning_rate=0.1, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = RMSPropOptimizer(learning_rate=0.1)
return optimizer
def test_rmsprop(self):
self._check_mlp()
class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = FtrlOptimizer(
learning_rate=0.1, parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = FtrlOptimizer(learning_rate=0.1)
return optimizer
def test_ftrl(self):
self._check_mlp()
def exclude_fn(param):
return param.name.endswith('.b_0')
class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = LambOptimizer(
learning_rate=0.002,
exclude_from_weight_decay_fn=exclude_fn,
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = LambOptimizer(
learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn)
return optimizer
def test_lamb(self):
self._check_mlp()
class TestImperativeModelAverage(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = ModelAverage(
0.15, min_average_window=10000, max_average_window=12500)
return optimizer
def test_modelaverage(self):
exception_message = "In dygraph, don't support ModelAverage."
self._check_exception(exception_message)
class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = DGCMomentumOptimizer(
learning_rate=0.0001,
momentum=0.9,
rampup_step=1000,
rampup_begin_step=1252,
sparsity=[0.999, 0.999])
return optimizer
def test_dgcmomentum(self):
exception_message = "In dygraph, don't support DGCMomentumOptimizer."
self._check_exception(exception_message)
class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = ExponentialMovingAverage(0.999)
return optimizer
def test_exponentialmoving(self):
exception_message = "In dygraph, don't support ExponentialMovingAverage."
self._check_exception(exception_message)
class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
optimizer = PipelineOptimizer(optimizer)
return optimizer
def test_pipline(self):
exception_message = "In dygraph, don't support PipelineOptimizer."
self._check_exception(exception_message)
class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
return optimizer
def test_lookahead(self):
exception_message = "In dygraph, don't support LookaheadOptimizer."
self._check_exception(exception_message)
class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.optimizer.SGD(learning_rate=0.5,
parameter_list=parameter_list)
optimizer = RecomputeOptimizer(optimizer)
return optimizer
def test_recompute(self):
exception_message = "In dygraph, don't support RecomputeOptimizer."
self._check_exception(exception_message)
class TestImperativeOptimizerList(unittest.TestCase):
def test_parameter_list(self):
with fluid.dygraph.guard():
linear_1 = Linear(10, 10)
linear_2 = Linear(10, 10)
sgd = SGDOptimizer(
1.0,
parameter_list=itertools.chain(linear_1.parameters(),
linear_2.parameters()))
in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
in_data = fluid.dygraph.to_variable(in_np)
y = linear_1(in_data)
y = linear_2(y)
loss = fluid.layers.reduce_mean(y)
loss.backward()
sgd.minimize(loss)
self.assertTrue(
len(sgd._parameter_list) ==
len(linear_1.parameters() + linear_2.parameters()))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding, Linear
import paddle.fluid.framework as framework
from paddle.optimizer import Adam
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from test_imperative_base import new_program_scope
import numpy as np
import six
import paddle
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
self._dropout = dropout
self._input = None
self._num_steps = num_steps
self.cell_array = []
self.hidden_array = []
self.weight_1_arr = []
self.weight_2_arr = []
self.bias_arr = []
self.mask_array = []
for i in range(self._num_layers):
weight_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
bias_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.Constant(0.0))
self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
def forward(self, input_embedding, init_hidden=None, init_cell=None):
self.cell_array = []
self.hidden_array = []
for i in range(self._num_layers):
pre_hidden = fluid.layers.slice(
init_hidden, axes=[0], starts=[i], ends=[i + 1])
pre_cell = fluid.layers.slice(
init_cell, axes=[0], starts=[i], ends=[i + 1])
pre_hidden = fluid.layers.reshape(
pre_hidden, shape=[-1, self._hidden_size])
pre_cell = fluid.layers.reshape(
pre_cell, shape=[-1, self._hidden_size])
self.hidden_array.append(pre_hidden)
self.cell_array.append(pre_cell)
res = []
for index in range(self._num_steps):
self._input = fluid.layers.slice(
input_embedding, axes=[1], starts=[index], ends=[index + 1])
self._input = fluid.layers.reshape(
self._input, shape=[-1, self._hidden_size])
for k in range(self._num_layers):
pre_hidden = self.hidden_array[k]
pre_cell = self.cell_array[k]
weight_1 = self.weight_1_arr[k]
bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = fluid.layers.elementwise_add(gate_input, bias)
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
self.hidden_array[k] = m
self.cell_array[k] = c
self._input = m
if self._dropout is not None and self._dropout > 0.0:
self._input = fluid.layers.dropout(
self._input,
dropout_prob=self._dropout,
dropout_implementation='upscale_in_train')
res.append(
fluid.layers.reshape(
self._input, shape=[1, -1, self._hidden_size]))
real_res = fluid.layers.concat(res, 0)
real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
last_hidden = fluid.layers.concat(self.hidden_array, 1)
last_hidden = fluid.layers.reshape(
last_hidden, shape=[-1, self._num_layers, self._hidden_size])
last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
last_cell = fluid.layers.concat(self.cell_array, 1)
last_cell = fluid.layers.reshape(
last_cell, shape=[-1, self._num_layers, self._hidden_size])
last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
def __init__(self,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN(
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def forward(self, input, label, init_hidden, init_cell):
init_h = fluid.layers.reshape(
init_hidden, shape=[self.num_layers, -1, self.hidden_size])
init_c = fluid.layers.reshape(
init_cell, shape=[self.num_layers, -1, self.hidden_size])
x_emb = self.embedding(input)
x_emb = fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
if self.dropout is not None and self.dropout > 0.0:
x_emb = fluid.layers.dropout(
x_emb,
dropout_prob=self.dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
init_c)
rnn_out = fluid.layers.reshape(
rnn_out, shape=[-1, self.num_steps, self.hidden_size])
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss, last_hidden, last_cell
class TestDygraphPtbRnn(unittest.TestCase):
def setUp(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [1.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
new_lr = 1.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
for param in ptb_model.parameters():
dy_param_updated[param.name] = param.numpy()
# check optimizer
self.opti_dict = adam.state_dict()
self.base_opti = {}
for k, v in self.opti_dict.items():
if isinstance(v, core.VarBase):
self.base_opti[v.name] = v.numpy()
self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
else:
self.base_opti[k] = v
fluid.save_dygraph(self.opti_dict, "./test_dy")
self.state_dict = ptb_model.state_dict()
self.model_base = {}
for k, v in self.state_dict.items():
np_t = v.numpy()
self.model_base[k] = np_t
paddle.save(self.state_dict, "./test_dy")
def testLoadAndSetVarBase(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [1.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
new_lr = 1.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
for param in ptb_model.parameters():
dy_param_updated[param.name] = param.numpy()
# check optimizer
opti_dict = adam.state_dict()
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
if isinstance(adam._learning_rate, LearningRateDecay):
adam._learning_rate.step_num = 0
para_state_dict, opti_state_dict = paddle.load("./test_dy")
adam.set_state_dict(opti_state_dict)
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
ptb_model.set_dict(para_state_dict)
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testSetVariable(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [1.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
new_lr = 1.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
for param in ptb_model.parameters():
dy_param_updated[param.name] = param.numpy()
# check optimizer
opti_dict = adam.state_dict()
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
if isinstance(adam._learning_rate, LearningRateDecay):
adam._learning_rate.step_num = 0
adam.set_state_dict(self.opti_dict)
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
np_t = v.numpy()
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
ptb_model.set_dict(self.state_dict)
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testSetNumpy(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [1.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
new_lr = 1.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
for param in ptb_model.parameters():
dy_param_updated[param.name] = param.numpy()
# check optimizer
opti_dict = adam.state_dict()
np_opti_dict = {}
# set to zero
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
np_t = v.numpy()
np_opti_dict[v.name] = np_t
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
else:
np_opti_dict[k] = v
if isinstance(adam._learning_rate, LearningRateDecay):
adam._learning_rate.step_num = 0
adam.set_state_dict(np_opti_dict)
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if isinstance(v, core.VarBase):
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name]))
else:
self.assertEqual(v, self.base_opti[k])
# check parameter
state_dict = ptb_model.state_dict()
np_state_dict = {}
for k, v in state_dict.items():
np_t = v.numpy()
np_state_dict[k] = np_t
var = v.value().get_tensor()
var.set(np.zeros_like(np_t), place)
ptb_model.set_dict(np_state_dict)
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testSetVariableBeforeTrain(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=0.0,
beta1=0.8,
beta2=0.6,
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
adam.set_state_dict(self.opti_dict)
ptb_model.set_dict(self.state_dict)
for i in range(1):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
opti_dict = adam.state_dict()
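# With learning_rate=0.0 the parameters must stay unchanged after one step,
# while global_step still advances by 1 and the beta1/beta2 power
# accumulators are multiplied by one extra factor of beta.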
for k, v in opti_dict.items():
if k == "global_step":
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
if k.find("beta1_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta1))
if k.find("beta2_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta2))
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testLoadAndSetVarBaseBeforeTrain(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [0.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
# set lr to zero so parameters are not updated
new_lr = 0.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=0.0,
beta1=0.8,
beta2=0.6,
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
state_dict, opti_dict = fluid.load_dygraph("./test_dy")
adam.set_state_dict(opti_dict)
ptb_model.set_dict(state_dict)
for i in range(1):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if k == "global_step":
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
if k.find("beta1_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta1))
if k.find("beta2_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta2))
# check parameter
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testSetNumpyBeforeTrain(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
bd = []
lr_arr = [0.0]
# this is a fake lr decay strategy
for i in range(1, 10):
bd.append(100 * i)
# set lr to 0.0 so parameters are not updated
new_lr = 0.0
lr_arr.append(new_lr)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
beta1=0.8,
beta2=0.6,
parameters=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
np_opti_dict = {}
np_state_dict = {}
for k, v in self.opti_dict.items():
if isinstance(v, core.VarBase):
np_opti_dict[v.name] = v.numpy()
else:
np_opti_dict[k] = v
for k, v in self.state_dict.items():
np_state_dict[k] = v.numpy()
adam.set_state_dict(np_opti_dict)
ptb_model.set_dict(np_state_dict)
for i in range(1):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
dy_loss.backward()
adam.minimize(dy_loss)
ptb_model.clear_gradients()
opti_dict = adam.state_dict()
for k, v in opti_dict.items():
if k == "global_step":
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] + 1))
if k.find("beta1_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta1))
if k.find("beta2_pow_acc_0") > 0:
self.assertTrue(
np.array_equal(v.numpy(), self.base_opti[v.name] *
adam._beta2))
# check parameter
state_dict = ptb_model.state_dict()
for k, v in state_dict.items():
new_t = v.numpy()
base_t = self.model_base[k]
self.assertTrue(np.array_equal(new_t, base_t))
def testOnlyLoadParams(self):
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10])
state_dict = emb.state_dict()
paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
para_state_dict, opti_state_dict = paddle.load(
os.path.join('saved_dy', 'emb_dy'))
self.assertTrue(opti_state_dict is None)
para_state_dict, opti_state_dict = paddle.load(
os.path.join('saved_dy', 'emb_dy.pdparams'))
para_state_dict, opti_state_dict = paddle.load(
os.path.join('saved_dy', 'emb_dy.pdopt'))
if __name__ == '__main__':
unittest.main()
......@@ -96,8 +96,8 @@ class TestRetainGraph(unittest.TestCase):
g = Generator()
d = Discriminator()
optim_g = paddle.optimizer.Adam(parameter_list=g.parameters())
optim_d = paddle.optimizer.Adam(parameter_list=d.parameters())
optim_g = paddle.optimizer.Adam(parameters=g.parameters())
optim_d = paddle.optimizer.Adam(parameters=d.parameters())
gan_criterion = paddle.nn.MSELoss()
l1_criterion = paddle.nn.L1Loss()
......
......@@ -20,6 +20,7 @@ import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle
def create_selected_rows_and_tensor(scope, place, height, row_num,
......@@ -222,5 +223,59 @@ class TestRmspropOp(TestBase):
size=size)
class TestRMSPropV2(unittest.TestCase):
def test_rmsprop_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
# This can be any optimizer supported by dygraph.
rmsprop = paddle.optimizer.RMSProp(
learning_rate=0.01,
parameters=linear.parameters(),
weight_decay=0.01)
out = linear(a)
out.backward()
rmsprop.step()
rmsprop.clear_gradients()
def test_rmsprop(self):
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
rms_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(ValueError, paddle.optimizer.RMSProp, None)
self.assertRaises(
ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None)
self.assertRaises(
ValueError,
paddle.optimizer.RMSProp,
learning_rate=0.1,
epsilon=None)
self.assertRaises(
ValueError,
paddle.optimizer.RMSProp,
learning_rate=0.1,
momentum=None)
if __name__ == "__main__":
unittest.main()
......@@ -14,21 +14,25 @@
__all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad',
'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd',
'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer',
'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer',
'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer',
'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD',
'SGDOptimizer'
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer',
'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer',
'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer',
'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer',
'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer',
'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer'
]
from ..fluid.optimizer import SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \
Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \
AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \
DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \
AdadeltaOptimizer, ModelAverage, LarsMomentum, \
LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \
from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
Ftrl, Adadelta, \
SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
RecomputeOptimizer
RecomputeOptimizer, LarsMomentumOptimizer
from .optimizer import Optimizer
from .adam import Adam
from .adamw import AdamW
from .adamax import Adamax
from .rmsprop import RMSProp
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
__all__ = ["Adam"]
class Adam(Optimizer):
"""
The Adam optimizer uses an optimization described at the end
of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
it can dynamically adjust the learning rate of each parameter using
the 1st moment estimates and the 2nd moment estimates of the gradient.
The parameter ``param_out`` update rule with gradient ``grad``:
.. math::
t & = t + 1
moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
learning\_rate & = learning\_rate * \\
\\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LearningRateDecay. The default value is 0.001.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
The accumulators are updated at every step. Every element of the two moving averages
is updated in both dense mode and sparse mode. If the size of a parameter is very large,
then the update may be very slow. The lazy mode only updates the elements that have
gradients in the current mini-batch, so it will be much faster. But this mode has
different semantics from the original Adam algorithm and may lead to different results.
The default value is False.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
adam = paddle.optimizer.Adam(learning_rate=0.1,
parameters=linear.parameters())
out.backward()
adam.step()
adam.clear_grad()
.. code-block:: python
# Adam with beta1/beta2 as Tensor and weight_decay as float
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
adam = paddle.optimizer.Adam(learning_rate=0.1,
parameters=linear.parameters(),
beta1=beta1,
beta2=beta2,
weight_decay=0.01)
out.backward()
adam.step()
adam.clear_grad()
"""
_moment1_acc_str = "moment1"
_moment2_acc_str = "moment2"
_beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc"
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
lazy_mode=False):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
super(Adam, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "adam"
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._lazy_mode = lazy_mode
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
# Create accumulator tensors for first and second moments
for p in parameters:
self._add_accumulator(self._moment1_acc_str, p)
self._add_accumulator(self._moment2_acc_str, p)
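# beta1^t and beta2^t are tracked as 1-element accumulators pinned to the CPU.
# When beta1/beta2 are passed as Tensors, the default values (0.9/0.999) are only
# used as the initial fill value; the actual beta Tensors are fed to the adam op
# through the Beta1Tensor/Beta2Tensor inputs at runtime.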
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
self._add_accumulator(
name=self._beta2_pow_acc_str,
param=p,
fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment1 = self._get_accumulator(self._moment1_acc_str,
param_and_grad[0])
moment2 = self._get_accumulator(self._moment2_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
# create the adam optimize op
if framework.in_dygraph_mode():
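# Dygraph fast path: call the C++ adam kernel directly. The parameter, moments
# and beta power accumulators are passed as both inputs and outputs, so the
# update happens in place and no op is appended to a block.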
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = core.ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2)
return None
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc]
}
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"epsilon": self._epsilon,
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
return adam_op
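# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the public API: a minimal NumPy
# version of the update rule documented in the Adam docstring above, assuming
# scalar beta1/beta2/epsilon. The helper name `_adam_update_sketch` is
# hypothetical; the real update is performed by the C++ "adam" op.
# ---------------------------------------------------------------------------
def _adam_update_sketch(param, grad, moment1, moment2, t,
                        lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    import numpy as np
    # first and second moment estimates of the gradient
    moment1 = beta1 * moment1 + (1 - beta1) * grad
    moment2 = beta2 * moment2 + (1 - beta2) * grad * grad
    # bias-corrected step size for step t (t starts at 1)
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    param = param - lr_t * moment1 / (np.sqrt(moment2) + epsilon)
    return param, moment1, moment2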
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
__all__ = ["Adamax"]
class Adamax(Optimizer):
"""
The Adamax optimizer is implemented based on the Adamax Optimization
in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
which makes the learning rate update algorithm more stable and simple.
The parameter ``param_out`` update rule with gradient ``grad``:
.. math::
t & = t + 1
moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
The original paper does not have an ``epsilon`` attribute;
it is added here for numerical stability to prevent division by zero.
Args:
learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LearningRateDecay. The default value is 0.001.
beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
The default value is 0.9.
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
**Notes**:
**Currently, Adamax doesn't support sparse parameter optimization.**
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
adamax = paddle.optimizer.Adamax(learning_rate=0.1,
parameters=linear.parameters(),
beta1=beta1,
beta2=beta2,
weight_decay=0.01)
out.backward()
adamax.step()
adamax.clear_grad()
"""
_moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm"
_beta1_pow_acc_str = "beta1_pow_acc"
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
super(Adamax, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "adamax"
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
def _create_accumulators(self, block, parameters):
# Create accumulator tensors for first moment and infinity norm
for p in parameters:
self._add_accumulator(self._moment_acc_str, p)
self._add_accumulator(self._inf_norm_acc_str, p)
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=self._beta1,
shape=[1])
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
inf_norm = self._get_accumulator(self._inf_norm_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
# create the adamax optimize op
adamax_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": self._create_param_lr(param_and_grad),
"Moment": moment,
"InfNorm": inf_norm,
"Beta1Pow": beta1_pow_acc
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": moment,
"InfNormOut": inf_norm
},
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon
},
stop_gradient=True)
return adamax_op
def _finish_update(self, block, parameters_and_grads):
"""Update Beta1 Power accumulator
"""
assert isinstance(block, framework.Block)
for param, grad in parameters_and_grads:
if grad is None or param.trainable is False:
continue
with param.block.program._optimized_guard(
[param, grad]), name_scope('adamax'):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1},
stop_gradient=True)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from .adam import Adam
from ..fluid import framework
import paddle
__all__ = ['AdamW']
class DecoupledWeightDecay(object):
def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
if not isinstance(coeff, float) and \
not isinstance(coeff, framework.Variable):
raise TypeError("coeff should be float or Tensor.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super(DecoupledWeightDecay, self).__init__(**kwargs)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
params_and_grads: A list of (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
if isinstance(self._coeff, float) and self._coeff == 0.0:
return
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue
if isinstance(self._coeff, float):
assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
"the type of coeff(float) and parameter(%s) is not consistent." % (param.dtype)
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
assert param.name not in self._params_name
scaled_params.append((param, grad, param * self._coeff))
self._params_name.add(param.name)
return scaled_params
def backward(self, **kargs):
return super(DecoupledWeightDecay, self).backward(**kargs)
def _apply_optimize(self, **kargs):
return super(DecoupledWeightDecay, self)._apply_optimize(**kargs)
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameters=parameters,
no_grad_set=no_grad_set)
scaled_params = self._scale_parameters(params_grads)
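# Apply the decoupled decay before the optimizer update is appended:
# param <- param - coeff * param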
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
optimize_ops = self._apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program)
return optimize_ops, params_grads
@framework.dygraph_only
def step(self):
parameter_list = self._parameter_list
self._dtype = None
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
class AdamW(DecoupledWeightDecay, Adam):
"""
The AdamW optimizer is implemented based on the AdamW Optimization
in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
It resolves the problem of L2 regularization failure in the Adam optimizer.
.. math::
t & = t + 1
moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
learning\_rate & = learning\_rate * \\
\\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
Args:
learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
It can be a float value or a LearningRateDecay. The default value is 0.001.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.9.
beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
It should be a float number or a Tensor with shape [1] and data type as float32.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
weight_decay (float|Tensor, optional): The weight decay coefficient, it can be a float or a Tensor. The default value is 0.0.
apply_decay_param_fun (function|None, optional): If it is not None,
only the parameters for which apply_decay_param_fun(param.name)==True
will be decayed. It only works when you want to apply weight decay to
a specified subset of parameters. Default: None.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
The accumulators are updated at every step. Every element of the two moving averages
is updated in both dense mode and sparse mode. If the size of a parameter is very large,
then the update may be very slow. The lazy mode only updates the elements that have
gradients in the current mini-batch, so it will be much faster. But this mode has
different semantics from the original Adam algorithm and may lead to different results.
The default value is False.
**Notes**:
**Currently, AdamW doesn't support sparse parameter optimization.**
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
adamw = paddle.optimizer.AdamW(learning_rate=0.1,
parameters=linear.parameters(),
beta1=beta1,
beta2=beta2,
weight_decay=0.01)
out.backward()
adamw.step()
adamw.clear_grad()
"""
def __init__(self,
learning_rate=0.001,
parameters=None,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
weight_decay=0.0,
apply_decay_param_fun=None,
grad_clip=None,
name=None,
lazy_mode=False):
args_dict = {
"learning_rate": learning_rate,
"parameters": parameters,
"beta1": beta1,
"beta2": beta2,
"epsilon": epsilon,
"grad_clip": grad_clip,
"name": name,
"lazy_mode": lazy_mode
}
super(AdamW, self).__init__(
weight_decay,
apply_decay_param_fun=apply_decay_param_fun,
**args_dict)
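# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the public API: a minimal NumPy view
# of one AdamW step as implemented by DecoupledWeightDecay.minimize()/step()
# above, assuming scalar hyper-parameters. `_adamw_update_sketch` is a
# hypothetical helper name.
# ---------------------------------------------------------------------------
def _adamw_update_sketch(param, grad, moment1, moment2, t, lr=0.001,
                         beta1=0.9, beta2=0.999, epsilon=1e-8, coeff=0.01):
    import numpy as np
    # the decay term is subtracted from the parameter directly (decoupled from
    # the gradient) and, in this implementation, is not scaled by the lr
    param = param - coeff * param
    # plain Adam update on the raw gradient (no L2 term folded into the grad)
    moment1 = beta1 * moment1 + (1 - beta1) * grad
    moment2 = beta2 * moment2 + (1 - beta2) * grad * grad
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    param = param - lr_t * moment1 / (np.sqrt(moment2) + epsilon)
    return param, moment1, moment2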
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import six
import logging
from collections import defaultdict
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
from ..fluid import framework
from ..fluid import layers
from ..fluid import unique_name
from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from ..fluid.framework import program_guard
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
from ..fluid.layers import ops
from ..fluid.regularizer import append_regularization_ops
from ..fluid.dygraph import base as imperative_base
from ..fluid.dygraph import no_grad
from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
from paddle.fluid import core
from paddle.fluid.layers import tensor
from functools import reduce
from ..fluid.wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
__all__ = ['Optimizer']
class Optimizer(object):
"""Optimizer Base class.
Define the common interface of an optimizer.
Users should not use this class directly,
but should use one of its implementations.
Args:
learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
It can be a float value or a LearningRateDecay.
parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of \
some derived class of ``GradientClipBase`` . There are three clipping strategies \
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
Returns:
Base class for optimizer.
Examples:
.. code-block:: python
#Take the subclass adam as an example
#Optimizer
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
adam = paddle.optimizer.Adam(learning_rate=0.1,
parameters=linear.parameters())
out.backward()
adam.step()
adam.clear_grad()
"""
@imperative_base.no_grad()
def __init__(self,
learning_rate,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
self._parameter_list = list(
parameters) if parameters is not None else None
self._name = name
if framework.in_dygraph_mode():
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, LearningRateDecay):
raise TypeError(
"learning rate should be float or LearningRateDecay, got %s here"
% type(learning_rate))
if self._parameter_list is None:
raise AttributeError(
"parameters argument given to the Optimizer should not be None in dygraph mode."
)
if weight_decay is not None:
for param in self._parameter_list:
if param.regularizer is not None:
logging.info(
"If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__())
break
else:
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
raise TypeError(
"learning rate should be float or Tensor, got %s here" %
type(learning_rate))
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
self.regularization = L2Decay(weight_decay)
else:
self.regularization = weight_decay
self._grad_clip = grad_clip
self._learning_rate = learning_rate
# the learning rate type should be inferred from loss
self._dtype = None
# each program should have an independent learning rate
# program -> tensor(learning_rate)
self._learning_rate_map = dict()
if isinstance(self._learning_rate, framework.Variable):
self._learning_rate_map[framework.default_main_program(
)] = self._learning_rate
# Dictionary of accumulators. Some optimizer subclasses need to
# allocate and manage extra tensors associated with the parameters
# to train. These tensors are called accumulators.
# {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict())
self.helper = None
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
self.clear_gradients = self.clear_grad
@framework.dygraph_only
def state_dict(self):
'''
Get state dict information from the optimizer. It contains all the Tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum etc. If LearningRateDecay has been used, global_step will be included in the state dict.
If the optimizer has never been called (i.e., minimize has not been run), the state_dict is empty.
Args:
None
Returns:
state_dict(dict): dict containing all the Tensors used by the optimizer
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding([10, 10])
adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
state_dict = adam.state_dict()
'''
state_dict = {}
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
state_dict[var_tmp.name] = var_tmp
# global step if use lr decay
if isinstance(self._learning_rate, LearningRateDecay):
state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
if not isinstance(self._learning_rate, _LearningRateEpochDecay):
var_tmp = None
var_temp = framework._varbase_creator(
None, name='global_step', dtype='int32')
tensor.fill_constant(
[1], "int32", self._learning_rate.step_num, out=var_temp)
state_dict['global_step'] = var_temp
return state_dict
@framework.dygraph_only
def set_state_dict(self, state_dict):
'''
Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum etc. If LearningRateDecay has been used, global_step will be changed.
Args:
state_dict(dict): Dict containing all the Tensors needed by the optimizer
Return:
None
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
emb = paddle.nn.Embedding([10, 10])
state_dict = emb.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000),
parameters=emb.parameters())
state_dict = adam.state_dict()
paddle.framework.save(state_dict, "paddle_dy")
para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
adam.set_state_dict(opti_state_dict)
'''
if isinstance(self._learning_rate, LearningRateDecay):
self._learning_rate.set_dict(state_dict["LR_Scheduler"])
if not isinstance(self._learning_rate, _LearningRateEpochDecay):
assert 'global_step' in state_dict, \
'Global step not found in state_dict: when LearningRateDecay is used in dygraph, global_step must be in the state dict'
global_step = state_dict['global_step']
if isinstance(global_step, Variable):
step_np = global_step
step_np = np.array(step_np.value().get_tensor())
assert step_np.shape == (1,), \
"global step shape should be (1,), but the loaded shape is {}".format(step_np.shape)
self._learning_rate.step_num = int(step_np[0])
elif isinstance(global_step, np.ndarray):
assert global_step.shape == (1,), \
"global step shape should be (1,), but the loaded shape is {}".format(global_step.shape)
self._learning_rate.step_num = global_step[0]
else:
raise RuntimeError(
"Type not supported: the value in the state dict must be VarBase, Tensor or numpy.ndarray, but the given type is ",
type(global_step))
self._accumulators_holder = state_dict
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
assert var_tmp.name in state_dict, \
"optimizer Tensor {} not found".format( var_tmp.name )
var = var_tmp.value()
tensor = var.get_tensor()
model_np = np.array(tensor)
load_para = state_dict[var_tmp.name]
if isinstance(load_para, Variable):
load_para_np = load_para.numpy()
elif isinstance(load_para, core.VarBase):
load_para_np = load_para.numpy()
elif isinstance(load_para, np.ndarray):
load_para_np = load_para
else:
raise RuntimeError("State dict type {} not supprt".format(
str(type(load_para))))
assert model_np.shape == load_para_np.shape, \
"Parameter shape not match, Dygraph Parameter [ {} ] needs a tensor with shape {} but the loaded tensor has shape {}".format(
var_tmp.name, model_np.shape, load_para_np.shape)
assert model_np.dtype == load_para_np.dtype, \
"Parameter dtype not match, Dygraph Parameter [ {} ] needs a tensor with dtype {} but the loaded tensor has dtype {}".format(
var_tmp.name, model_np.dtype, load_para_np.dtype)
tensor.set(load_para_np, framework._current_expected_place())
def get_opti_var_name_list(self):
return self._opti_name_list
def _create_global_learning_rate(self):
if imperative_base.enabled():
# create learning rate tensor
if isinstance(self._learning_rate, float):
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable):
return
else:
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
# get learning rate Tensor from LearningRateDecay
elif isinstance(self._learning_rate, LearningRateDecay):
self._learning_rate_map[framework.default_main_program(
)] = self._learning_rate()
else:
raise TypeError(
"optimizer's learning rate must be float or LearningRateDecay"
)
else:
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable):
return
else:
if not isinstance(self._learning_rate, float):
raise TypeError(
"learning rate Tensor is create outside optimizer,"
"can not create new learning rate Tensor for new program"
)
# create learning rate in the current main program
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
@framework.dygraph_only
def set_lr(self, value):
"""
:api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer uses LearningRateDecay,
this API cannot be invoked, because it would lead to a conflict.
Args:
value (float|Tensor): the value of learning rate
Returns:
None
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
# set learning rate manually by python float value
lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
for i in range(5):
adam.set_lr(lr_list[i])
lr = adam.get_lr()
print("current lr is {}".format(lr))
# Print:
# current lr is 0.2
# current lr is 0.3
# current lr is 0.4
# current lr is 0.5
# current lr is 0.6
# set learning rate manually by framework Tensor
lr_var = paddle.create_global_var(
shape=[1], value=0.7, dtype='float32')
adam.set_lr(lr_var)
lr = adam.get_lr()
print("current lr is {}".format(lr))
# Print:
# current lr is 0.7
"""
if not isinstance(value, (framework.Variable, float)):
raise TypeError(
"The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s."
% (type(value)))
if isinstance(self._learning_rate, LearningRateDecay):
raise RuntimeError(
"optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict."
)
if isinstance(value, float):
self._learning_rate = value
current_lr = self._global_learning_rate()
if current_lr is not None:
global_block = framework.default_main_program().global_block()
global_block.append_op(
type='fill_constant',
outputs={'Out': [current_lr]},
attrs={
'dtype': current_lr.dtype,
'shape': list(current_lr.shape),
'value': float(value)
},
stop_gradient=True)
else:
assert len(value.shape) == 1 and value.shape[
0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]"
self._learning_rate_map[framework.default_main_program()] = value
@framework.dygraph_only
def get_lr(self):
"""
:api_attr: imperative
Get the learning rate of the current step. When LearningRateDecay is not used, the return value is
the same for every step; otherwise the learning rate of the current step is returned.
Returns:
float: The learning rate of the current step.
Examples:
.. code-block:: python
import numpy as np
import paddle
# example1: LearningRateDecay is not used, return value is all the same
paddle.disable_static()
emb = paddle.nn.Embedding([10, 10])
adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters())
lr = adam.get_lr()
print(lr) # 0.001
# example2: PiecewiseDecay is used, return the step learning rate
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.reduce_mean(out)
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0),
parameters=linear.parameters())
# first step: learning rate is 0.2
np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True
# learning rate for different steps
ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
for i in range(12):
adam.step()
lr = adam.get_lr()
np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True
"""
current_lr = self._global_learning_rate()
if isinstance(current_lr, framework.Variable):
return self._global_learning_rate().numpy()[0]
if isinstance(self._learning_rate, float):
return self._learning_rate
elif isinstance(self._learning_rate, _LearningRateEpochDecay):
step_lr = self._learning_rate()
return step_lr.numpy()[0]
else:
step_lr = self._learning_rate.step()
if isinstance(step_lr, (float, int)):
return step_lr
else:
return step_lr.numpy()[0]
def _global_learning_rate(self, program=None):
"""
get global decayed learning rate
:return:
"""
if program is None:
program = framework.default_main_program()
return self._learning_rate_map.get(program, None)
def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op
"""
raise NotImplementedError(
"Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
)
def _create_param_lr(self, param_and_grad):
# create learning rate tensor for every parameter
param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate']
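# optimize_attr['learning_rate'] is a per-parameter multiplier set through
# ParamAttr; a value of 1.0 means the global learning rate is used unchanged.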
if type(param_lr) == Variable:
return param_lr
else:
if param_lr == 1.0:
return self._global_learning_rate()
else:
with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
return self._global_learning_rate() * param_lr
def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters
Args:
block: the block in which the loss tensor is present
parameters: list of parameter tensors for the optimizer
"""
pass
def _finish_update(self, block, parameters_and_grads):
"""Finish any custom updates needed
before completing an optimization step
Args:
block: the block in which the loss tensor is present
parameters: list of parameter tensors for the optimizer
Returns:
None
"""
pass
def _add_accumulator(self,
name,
param,
dtype=None,
fill_value=0.0,
shape=None,
type=None,
device=None):
"""Utility function to add an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter tensor for which the accumulator is to be added
dtype: data type of the accumulator tensor
fill_value: value used to initialize the accumulator tensor
shape: shape of the accumulator tensor; defaults to the parameter's shape
type: variable type of the accumulator tensor
device: device on which the accumulator is placed
"""
if self._name is not None:
name = self._name + "_" + name
if (name in self._accumulators and
param.name in self._accumulators[name]):
if framework.in_dygraph_mode():
return self._accumulators[name][param.name]
raise Exception("Accumulator {} already exists for parameter {}".
format(name, param.name))
if shape is None:
shape = param.shape
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_" + name
var_name = unique_name.generate(var_name)
self._opti_name_list.append(var_name)
var = self.helper.create_global_variable(
name=var_name,
persistable=True,
dtype=dtype or param.dtype,
type=param.type if type is None else type,
shape=shape,
belong_to_optimizer=True)
if device is None:
device = self._get_device_for_param(param.name)
with device_guard(device):
self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value)))
if framework.in_dygraph_mode():
if len(self._accumulators_holder) > 0:
assert var_name in self._accumulators_holder, \
"Optimizer set error, {} should be in the state dict".format(var_name)
var.set_value(self._accumulators_holder[var_name])
self._accumulators[name][param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter tensor for which accumulator is to be fetched
Returns:
accumulator tensor for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
if (name not in self._accumulators or
param.name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}".
format(name, param.name))
return self._accumulators[name][param.name]
def _update_param_device_map(self, parameters_and_grads, target_block):
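# Record, for each trainable parameter, the device attribute of the first op
# that consumes it, so that its optimizer ops can later be placed on the same
# device (see _get_device_for_param).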
for param_and_grad in parameters_and_grads:
if param_and_grad[0].trainable is True:
param_name = param_and_grad[0].name
ops = target_block.ops
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
)
for op in ops:
input_arg_names = op.input_arg_names
if param_name in input_arg_names:
self._param_device_map[param_name] = op.attr(
device_attr_name)
break
def _get_device_for_param(self, param_name):
device = None
if param_name in self._param_device_map:
device = self._param_device_map[param_name]
return device
def _create_optimization_pass(self, parameters_and_grads):
"""Add optimization operators to update gradients to tensors.
Args:
parameters_and_grads(list(tuple(Tensor, Tensor))):
a list of (tensor, gradient) pair to update.
Returns:
return_op_list: a list of operators that will complete one step of
optimization. This will include parameter update ops, global step
update ops and any other custom ops required by subclasses to manage
their internal state.
"""
# This is a default implementation of create_optimization_pass that
# can be shared by most optimizers. This implementation assumes that
# the subclass will implement the _append_optimize_op method and the
# _initialize_tensors method. The subclass can extend the
# _create_accumulators method if it needs to create accumulators
# for parameters and extend _finish_update method to add custom ops.
# Always called under program_guard; use the global block as the loss block.
# But if current block is in control flow, append optimize op in the
# grad block of current block
global_block = framework.default_main_program().global_block()
target_block = global_block
current_block = framework.default_main_program().current_block()
if current_block.idx != global_block.idx:
assert current_block.backward_block_idx != -1, \
"current block is not global_block, but it doesn't have backward block."
target_block = framework.default_main_program().blocks[
current_block.backward_block_idx]
start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
self._update_param_device_map(parameters_and_grads, target_block)
self._create_accumulators(
target_block,
[p[0] for p in parameters_and_grads if p[0].trainable])
self._create_global_learning_rate()
if framework.in_dygraph_mode():
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].trainable is True:
self._append_optimize_op(target_block, param_and_grad)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
with param_and_grad[0].block.program._optimized_guard(
param_and_grad), name_scope("optimizer"):
if param_and_grad[0].trainable is True:
device = self._get_device_for_param(param_and_grad[0]
.name)
with device_guard(device):
optimize_op = self._append_optimize_op(
target_block, param_and_grad)
# Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies
self._finish_update(target_block, parameters_and_grads)
end = len(target_block.ops)
return target_block._slice_ops(start, end)
def _append_dgc_ops(self, param_and_grad):
pass
def backward(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None,
callbacks=None):
"""
The first part of ``minimize``, it does auto-diff to append backward operations to
the current program.
Args:
loss (Tensor): ``loss`` tensor to run optimizations.
startup_program (Program, optional): :ref:`api_fluid_Program` for
initializing parameters in ``parameters``. The default value
is None, at this time :ref:`api_fluid_default_startup_program` will be used.
parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
to be updated. The default value is None.
callbacks (list, optional): list of callable objects to run when appending backward
operator for one parameter. The default value is None.
Return:
list: list of (param, grad) tensor pairs, param is ``Parameter``,
grad is the gradient value corresponding to the parameter.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_grad()
"""
act_no_grad_set = None
if framework.in_dygraph_mode():
pass
else:
act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
self._dtype = loss.dtype
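        # In dygraph mode the gradients already live on the parameters after the
        # user called backward(), so they are just collected here; in static mode
        # append_backward builds the backward ops into the program below.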
if framework.in_dygraph_mode():
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
# create gradient tensor
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
else:
if callbacks is None:
callbacks = [error_clip_callback]
else:
assert (isinstance(callbacks, list))
program = loss.block.program
            assert len(loss.shape) == 1 and loss.shape[0] == 1, \
                "The loss.shape should be (1,), but the current loss.shape is {}. " \
                "Maybe you should call paddle.mean to process the current loss.".format(
                    loss.shape)
parameter_list = parameters if parameters \
else self._parameter_list
with program_guard(program, startup_program):
params_grads = append_backward(loss, parameter_list,
act_no_grad_set, callbacks)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
return params_grads
def apply_gradients(self, params_grads):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
optimizer = paddle.optimizer.Adam(learning_rate=0.1,
parameters=linear.parameters())
params_grads = optimizer.backward(loss)
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
# 'optimizer(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
return optimize_ops
def _apply_optimize(self, loss, startup_program, params_grads):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Tensor): loss tensor to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameters`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework.in_dygraph_mode():
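            # In dygraph mode there is no user-built Program, so clipping,
            # regularization and the optimize ops are applied directly to the
            # collected (param, grad) pairs under the default programs.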
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
else:
program = loss.block.program
with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops
def _get_no_grad_set(self, loss, no_grad_set=None):
no_grad_set = _get_no_grad_set_name(no_grad_set)
parameters = loss.block.program.global_block().all_parameters()
param_no_trainable = set(
[param.name for param in parameters if param.trainable is False])
        # If a parameter is not trainable, it should not have a gradient.
no_grad_set.update(param_no_trainable)
return no_grad_set
@framework.dygraph_only
def clear_grad(self):
"""
Clear the gradients of all optimized parameters for model.
Returns:
None
Examples:
.. code-block:: python
import numpy as np
import paddle
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_grad()
"""
for p in self._parameter_list:
if p.trainable:
p.clear_gradient()
@imperative_base.no_grad()
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
"""
Add operations to minimize ``loss`` by updating ``parameters``.
Args:
loss (Tensor): A ``Tensor`` containing the value to minimize.
startup_program (Program, optional): :ref:`api_fluid_Program` for
initializing parameters in ``parameters``. The default value
is None, at this time :ref:`api_fluid_default_startup_program` will be used.
parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
to be updated. The default value is None.
Returns:
tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
adam_optimizer = paddle.optimizer.Adam(0.01)
adam_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
for data in train_reader():
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
"""
        assert isinstance(loss, Variable), "The loss should be a Tensor."
parameter_list = parameters if parameters \
else self._parameter_list
params_grads = self.backward(
loss,
startup_program=startup_program,
parameters=parameter_list,
no_grad_set=no_grad_set)
optimize_ops = self._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
return optimize_ops, params_grads
@framework.dygraph_only
def step(self):
"""
Execute the optimizer once.
Returns:
None
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_grad()
"""
parameter_list = self._parameter_list
self._dtype = None
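        # Collect (param, grad) pairs for every trainable parameter that
        # actually received a gradient in the last backward pass.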
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
__all__ = ["RMSProp"]
class RMSProp(Optimizer):
"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
rate method. The original slides proposed RMSProp: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
The original equation is as follows:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
    The first equation calculates a moving average of the squared gradient for
    each weight, and the gradient is then divided by :math:`\\sqrt{r(w,t)}`.
    In some cases, adding a momentum term :math:`\\beta` is beneficial.
    In our implementation, Nesterov momentum is used:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
    where :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
    and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
    smoothing term to avoid division by zero, usually set somewhere in the range
    from 1e-4 to 1e-8.
Parameters:
learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
It can be a float value or a LearningRateDecay.
        rho(float, optional): :math:`\\rho` in the equation, default is 0.95.
        epsilon(float, optional): :math:`\\epsilon` in the equation, a smoothing term
            to avoid division by zero, default is 1e-6.
        momentum(float, optional): :math:`\\beta` in the equation, the momentum term,
            default is 0.0.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
:ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): This parameter is used by developers to print debugging information. \
For details, please refer to :ref:`api_guide_Name`. Default is None.
Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
linear = paddle.nn.Linear(10, 10)
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
            rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
                            parameters=linear.parameters(),
                            weight_decay=0.01)
            out.backward()
            rmsprop.step()
            rmsprop.clear_grad()
"""
_momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__(self,
learning_rate,
rho=0.95,
epsilon=1.0e-6,
momentum=0.0,
centered=False,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
if learning_rate is None:
raise ValueError("learning_rate is not set.")
if rho is None:
raise ValueError("rho is not set.")
if epsilon is None:
raise ValueError("epsilon is not set.")
if momentum is None:
raise ValueError("momentum is not set.")
super(RMSProp, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
self.type = "rmsprop"
self._rho = rho
self._epsilon = epsilon
self._momentum = momentum
self._centered = centered
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
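        # One accumulator set per parameter: momentum v(w, t), mean square
        # r(w, t) and mean gradient g(w, t) (the latter only used when centered).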
for p in parameters:
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
momentum_acc = self._get_accumulator(self._momentum_acc_str,
param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0])
mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
param_and_grad[0])
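        # The accumulators map to the symbols in the class docstring:
        # Moment -> v(w, t), MeanSquare -> r(w, t), MeanGrad -> g(w, t).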
rmsprop_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": momentum_acc,
"MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc
},
attrs={
"epsilon": self._epsilon,
"decay": self._rho,
"momentum": self._momentum,
"centered": self._centered
},
stop_gradient=True)
return rmsprop_op