diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
new file mode 100644
index 0000000000000000000000000000000000000000..36f6daeb37fda17feb71d8a5205884ce47d9d612
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+import paddle.fluid.dygraph.nn as nn
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+
+class Policy(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Policy, self).__init__(name_scope)
+
+        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.affine2 = nn.FC(self.full_name(), size=2)
+        self.dropout_ratio = 0.6
+
+        self.saved_log_probs = []
+        self.rewards = []
+
+    def forward(self, inputs):
+        x = fluid.layers.reshape(inputs, shape=[-1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.dropout(x, self.dropout_ratio)
+        x = fluid.layers.relu(x)
+        action_scores = self.affine2(x)
+        return fluid.layers.softmax(action_scores, axis=1)
+
+
+class TestImperativeMnist(unittest.TestCase):
+    def test_mnist_float32(self):
+        seed = 90
+        epoch_num = 1
+
+        state = np.random.normal(size=4).astype("float32")
+        state_list = state.tolist()
+        reward = np.random.random(size=[1, 1]).astype("float32")
+        reward_list = reward.tolist()
+        action_list = [1]
+        action = np.array(action_list).astype("float32")
+        mask_list = [[0, 1]]
+        mask = np.array(mask_list).astype("float32")
+
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            policy = Policy("PolicyModel")
+
+            dy_state = fluid.dygraph.base.to_variable(state)
+            dy_state.stop_gradient = True
+            loss_probs = policy(dy_state)
+
+            dy_mask = fluid.dygraph.base.to_variable(mask)
+            dy_mask.stop_gradient = True
+
+            loss_probs = fluid.layers.log(loss_probs)
+            loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
+            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+            dy_reward = fluid.dygraph.base.to_variable(reward)
+            dy_reward.stop_gradient = True
+
+            loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
+            loss = fluid.layers.reduce_sum(loss_probs)
+
+            sgd = SGDOptimizer(learning_rate=1e-3)
+
+            dy_param_init_value = {}
+
+            dy_out = loss.numpy()
+
+            for param in policy.parameters():
+                dy_param_init_value[param.name] = param.numpy()
+
+            loss.backward()
+            sgd.minimize(loss)
+            policy.clear_gradients()
+
+            dy_param_value = {}
+            for param in policy.parameters():
+                dy_param_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+
+            policy = Policy("PolicyModel")
+
+            st_sgd = SGDOptimizer(learning_rate=1e-3)
+
+            st_state = fluid.layers.data(
+                name='st_state', shape=[4], dtype='float32')
+            st_reward = fluid.layers.data(
+                name='st_reward', shape=[1], dtype='float32')
+            st_mask = fluid.layers.data(
+                name='st_mask', shape=[2], dtype='float32')
+
+            st_loss_probs = policy(st_state)
+
+            st_loss_probs = fluid.layers.log(st_loss_probs)
+            st_loss_probs = fluid.layers.elementwise_mul(st_loss_probs, st_mask)
+            st_loss_probs = fluid.layers.reduce_sum(st_loss_probs, dim=-1)
+
+            st_loss_probs = fluid.layers.elementwise_mul(st_reward,
+                                                         st_loss_probs)
+            st_loss = fluid.layers.reduce_sum(st_loss_probs)
+
+            st_sgd.minimize(st_loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in policy.parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            fetch_list = [st_loss.name]
+            fetch_list.extend(static_param_name_list)
+
+            out = exe.run(
+                fluid.default_main_program(),
+                feed={"st_state": state,
+                      "st_reward": reward,
+                      "st_mask": mask},
+                fetch_list=fetch_list)
+
+            static_param_value = {}
+            static_out = out[0]
+            for i in range(1, len(out)):
+                static_param_value[static_param_name_list[i - 1]] = out[i]
+
+        #self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.equal(value, dy_param_init_value[key]).all())
+
+        self.assertTrue(np.equal(static_out, dy_out).all())
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.equal(value, dy_param_value[key]).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
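
Note (illustrative, not part of the patch): both the dygraph branch and the static-graph branch above build the same REINFORCE-style quantity, loss = sum(reward * sum(log(softmax(scores)) * mask, axis=-1)), and the test then asserts that the two execution modes yield identical initial parameters, loss values, and post-step parameters. A minimal NumPy sketch of that formula follows; the reference_loss helper and the scores values are made up for illustration and are not part of the test file.

import numpy as np

def reference_loss(scores, mask, reward):
    # softmax over the action axis, mirroring fluid.layers.softmax(..., axis=1)
    exp = np.exp(scores - scores.max(axis=1, keepdims=True))
    probs = exp / exp.sum(axis=1, keepdims=True)
    # keep only the log-probability of the taken action via the one-hot mask
    masked_log_probs = np.sum(np.log(probs) * mask, axis=-1)
    # weight by the reward and reduce to a scalar, as the test does
    return np.sum(reward * masked_log_probs)

# hypothetical [1, 2] action scores, with the mask/reward shapes used in the test
scores = np.array([[0.3, 0.7]], dtype="float32")
mask = np.array([[0.0, 1.0]], dtype="float32")
reward = np.random.random(size=[1, 1]).astype("float32")
print(reference_loss(scores, mask, reward))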