# test_imperative_reinforcement.py
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
16

17
import numpy as np
18
from test_imperative_base import new_program_scope
19 20 21 22

import paddle
import paddle.fluid as fluid
import paddle.fluid.dygraph.nn as nn
23
from paddle.fluid import core
H
hong 已提交
24
from paddle.fluid.framework import _test_eager_guard
25
from paddle.fluid.optimizer import SGDOptimizer
26 27 28


class Policy(fluid.dygraph.Layer):
    """Small two-layer policy network used by the test in this file.

    ``forward`` reshapes its input to rows of ``input_size`` features, then
    applies ``affine1`` -> dropout -> ReLU -> ``affine2`` and returns the
    softmax of the two resulting action scores along axis 1.

    Args:
        input_size: Number of features per input row. It sizes the first
            linear layer and the reshape performed in ``forward``.
    """

    def __init__(self, input_size):
        super().__init__()

        # Remembered so that forward() reshapes to the same width affine1 was
        # built for, instead of relying on a hard-coded feature count.
        self.input_size = input_size

        self.affine1 = nn.Linear(input_size, 128)
        self.affine2 = nn.Linear(128, 2)
        self.dropout_ratio = 0.6

        # Not read or written by forward(); presumably per-episode buffers
        # for callers -- TODO confirm against users of this class.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, inputs):
        """Return the softmax action probabilities for ``inputs``.

        Args:
            inputs: Tensor whose element count is a multiple of
                ``self.input_size``; it is reshaped to ``[-1, input_size]``.

        Returns:
            The softmax (axis 1) of the output of ``affine2``.
        """
        x = paddle.reshape(inputs, shape=[-1, self.input_size])
        x = self.affine1(x)
        x = fluid.layers.dropout(x, self.dropout_ratio)
        x = fluid.layers.relu(x)
        action_scores = self.affine2(x)
        return fluid.layers.softmax(action_scores, axis=1)


class TestImperativeMnist(unittest.TestCase):
    """Compare one SGD step on a ``Policy`` loss across execution modes.

    The same forward/backward/update sequence is run three times: inside
    ``fluid.dygraph.guard()``, inside ``fluid.dygraph.guard()`` combined with
    ``_test_eager_guard()``, and as a static program under
    ``new_program_scope()``. The loss, the initial parameters and the updated
    parameters of the two dygraph runs are each required to be element-wise
    equal to the static results.

    NOTE: the class and method names mention MNIST, but the body only
    exercises the ``Policy`` network defined in this file; the names are kept
    so existing test IDs do not change.
    """

    def _assert_param_dicts_equal(self, expected, actual, what):
        """Assert that ``actual[name]`` equals every array in ``expected``.

        Args:
            expected: Mapping of parameter name to numpy array (reference).
            actual: Mapping that must contain every key of ``expected``.
            what: Short label included in the failure message.
        """
        for name, value in expected.items():
            self.assertTrue(
                np.equal(value, actual[name]).all(),
                msg="{} mismatch for parameter {}".format(what, name),
            )

    def test_mnist_float32(self):
        """Run the dygraph, eager and static variants and compare them."""
        seed = 90

        # Shared random inputs fed to all three runs.
        state = np.random.normal(size=4).astype("float32")
        reward = np.random.random(size=[1, 1]).astype("float32")
        mask = np.array([[0, 1]]).astype("float32")

        def run_dygraph():
            """Do one forward/backward/SGD step imperatively.

            Returns:
                Tuple of (loss as numpy, parameters before the update,
                parameters after the update); both parameter collections are
                dicts keyed by parameter name.
            """
            # Re-seed on every call so each run starts from the same state.
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            policy = Policy(input_size=4)

            dy_state = fluid.dygraph.base.to_variable(state)
            dy_state.stop_gradient = True
            loss_probs = policy(dy_state)

            dy_mask = fluid.dygraph.base.to_variable(mask)
            dy_mask.stop_gradient = True

            # Log-probabilities, masked, then summed over the last axis.
            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
            loss_probs = paddle.sum(loss_probs, axis=-1)

            dy_reward = fluid.dygraph.base.to_variable(reward)
            dy_reward.stop_gradient = True

            loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
            loss = paddle.sum(loss_probs)

            sgd = SGDOptimizer(
                learning_rate=1e-3, parameter_list=policy.parameters()
            )

            dy_out = loss.numpy()

            # Snapshot the parameters before the optimizer touches them.
            dy_param_init_value = {
                param.name: param.numpy() for param in policy.parameters()
            }

            loss.backward()
            sgd.minimize(loss)
            policy.clear_gradients()

            dy_param_value = {
                param.name: param.numpy() for param in policy.parameters()
            }

            return dy_out, dy_param_init_value, dy_param_value

        with fluid.dygraph.guard():
            dy_out, dy_param_init_value, dy_param_value = run_dygraph()

        with fluid.dygraph.guard():
            with _test_eager_guard():
                (
                    eager_out,
                    eager_param_init_value,
                    eager_param_value,
                ) = run_dygraph()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            exe = fluid.Executor(
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )

            policy = Policy(input_size=4)

            st_sgd = SGDOptimizer(learning_rate=1e-3)

            st_state = fluid.layers.data(
                name='st_state', shape=[4], dtype='float32'
            )
            st_reward = fluid.layers.data(
                name='st_reward', shape=[1], dtype='float32'
            )
            st_mask = fluid.layers.data(
                name='st_mask', shape=[2], dtype='float32'
            )

            st_loss_probs = policy(st_state)

            st_loss_probs = fluid.layers.log(st_loss_probs)
            st_loss_probs = fluid.layers.elementwise_mul(st_loss_probs, st_mask)
            st_loss_probs = paddle.sum(st_loss_probs, axis=-1)

            st_loss_probs = fluid.layers.elementwise_mul(
                st_reward, st_loss_probs
            )
            st_loss = paddle.sum(st_loss_probs)

            st_sgd.minimize(st_loss)

            # Initialize the parameters and fetch their starting values.
            static_param_name_list = [
                param.name for param in policy.parameters()
            ]

            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )
            static_param_init_value = dict(zip(static_param_name_list, out))

            # Fetch the loss first, followed by every updated parameter.
            fetch_list = [st_loss.name]
            fetch_list.extend(static_param_name_list)

            out = exe.run(
                fluid.default_main_program(),
                feed={"st_state": state, "st_reward": reward, "st_mask": mask},
                fetch_list=fetch_list,
            )

            static_out = out[0]
            static_param_value = dict(zip(static_param_name_list, out[1:]))

        # check dygraph against static
        self._assert_param_dicts_equal(
            static_param_init_value, dy_param_init_value, "dygraph init"
        )
        self.assertTrue(
            np.equal(static_out, dy_out).all(),
            msg="dygraph loss differs from static loss",
        )
        self._assert_param_dicts_equal(
            static_param_value, dy_param_value, "dygraph updated"
        )

        # check eager against static
        self._assert_param_dicts_equal(
            static_param_init_value, eager_param_init_value, "eager init"
        )
        self.assertTrue(
            np.equal(static_out, eager_out).all(),
            msg="eager loss differs from static loss",
        )
        self._assert_param_dicts_equal(
            static_param_value, eager_param_value, "eager updated"
        )

201 202

if __name__ == '__main__':
    # Script entry point: call paddle.enable_static() first, then hand
    # control to the unittest runner.
    paddle.enable_static()
    unittest.main()