# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import math
import unittest

import gym
import numpy as np

import paddle
import paddle.nn.functional as F
from paddle import fluid
from paddle.fluid.dygraph import to_variable
from paddle.jit.api import to_static
from paddle.nn import Layer

SEED = 2020


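# Policy network for CartPole: two fully connected layers (with dropout)
# mapping the 4-dimensional observation to a distribution over the 2 actions.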
class Policy(Layer):
    def __init__(self):
        super().__init__()

        self.affine1 = paddle.nn.Linear(4, 128)
        self.affine2 = paddle.nn.Linear(128, 2)
        self.dropout_ratio = 0.6

        self.saved_log_probs = []
        self.rewards = []

    @to_static
    def forward(self, x):
        x = paddle.reshape(x, shape=[1, 4])
        x = self.affine1(x)
        x = paddle.nn.functional.dropout(x, self.dropout_ratio)
        x = F.relu(x)
        action_scores = self.affine2(x)

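        # Note: despite the name, this is the softmax probability over actions;
        # the log is taken later in select_action().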
        log_prob = paddle.nn.functional.softmax(action_scores, axis=1)

        return log_prob


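# Training hyper-parameters: discount factor, logging interval, and the
# number of episodes to run in this test.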
class Args:
    gamma = 0.99
    log_interval = 1
    train_step = 10


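# Run the REINFORCE training loop on CartPole-v0 in either dynamic-graph or
# to_static mode and return the per-step policy losses for comparison.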
def train(args, place, to_static):
    paddle.jit.enable_to_static(to_static)

    env = gym.make('CartPole-v0')
    env.seed(SEED)

    with fluid.dygraph.guard(place):
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        local_random = np.random.RandomState(SEED)

        policy = Policy()

        eps = np.finfo(np.float32).eps.item()
        optimizer = fluid.optimizer.AdamaxOptimizer(
            learning_rate=1e-2, parameter_list=policy.parameters()
        )

        def get_mean_and_std(values=[]):
            n = 0.0
            s = 0.0
            for val in values:
                s += val
                n += 1
            mean = s / n

            std = 0.0
            for val in values:
                std += (val - mean) * (val - mean)
            std /= n
            std = math.sqrt(std)

            return mean, std

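        # Sample an action index from the probability vector with the seeded
        # RandomState and return it together with a one-hot mask of the choice.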
        def sample_action(probs):
            sample = local_random.random_sample()
            idx = 0

            while idx < len(probs) and sample > probs[idx]:
                sample -= probs[idx]
                idx += 1
            mask = [0.0] * len(probs)
            mask[idx] = 1.0

            return idx, np.array([mask]).astype("float32")

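        # Greedy variant of sample_action (not used in the training loop below).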
        def choose_best_action(probs):
            idx = 0 if probs[0] > probs[1] else 1
            mask = [1.0, 0.0] if idx == 0 else [0.0, 1.0]

            return idx, np.array([mask]).astype("float32")

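        # Run the policy on the current state, sample an action, and keep the
        # log-probability of the chosen action for the policy-gradient update.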
        def select_action(state):
            state = to_variable(state)
            state.stop_gradient = True
            loss_probs = policy(state)

            probs = loss_probs.numpy()

            action, _mask = sample_action(probs[0])
            mask = to_variable(_mask)
            mask.stop_gradient = True

            loss_probs = paddle.log(loss_probs)
            loss_probs = paddle.multiply(loss_probs, mask)
            loss_probs = paddle.sum(loss_probs, axis=-1)

            policy.saved_log_probs.append(loss_probs)
            return action, loss_probs

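        # Compute discounted, normalized returns for the finished episode and
        # apply one REINFORCE update with the Adamax optimizer.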
        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
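            # Accumulate discounted returns backwards: R_t = r_t + gamma * R_{t+1}.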
            for r in policy.rewards[::-1]:
                R = r + args.gamma * R
                returns.insert(0, R)

            mean, std = get_mean_and_std(returns)

            returns = np.array(returns).astype("float32")
            returns = (returns - mean) / (std + eps)

            # calculate policy loss of each step.
            for log_prob, R in zip(policy.saved_log_probs, returns):
                log_prob_numpy = log_prob.numpy()

                R_numpy = np.ones_like(log_prob_numpy).astype("float32")
                _R = -1 * R * R_numpy
                _R = to_variable(_R)
                _R.stop_gradient = True
                cur_loss = paddle.multiply(_R, log_prob)
                policy_loss.append(cur_loss)

            policy_loss = paddle.concat(policy_loss)
            policy_loss = paddle.sum(policy_loss)

            policy_loss.backward()
            optimizer.minimize(policy_loss)
            policy.clear_gradients()

            del policy.rewards[:]
            del policy.saved_log_probs[:]

            return returns

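        # Main training loop: run a small, fixed number of episodes and record
        # the per-step losses so the two execution modes can be compared.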
        loss_data = []
        running_reward = 10
        for i_episode in itertools.count(1):
            state, ep_reward = env.reset(), 0
            # The default loop count is 10000 in the original model; it is reduced to 1000 here to keep the test small.
            for t in range(1, 1000):
                state = np.array(state).astype("float32")
                action, loss = select_action(state)
                state, reward, done, _ = env.step(action)

                # log loss_probs
                loss_data.append(loss.numpy()[0])

                policy.rewards.append(reward)
                ep_reward += reward

                if done:
                    break

            # sum loss and apply optimization
            returns = finish_episode()

            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            if i_episode % args.log_interval == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.format(
                        i_episode, ep_reward, running_reward, loss.numpy()[0]
                    )
                )

            if i_episode > args.train_step:
                break

        return np.array(loss_data)


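# Compare program-translated (to_static) execution against plain dygraph execution.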
class TestDeclarative(unittest.TestCase):
    def setUp(self):
        self.place = (
            fluid.CUDAPlace(0)
            if fluid.is_compiled_with_cuda()
            else fluid.CPUPlace()
        )
        self.args = Args()

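    # Both runs are seeded identically, so the per-step losses should match
    # within a small tolerance.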
    def test_train(self):
        st_out = train(self.args, self.place, to_static=True)
        dy_out = train(self.args, self.place, to_static=False)
        np.testing.assert_allclose(st_out, dy_out, rtol=1e-05)


if __name__ == '__main__':
    unittest.main()