From 1a5d3defb1404bbed0619077011d84c0c3bed383 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 17 Jul 2020 15:38:30 +0800 Subject: [PATCH] [Dy2stat] Add Reinforcement learning unittest (#25445) * add reinforcement learning model test=develop * align backward test=develop * add gym in paddle_build.sh test=develop * rm pip install in script test=develop * refine paddle_build.sh test=develop * fix sed error in macOS test=develop * polish code test=develop --- paddle/scripts/paddle_build.sh | 7 + .../test_reinforcement_learning.py | 218 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e1924e2ba28..1f7baf135d6 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -64,6 +64,9 @@ function cmake_base() { # Delete previous built whl packages rm -rf python/dist 2>/dev/null || true + # `gym` is only used in unittest, it's not suitable to add in requirements.txt. + # Add it dynamically. + echo "gym" >> ${PADDLE_ROOT}/python/requirements.txt # Support build for all python versions, currently # including cp27-cp27m and cp27-cp27mu. PYTHON_FLAGS="" @@ -119,6 +122,8 @@ function cmake_base() { exit 1 fi fi + # delete `gym` to avoid modifying requirements.txt in *.whl + sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt else if [ "$1" != "" ]; then echo "using python abi: $1" @@ -175,6 +180,8 @@ function cmake_base() { else pip install -r ${PADDLE_ROOT}/python/requirements.txt fi + # delete `gym` to avoid modifying requirements.txt in *.whl + sed -i "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt fi if [ "$SYSTEM" == "Darwin" ]; then diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py new file mode 100644 index 00000000000..2f753cd5cfc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gym +import math +import itertools +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.dygraph.nn as nn +from paddle.fluid.dygraph import to_variable, Layer +from paddle.fluid.dygraph import declarative, ProgramTranslator + +import unittest + +SEED = 2020 +program_translator = ProgramTranslator() + + +class Policy(Layer): + def __init__(self): + super(Policy, self).__init__() + + self.affine1 = nn.Linear(4, 128) + self.affine2 = nn.Linear(128, 2) + self.dropout_ratio = 0.6 + + self.saved_log_probs = [] + self.rewards = [] + + @declarative + def forward(self, x): + x = fluid.layers.reshape(x, shape=[1, 4]) + x = self.affine1(x) + x = fluid.layers.dropout(x, self.dropout_ratio) + x = fluid.layers.relu(x) + action_scores = self.affine2(x) + + log_prob = fluid.layers.softmax(action_scores, axis=1) + + return log_prob + + +class Args(object): + gamma = 0.99 + log_interval = 1 + train_step = 10 + + +def train(args, place, to_static): + program_translator.enable(to_static) + + env = gym.make('CartPole-v0') + env.seed(SEED) + + with fluid.dygraph.guard(place): + fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + local_random = np.random.RandomState(SEED) + + policy = Policy() + + eps = np.finfo(np.float32).eps.item() + optimizer = fluid.optimizer.AdamaxOptimizer( + learning_rate=1e-2, parameter_list=policy.parameters()) + + def get_mean_and_std(values=[]): + n = 0. + s = 0. + for val in values: + s += val + n += 1 + mean = s / n + + std = 0. + for val in values: + std += (val - mean) * (val - mean) + std /= n + std = math.sqrt(std) + + return mean, std + + def sample_action(probs): + sample = local_random.random_sample() + idx = 0 + + while idx < len(probs) and sample > probs[idx]: + sample -= probs[idx] + idx += 1 + mask = [0.] * len(probs) + mask[idx] = 1. + + return idx, np.array([mask]).astype("float32") + + def choose_best_action(probs): + idx = 0 if probs[0] > probs[1] else 1 + mask = [1., 0.] if idx == 0 else [0., 1.] + + return idx, np.array([mask]).astype("float32") + + def select_action(state): + state = to_variable(state) + state.stop_gradient = True + loss_probs = policy(state) + # print(loss_probs.name) + probs = loss_probs.numpy() + + action, _mask = sample_action(probs[0]) + mask = to_variable(_mask) + mask.stop_gradient = True + + loss_probs = fluid.layers.log(loss_probs) + loss_probs = fluid.layers.elementwise_mul(loss_probs, mask) + loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1) + + policy.saved_log_probs.append(loss_probs) + return action, loss_probs + + def finish_episode(): + R = 0 + policy_loss = [] + returns = [] + for r in policy.rewards[::-1]: + R = r + args.gamma * R + returns.insert(0, R) + + mean, std = get_mean_and_std(returns) + + returns = np.array(returns).astype("float32") + returns = (returns - mean) / (std + eps) + + # calculate policy loss of each step. 
+ for log_prob, R in zip(policy.saved_log_probs, returns): + log_prob_numpy = log_prob.numpy() + + R_numpy = np.ones_like(log_prob_numpy).astype("float32") + _R = -1 * R * R_numpy + _R = to_variable(_R) + _R.stop_gradient = True + cur_loss = fluid.layers.elementwise_mul(_R, log_prob) + policy_loss.append(cur_loss) + + policy_loss = fluid.layers.concat(policy_loss) + policy_loss = fluid.layers.reduce_sum(policy_loss) + + policy_loss.backward() + optimizer.minimize(policy_loss) + policy.clear_gradients() + + del policy.rewards[:] + del policy.saved_log_probs[:] + + return returns + + loss_data = [] + running_reward = 10 + for i_episode in itertools.count(1): + state, ep_reward = env.reset(), 0 + # TODO(Aurelius84): In RL, we continuously select actions with multiple steps, + # then accumulate loss to apply optimization. But currently all vars shared with + # the same inner scope, which has problem in backward. I will fix it in next PR. + for t in range(1, 2): # default 1000 + state = np.array(state).astype("float32") + action, loss = select_action(state) + state, reward, done, _ = env.step(action) + + # log loss_probs + loss_data.append(loss.numpy()[0]) + + policy.rewards.append(reward) + ep_reward += reward + + if done: + break + + # sum loss and apply optimization + returns = finish_episode() + + running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward + if i_episode % args.log_interval == 0: + print( + 'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'. + format(i_episode, ep_reward, running_reward, + loss.numpy()[0])) + + if i_episode > args.train_step: + break + + return np.array(loss_data) + + +class TestDeclarative(unittest.TestCase): + def setUp(self): + self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ + else fluid.CPUPlace() + + self.args = Args() + + def test_train(self): + st_out = train(self.args, self.place, to_static=True) + dy_out = train(self.args, self.place, to_static=False) + self.assertTrue( + np.allclose(st_out, dy_out), + msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out)) + + +if __name__ == '__main__': + unittest.main() -- GitLab
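
Note (illustrative sketch, not part of the patch above): in the test, `sample_action` draws an action by walking the cumulative probability mass of the softmax output, and `select_action` then isolates log pi(a|s) for the chosen action by taking the log of the probabilities, multiplying by a one-hot mask, and reducing over the action axis. The standalone NumPy sketch below mirrors that logic for a two-action CartPole-style policy; all names here are hypothetical helpers, not Paddle APIs, and the index is clamped to the last action for safety:

import numpy as np

def sample_action(probs, rng):
    # Inverse-CDF walk: subtract each action's probability until the random
    # draw falls inside the current action's mass (same loop as in the test).
    sample = rng.random_sample()
    idx = 0
    while idx < len(probs) - 1 and sample > probs[idx]:
        sample -= probs[idx]
        idx += 1
    mask = np.zeros(len(probs), dtype="float32")
    mask[idx] = 1.0  # one-hot mask selecting the chosen action
    return idx, mask

rng = np.random.RandomState(2020)
probs = np.array([0.4, 0.6], dtype="float32")   # softmax output for one state
action, mask = sample_action(probs, rng)
log_prob = np.sum(np.log(probs) * mask)         # log pi(a|s), as select_action builds it
print(action, log_prob)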
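
Similarly, `finish_episode` folds the episode's rewards into discounted returns back to front (R_t = r_t + gamma * R_{t+1}), standardizes them, and weights each saved log-probability by the negated return, i.e. the REINFORCE surrogate loss sum_t of -R_t * log pi(a_t|s_t). A minimal NumPy sketch of that computation (again with hypothetical helper names; a standalone illustration, not the Paddle implementation):

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # Back-to-front accumulation R_t = r_t + gamma * R_{t+1}, then
    # standardization with a population std, as get_mean_and_std computes it.
    R, returns = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = np.asarray(returns, dtype="float32")
    eps = np.finfo(np.float32).eps.item()
    return (returns - returns.mean()) / (returns.std() + eps)

def reinforce_loss(log_probs, returns):
    # Sum of -R_t * log pi(a_t|s_t); minimizing this raises the probability
    # of actions that led to above-average returns.
    return float(np.sum(-np.asarray(returns) * np.asarray(log_probs)))

rewards = [1.0, 1.0, 1.0]           # CartPole gives +1 per surviving step
log_probs = [-0.69, -0.51, -0.92]   # example per-step log pi(a_t|s_t) values
print(reinforce_loss(log_probs, discounted_returns(rewards)))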