From 1475ca77b9c5ebe12ac2d3b6908fdafed0d0506e Mon Sep 17 00:00:00 2001
From: LI Yunxiang <39279048+Banmahhhh@users.noreply.github.com>
Date: Mon, 9 Dec 2019 10:25:36 +0800
Subject: [PATCH] Update reward calculation in QuickStart (#182)

* Update reward calculation in QuickStart

* update

* yapf
---
 examples/EagerMode/QuickStart/train.py | 10 ++++---
 examples/EagerMode/QuickStart/utils.py | 36 --------------------------
 examples/QuickStart/train.py           | 10 ++++---
 examples/QuickStart/utils.py           | 36 --------------------------
 4 files changed, 14 insertions(+), 78 deletions(-)
 delete mode 100644 examples/EagerMode/QuickStart/utils.py
 delete mode 100644 examples/QuickStart/utils.py

diff --git a/examples/EagerMode/QuickStart/train.py b/examples/EagerMode/QuickStart/train.py
index d9a55f4..af73f23 100644
--- a/examples/EagerMode/QuickStart/train.py
+++ b/examples/EagerMode/QuickStart/train.py
@@ -20,11 +20,9 @@ from parl.utils import logger
 from cartpole_model import CartpoleModel
 from cartpole_agent import CartpoleAgent
 from policy_gradient import PolicyGradient
-from utils import calc_discount_norm_reward
 
 OBS_DIM = 4
 ACT_DIM = 2
-GAMMA = 0.99
 LEARNING_RATE = 1e-3
 
 
@@ -47,6 +45,12 @@ def run_episode(env, agent, train_or_test='train'):
     return obs_list, action_list, reward_list
 
 
+def calc_reward_to_go(reward_list):
+    for i in range(len(reward_list) - 2, -1, -1):
+        reward_list[i] += reward_list[i + 1]
+    return np.array(reward_list)
+
+
 def main():
     env = gym.make('CartPole-v0')
     model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
@@ -62,7 +66,7 @@
 
         batch_obs = np.array(obs_list)
         batch_action = np.array(action_list)
-        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
+        batch_reward = calc_reward_to_go(reward_list)
 
         agent.learn(batch_obs, batch_action, batch_reward)
         if (i + 1) % 100 == 0:
diff --git a/examples/EagerMode/QuickStart/utils.py b/examples/EagerMode/QuickStart/utils.py
deleted file mode 100644
index dd4b9e4..0000000
--- a/examples/EagerMode/QuickStart/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-def calc_discount_norm_reward(reward_list, gamma):
-    '''
-    Calculate the discounted reward list according to the discount factor gamma, and normalize it.
-    Args:
-        reward_list(list): a list containing the rewards along the trajectory.
-        gamma(float): the discounted factor for accumulation reward computation.
-    Returns:
-        a list containing the discounted reward
-    '''
-    discount_norm_reward = np.zeros_like(reward_list)
-
-    discount_cumulative_reward = 0
-    for i in reversed(range(0, len(reward_list))):
-        discount_cumulative_reward = (
-            gamma * discount_cumulative_reward + reward_list[i])
-        discount_norm_reward[i] = discount_cumulative_reward
-    discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
-    discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
-    return discount_norm_reward
diff --git a/examples/QuickStart/train.py b/examples/QuickStart/train.py
index 27de75a..fb4e66d 100644
--- a/examples/QuickStart/train.py
+++ b/examples/QuickStart/train.py
@@ -19,11 +19,9 @@ import os.path
 from cartpole_agent import CartpoleAgent
 from cartpole_model import CartpoleModel
 from parl.utils import logger
-from utils import calc_discount_norm_reward
 
 OBS_DIM = 4
 ACT_DIM = 2
-GAMMA = 0.99
 LEARNING_RATE = 1e-3
 
 
@@ -46,6 +44,12 @@ def run_episode(env, agent, train_or_test='train'):
     return obs_list, action_list, reward_list
 
 
+def calc_reward_to_go(reward_list):
+    for i in range(len(reward_list) - 2, -1, -1):
+        reward_list[i] += reward_list[i + 1]
+    return np.array(reward_list)
+
+
 def main():
     env = gym.make("CartPole-v0")
     model = CartpoleModel(act_dim=ACT_DIM)
@@ -64,7 +68,7 @@
 
         batch_obs = np.array(obs_list)
         batch_action = np.array(action_list)
-        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
+        batch_reward = calc_reward_to_go(reward_list)
 
         agent.learn(batch_obs, batch_action, batch_reward)
         if (i + 1) % 100 == 0:
diff --git a/examples/QuickStart/utils.py b/examples/QuickStart/utils.py
deleted file mode 100644
index dd4b9e4..0000000
--- a/examples/QuickStart/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-def calc_discount_norm_reward(reward_list, gamma):
-    '''
-    Calculate the discounted reward list according to the discount factor gamma, and normalize it.
-    Args:
-        reward_list(list): a list containing the rewards along the trajectory.
-        gamma(float): the discounted factor for accumulation reward computation.
-    Returns:
-        a list containing the discounted reward
-    '''
-    discount_norm_reward = np.zeros_like(reward_list)
-
-    discount_cumulative_reward = 0
-    for i in reversed(range(0, len(reward_list))):
-        discount_cumulative_reward = (
-            gamma * discount_cumulative_reward + reward_list[i])
-        discount_norm_reward[i] = discount_cumulative_reward
-    discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
-    discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
-    return discount_norm_reward
-- 
GitLab
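
For readers who want to see the effect of this change outside the patch, the standalone sketch below contrasts the new calc_reward_to_go helper (copied from the hunks above) with the behaviour of the calc_discount_norm_reward function deleted from utils.py. The sample reward list and the __main__ driver are illustrative only and are not part of the PARL examples; CartPole-v0 simply returns +1 per surviving step, which is why constant rewards are used.

# Standalone illustration (not part of the patch); values are hypothetical.
import numpy as np


def calc_reward_to_go(reward_list):
    # New helper from the patch: undiscounted suffix sums ("reward to go").
    # Note that it mutates reward_list in place before converting to an array.
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += reward_list[i + 1]
    return np.array(reward_list)


def calc_discount_norm_reward(reward_list, gamma):
    # Behaviour of the deleted utils.py helper: per-step discounted return,
    # then normalization to zero mean and unit standard deviation.
    out = np.zeros(len(reward_list), dtype='float64')
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = gamma * running + reward_list[i]
        out[i] = running
    return (out - np.mean(out)) / np.std(out)


if __name__ == '__main__':
    rewards = [1.0] * 5  # five steps of CartPole-style +1 reward
    print(calc_reward_to_go(list(rewards)))                # [5. 4. 3. 2. 1.]
    print(calc_discount_norm_reward(list(rewards), 0.99))  # normalized discounted returns

Passing copies (list(rewards)) keeps the two calls independent, since calc_reward_to_go modifies its argument; the training loops in the patch do not reuse reward_list after the call, so the in-place update is harmless there.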