Commit 1475ca77 authored by LI Yunxiang, committed by Bo Zhou

Update reward calculation in QuickStart (#182)

* Update reward calculation in QuickStart

* update

* yapf
Parent dbb5931a
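This commit replaces the discounted, normalized return from the QuickStart helper module (calc_discount_norm_reward) with an undiscounted reward-to-go computed directly in train.py. The sketch below puts the two computations side by side on a short CartPole-style reward trace; it is illustrative only, and the variable names outside the two functions are not part of the diff:

```python
import numpy as np

def calc_discount_norm_reward(reward_list, gamma):
    # old helper: discounted return per step, then whitened to zero mean / unit std
    returns = np.zeros_like(reward_list, dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = gamma * running + reward_list[i]
        returns[i] = running
    return (returns - returns.mean()) / returns.std()

def calc_reward_to_go(reward_list):
    # new helper: undiscounted reward-to-go, G_t = r_t + G_{t+1} (mutates its input)
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += reward_list[i + 1]
    return np.array(reward_list)

rewards = [1.0, 1.0, 1.0, 1.0]  # CartPole gives +1 for each step the pole stays up
print(calc_discount_norm_reward(list(rewards), 0.99))  # ≈ [ 1.34  0.45 -0.44 -1.35]
print(calc_reward_to_go(list(rewards)))                # [4. 3. 2. 1.]
```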
@@ -20,11 +20,9 @@ from parl.utils import logger
from cartpole_model import CartpoleModel
from cartpole_agent import CartpoleAgent
from policy_gradient import PolicyGradient
from utils import calc_discount_norm_reward
OBS_DIM = 4
ACT_DIM = 2
GAMMA = 0.99
LEARNING_RATE = 1e-3
@@ -47,6 +45,12 @@ def run_episode(env, agent, train_or_test='train'):
return obs_list, action_list, reward_list
def calc_reward_to_go(reward_list):
for i in range(len(reward_list) - 2, -1, -1):
reward_list[i] += reward_list[i + 1]
return np.array(reward_list)
def main():
env = gym.make('CartPole-v0')
model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
@@ -62,7 +66,7 @@ def main():
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
batch_reward = calc_reward_to_go(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if (i + 1) % 100 == 0:
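Note that calc_reward_to_go as added above modifies reward_list in place and applies no discount; GAMMA = 0.99 stays defined in train.py but is no longer consumed. A non-mutating variant with an optional discount factor might look like the sketch below (a hypothetical helper, not part of this commit):

```python
import numpy as np

def reward_to_go(reward_list, gamma=1.0):
    """Per-step return G_t = r_t + gamma * G_{t+1}, computed without mutating the input."""
    returns = np.array(reward_list, dtype=np.float64)
    for i in range(len(returns) - 2, -1, -1):
        returns[i] += gamma * returns[i + 1]
    return returns

print(reward_to_go([1.0, 1.0, 1.0]))        # [3. 2. 1.]
print(reward_to_go([1.0, 1.0, 1.0], 0.99))  # [2.9701 1.99   1.    ]
```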
The helper module utils.py, deleted by this commit:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def calc_discount_norm_reward(reward_list, gamma):
'''
Calculate the discounted reward list according to the discount factor gamma, and normalize it.
Args:
reward_list(list): a list containing the rewards along the trajectory.
gamma(float): the discounted factor for accumulation reward computation.
Returns:
a list containing the discounted reward
'''
discount_norm_reward = np.zeros_like(reward_list)
discount_cumulative_reward = 0
for i in reversed(range(0, len(reward_list))):
discount_cumulative_reward = (
gamma * discount_cumulative_reward + reward_list[i])
discount_norm_reward[i] = discount_cumulative_reward
discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
return discount_norm_reward
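For reference, the backward loop in the deleted helper above is equivalent to a standard linear-filter formulation; a minimal sketch, not part of the commit, assuming SciPy is available:

```python
import numpy as np
from scipy.signal import lfilter

def discounted_returns(reward_list, gamma):
    # y[t] = r[t] + gamma * y[t+1], obtained by filtering the reversed sequence
    r = np.asarray(reward_list, dtype=np.float64)
    return lfilter([1.0], [1.0, -gamma], r[::-1])[::-1]

g = discounted_returns([1.0, 1.0, 1.0, 1.0], 0.99)
print(g)                         # ≈ [3.9404 2.9701 1.99   1.    ]
print((g - g.mean()) / g.std())  # whitened, as calc_discount_norm_reward returned
```

The diff that follows applies the same change to a second copy of the QuickStart training script; its deleted helper file is identical to the one above.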
@@ -19,11 +19,9 @@ import os.path
from cartpole_agent import CartpoleAgent
from cartpole_model import CartpoleModel
from parl.utils import logger
from utils import calc_discount_norm_reward
OBS_DIM = 4
ACT_DIM = 2
GAMMA = 0.99
LEARNING_RATE = 1e-3
@@ -46,6 +44,12 @@ def run_episode(env, agent, train_or_test='train'):
return obs_list, action_list, reward_list
def calc_reward_to_go(reward_list):
for i in range(len(reward_list) - 2, -1, -1):
reward_list[i] += reward_list[i + 1]
return np.array(reward_list)
def main():
env = gym.make("CartPole-v0")
model = CartpoleModel(act_dim=ACT_DIM)
@@ -64,7 +68,7 @@ def main():
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
batch_reward = calc_reward_to_go(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if (i + 1) % 100 == 0:
The second copy of utils.py, also deleted by this commit:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def calc_discount_norm_reward(reward_list, gamma):
'''
Calculate the discounted reward list according to the discount factor gamma, and normalize it.
Args:
reward_list(list): a list containing the rewards along the trajectory.
gamma(float): the discounted factor for accumulation reward computation.
Returns:
a list containing the discounted reward
'''
discount_norm_reward = np.zeros_like(reward_list)
discount_cumulative_reward = 0
for i in reversed(range(0, len(reward_list))):
discount_cumulative_reward = (
gamma * discount_cumulative_reward + reward_list[i])
discount_norm_reward[i] = discount_cumulative_reward
discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
return discount_norm_reward