From 1475ca77b9c5ebe12ac2d3b6908fdafed0d0506e Mon Sep 17 00:00:00 2001
From: LI Yunxiang <39279048+Banmahhhh@users.noreply.github.com>
Date: Mon, 9 Dec 2019 10:25:36 +0800
Subject: [PATCH] Update reward calculation in QuickStart (#182)

* Update reward calculation in QuickStart

* update

* yapf
---
 examples/EagerMode/QuickStart/train.py | 10 ++++---
 examples/EagerMode/QuickStart/utils.py | 36 --------------------------
 examples/QuickStart/train.py           | 10 ++++---
 examples/QuickStart/utils.py           | 36 --------------------------
 4 files changed, 14 insertions(+), 78 deletions(-)
 delete mode 100644 examples/EagerMode/QuickStart/utils.py
 delete mode 100644 examples/QuickStart/utils.py

diff --git a/examples/EagerMode/QuickStart/train.py b/examples/EagerMode/QuickStart/train.py
index d9a55f4..af73f23 100644
--- a/examples/EagerMode/QuickStart/train.py
+++ b/examples/EagerMode/QuickStart/train.py
@@ -20,11 +20,9 @@ from parl.utils import logger
 from cartpole_model import CartpoleModel
 from cartpole_agent import CartpoleAgent
 from policy_gradient import PolicyGradient
-from utils import calc_discount_norm_reward
 
 OBS_DIM = 4
 ACT_DIM = 2
-GAMMA = 0.99
 LEARNING_RATE = 1e-3
 
 
@@ -47,6 +45,12 @@ def run_episode(env, agent, train_or_test='train'):
     return obs_list, action_list, reward_list
 
 
+def calc_reward_to_go(reward_list):
+    for i in range(len(reward_list) - 2, -1, -1):
+        reward_list[i] += reward_list[i + 1]
+    return np.array(reward_list)
+
+
 def main():
     env = gym.make('CartPole-v0')
     model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
@@ -62,7 +66,7 @@
 
         batch_obs = np.array(obs_list)
         batch_action = np.array(action_list)
-        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
+        batch_reward = calc_reward_to_go(reward_list)
 
         agent.learn(batch_obs, batch_action, batch_reward)
         if (i + 1) % 100 == 0:
diff --git a/examples/EagerMode/QuickStart/utils.py b/examples/EagerMode/QuickStart/utils.py
deleted file mode 100644
index dd4b9e4..0000000
--- a/examples/EagerMode/QuickStart/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-def calc_discount_norm_reward(reward_list, gamma):
-    '''
-    Calculate the discounted reward list according to the discount factor gamma, and normalize it.
-    Args:
-        reward_list(list): a list containing the rewards along the trajectory.
-        gamma(float): the discounted factor for accumulation reward computation.
-    Returns:
-        a list containing the discounted reward
-    '''
-    discount_norm_reward = np.zeros_like(reward_list)
-
-    discount_cumulative_reward = 0
-    for i in reversed(range(0, len(reward_list))):
-        discount_cumulative_reward = (
-            gamma * discount_cumulative_reward + reward_list[i])
-        discount_norm_reward[i] = discount_cumulative_reward
-    discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
-    discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
-    return discount_norm_reward
diff --git a/examples/QuickStart/train.py b/examples/QuickStart/train.py
index 27de75a..fb4e66d 100644
--- a/examples/QuickStart/train.py
+++ b/examples/QuickStart/train.py
@@ -19,11 +19,9 @@ import os.path
 from cartpole_agent import CartpoleAgent
 from cartpole_model import CartpoleModel
 from parl.utils import logger
-from utils import calc_discount_norm_reward
 
 OBS_DIM = 4
 ACT_DIM = 2
-GAMMA = 0.99
 LEARNING_RATE = 1e-3
 
 
@@ -46,6 +44,12 @@ def run_episode(env, agent, train_or_test='train'):
     return obs_list, action_list, reward_list
 
 
+def calc_reward_to_go(reward_list):
+    for i in range(len(reward_list) - 2, -1, -1):
+        reward_list[i] += reward_list[i + 1]
+    return np.array(reward_list)
+
+
 def main():
     env = gym.make("CartPole-v0")
     model = CartpoleModel(act_dim=ACT_DIM)
@@ -64,7 +68,7 @@
 
         batch_obs = np.array(obs_list)
         batch_action = np.array(action_list)
-        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
+        batch_reward = calc_reward_to_go(reward_list)
 
         agent.learn(batch_obs, batch_action, batch_reward)
         if (i + 1) % 100 == 0:
diff --git a/examples/QuickStart/utils.py b/examples/QuickStart/utils.py
deleted file mode 100644
index dd4b9e4..0000000
--- a/examples/QuickStart/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-def calc_discount_norm_reward(reward_list, gamma):
-    '''
-    Calculate the discounted reward list according to the discount factor gamma, and normalize it.
-    Args:
-        reward_list(list): a list containing the rewards along the trajectory.
-        gamma(float): the discounted factor for accumulation reward computation.
-    Returns:
-        a list containing the discounted reward
-    '''
-    discount_norm_reward = np.zeros_like(reward_list)
-
-    discount_cumulative_reward = 0
-    for i in reversed(range(0, len(reward_list))):
-        discount_cumulative_reward = (
-            gamma * discount_cumulative_reward + reward_list[i])
-        discount_norm_reward[i] = discount_cumulative_reward
-    discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
-    discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
-    return discount_norm_reward
-- 
GitLab
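
For readers who want to see the effect of this change outside the patch, the standalone sketch below contrasts the new calc_reward_to_go helper (copied from the hunks above) with the behaviour of the calc_discount_norm_reward function deleted from utils.py. The sample reward list and the __main__ driver are illustrative only and are not part of the PARL examples; CartPole-v0 simply returns +1 per surviving step, which is why constant rewards are used.

# Standalone illustration (not part of the patch); values are hypothetical.
import numpy as np


def calc_reward_to_go(reward_list):
    # New helper from the patch: undiscounted suffix sums ("reward to go").
    # Note that it mutates reward_list in place before converting to an array.
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += reward_list[i + 1]
    return np.array(reward_list)


def calc_discount_norm_reward(reward_list, gamma):
    # Behaviour of the deleted utils.py helper: per-step discounted return,
    # then normalization to zero mean and unit standard deviation.
    out = np.zeros(len(reward_list), dtype='float64')
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = gamma * running + reward_list[i]
        out[i] = running
    return (out - np.mean(out)) / np.std(out)


if __name__ == '__main__':
    rewards = [1.0] * 5  # five steps of CartPole-style +1 reward
    print(calc_reward_to_go(list(rewards)))                # [5. 4. 3. 2. 1.]
    print(calc_discount_norm_reward(list(rewards), 0.99))  # normalized discounted returns

Passing copies (list(rewards)) keeps the two calls independent, since calc_reward_to_go modifies its argument; the training loops in the patch do not reuse reward_list after the call, so the in-place update is harmless there.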