Commit 1475ca77 authored by LI Yunxiang, committed by Bo Zhou

Update reward calculation in QuickStart (#182)

* Update reward calculation in QuickStart

* update

* yapf
Parent dbb5931a
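This commit replaces the discounted, normalized return from the QuickStart helper module (calc_discount_norm_reward) with an undiscounted reward-to-go computed directly in train.py. The sketch below puts the two computations side by side on a short CartPole-style reward trace; it is illustrative only, and the variable names outside the two functions are not part of the diff:

```python
import numpy as np

def calc_discount_norm_reward(reward_list, gamma):
    # old helper: discounted return per step, then whitened to zero mean / unit std
    returns = np.zeros_like(reward_list, dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = gamma * running + reward_list[i]
        returns[i] = running
    return (returns - returns.mean()) / returns.std()

def calc_reward_to_go(reward_list):
    # new helper: undiscounted reward-to-go, G_t = r_t + G_{t+1} (mutates its input)
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += reward_list[i + 1]
    return np.array(reward_list)

rewards = [1.0, 1.0, 1.0, 1.0]  # CartPole gives +1 for each step the pole stays up
print(calc_discount_norm_reward(list(rewards), 0.99))  # ≈ [ 1.34  0.45 -0.44 -1.35]
print(calc_reward_to_go(list(rewards)))                # [4. 3. 2. 1.]
```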
@@ -20,11 +20,9 @@ from parl.utils import logger
from cartpole_model import CartpoleModel
from cartpole_agent import CartpoleAgent
from policy_gradient import PolicyGradient
from utils import calc_discount_norm_reward
OBS_DIM = 4
ACT_DIM = 2
GAMMA = 0.99
LEARNING_RATE = 1e-3
@@ -47,6 +45,12 @@ def run_episode(env, agent, train_or_test='train'):
return obs_list, action_list, reward_list
def calc_reward_to_go(reward_list):
for i in range(len(reward_list) - 2, -1, -1):
reward_list[i] += reward_list[i + 1]
return np.array(reward_list)
def main():
env = gym.make('CartPole-v0')
model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
@@ -62,7 +66,7 @@ def main():
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
batch_reward = calc_reward_to_go(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if (i + 1) % 100 == 0:
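Note that calc_reward_to_go as added above modifies reward_list in place and applies no discount; GAMMA = 0.99 stays defined in train.py but is no longer consumed. A non-mutating variant with an optional discount factor might look like the sketch below (a hypothetical helper, not part of this commit):

```python
import numpy as np

def reward_to_go(reward_list, gamma=1.0):
    """Per-step return G_t = r_t + gamma * G_{t+1}, computed without mutating the input."""
    returns = np.array(reward_list, dtype=np.float64)
    for i in range(len(returns) - 2, -1, -1):
        returns[i] += gamma * returns[i + 1]
    return returns

print(reward_to_go([1.0, 1.0, 1.0]))        # [3. 2. 1.]
print(reward_to_go([1.0, 1.0, 1.0], 0.99))  # [2.9701 1.99   1.    ]
```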
The helper module utils.py, deleted by this commit:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def calc_discount_norm_reward(reward_list, gamma):
'''
Calculate the discounted reward list according to the discount factor gamma, and normalize it.
Args:
reward_list(list): a list containing the rewards along the trajectory.
gamma(float): the discounted factor for accumulation reward computation.
Returns:
a list containing the discounted reward
'''
discount_norm_reward = np.zeros_like(reward_list)
discount_cumulative_reward = 0
for i in reversed(range(0, len(reward_list))):
discount_cumulative_reward = (
gamma * discount_cumulative_reward + reward_list[i])
discount_norm_reward[i] = discount_cumulative_reward
discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
return discount_norm_reward
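For reference, the backward loop in the deleted helper above is equivalent to a standard linear-filter formulation; a minimal sketch, not part of the commit, assuming SciPy is available:

```python
import numpy as np
from scipy.signal import lfilter

def discounted_returns(reward_list, gamma):
    # y[t] = r[t] + gamma * y[t+1], obtained by filtering the reversed sequence
    r = np.asarray(reward_list, dtype=np.float64)
    return lfilter([1.0], [1.0, -gamma], r[::-1])[::-1]

g = discounted_returns([1.0, 1.0, 1.0, 1.0], 0.99)
print(g)                         # ≈ [3.9404 2.9701 1.99   1.    ]
print((g - g.mean()) / g.std())  # whitened, as calc_discount_norm_reward returned
```

The diff that follows applies the same change to a second copy of the QuickStart training script; its deleted helper file is identical to the one above.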
@@ -19,11 +19,9 @@ import os.path
from cartpole_agent import CartpoleAgent
from cartpole_model import CartpoleModel
from parl.utils import logger
from utils import calc_discount_norm_reward
OBS_DIM = 4
ACT_DIM = 2
GAMMA = 0.99
LEARNING_RATE = 1e-3
@@ -46,6 +44,12 @@ def run_episode(env, agent, train_or_test='train'):
return obs_list, action_list, reward_list
def calc_reward_to_go(reward_list):
for i in range(len(reward_list) - 2, -1, -1):
reward_list[i] += reward_list[i + 1]
return np.array(reward_list)
def main():
env = gym.make("CartPole-v0")
model = CartpoleModel(act_dim=ACT_DIM)
@@ -64,7 +68,7 @@ def main():
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
batch_reward = calc_reward_to_go(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if (i + 1) % 100 == 0:
The second copy of utils.py, also deleted by this commit:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def calc_discount_norm_reward(reward_list, gamma):
'''
Calculate the discounted reward list according to the discount factor gamma, and normalize it.
Args:
reward_list(list): a list containing the rewards along the trajectory.
gamma(float): the discounted factor for accumulation reward computation.
Returns:
a list containing the discounted reward
'''
discount_norm_reward = np.zeros_like(reward_list)
discount_cumulative_reward = 0
for i in reversed(range(0, len(reward_list))):
discount_cumulative_reward = (
gamma * discount_cumulative_reward + reward_list[i])
discount_norm_reward[i] = discount_cumulative_reward
discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
return discount_norm_reward