diff --git a/README.md b/README.md
index fbdab33228e3468752f36f016856fddd2ae3290e..0d25da350b57a6926d405b512f041e6679f3e318 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ class AtariModel(parl.Model):
return Q
"""
three steps to build an agent
- 1. define a forward model which is critic_model is this example
+ 1. define a forward model which is critic_model in this example
2. a. to build a DQN algorithm, just pass the critic_model to `DQN`
b. to build a DDQN algorithm, just replace DQN in the following line with DDQN
3. define the I/O part in AtariAgent so that it can update the algorithm with the interactive data
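
For orientation, a minimal sketch of how the three steps above wire together (the constructor arguments and `act_dim` value are assumptions for illustration, not PARL's exact signatures):

```python
# Hypothetical wiring of the three steps described above.
critic_model = AtariModel(act_dim=4)   # step 1: the forward model
algorithm = DQN(critic_model)          # step 2: swap in DDQN(critic_model) for Double DQN
agent = AtariAgent(algorithm)          # step 3: the agent handles the I/O and feeds the
                                       # interactive data back into the algorithm
```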
diff --git a/examples/DDPG/.benchmark/DDPG_Hopper-v2.png b/examples/DDPG/.benchmark/DDPG_Hopper-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..28a2f93d4d3e93d581ce9bf2cfec4f856186fa79
Binary files /dev/null and b/examples/DDPG/.benchmark/DDPG_Hopper-v2.png differ
diff --git a/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png b/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..91d72ad1e438e053b6b28e941a514c1148b4e867
Binary files /dev/null and b/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png differ
diff --git a/examples/DDPG/README.md b/examples/DDPG/README.md
index d62d062411a469e44f0ab55ca6d3a51d19f64aa2..f40d9858d4dcfcdb1ca07fe47fd2c3b57a3654d4 100644
--- a/examples/DDPG/README.md
+++ b/examples/DDPG/README.md
@@ -9,7 +9,8 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
### Benchmark result
-
+
+
## How to use
### Dependencies:
diff --git a/examples/DQN/.benchmark/DQN_SpaceInvaders.png b/examples/DQN/.benchmark/DQN_SpaceInvaders.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc6888dd1f0a552931c07894bd2a766f0a6cfd12
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_SpaceInvaders.png differ
diff --git a/examples/DQN/README.md b/examples/DQN/README.md
index 9bba38ce7b63354b80b0fed2cd75a1f9ca82ab7c..dcf7cd40dd5a213fde9ed9a21602567782dafe8d 100644
--- a/examples/DQN/README.md
+++ b/examples/DQN/README.md
@@ -11,7 +11,7 @@ Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari g
-
+
## How to use
### Dependencies:
diff --git a/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png
index 7d0c031367fbb25e14f6ca0eb67bd4606a55ff67..773cc18a3d53875d91316019b84b9891ffbd28b4 100644
Binary files a/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png and b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png differ
diff --git a/examples/PPO/.benchmark/PPO_Hopper-v2.png b/examples/PPO/.benchmark/PPO_Hopper-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..8ad66b7a14c66027cb54cfa60f73869215040623
Binary files /dev/null and b/examples/PPO/.benchmark/PPO_Hopper-v2.png differ
diff --git a/examples/PPO/README.md b/examples/PPO/README.md
index 22c269017cf29cd94dd9a770cec9e228385ae0a8..6bc01d3f28c996d2b90d1b25b1a5212ee50bc0c4 100644
--- a/examples/PPO/README.md
+++ b/examples/PPO/README.md
@@ -12,7 +12,7 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
### Benchmark result
-
+
## How to use
### Dependencies:
diff --git a/examples/PPO/train.py b/examples/PPO/train.py
index 7508f6fe52900d0aa5525c47b7209c987f4c21f0..492038b0b17f22b7af844eaa29f8831ff70b7e77 100755
--- a/examples/PPO/train.py
+++ b/examples/PPO/train.py
@@ -84,18 +84,50 @@ def run_evaluate_episode(env, agent, scaler):
def collect_trajectories(env, agent, scaler, episodes):
- all_obs, all_actions, all_rewards, all_unscaled_obs = [], [], [], []
+ trajectories, all_unscaled_obs = [], []
for e in range(episodes):
obs, actions, rewards, unscaled_obs = run_train_episode(
env, agent, scaler)
- all_obs.append(obs)
- all_actions.append(actions)
- all_rewards.append(rewards)
+ trajectories.append({
+ 'obs': obs,
+ 'actions': actions,
+ 'rewards': rewards,
+ })
all_unscaled_obs.append(unscaled_obs)
- scaler.update(np.concatenate(all_unscaled_obs)
- ) # update running statistics for scaling observations
- return np.concatenate(all_obs), np.concatenate(
- all_actions), np.concatenate(all_rewards)
+ # update running statistics for scaling observations
+ scaler.update(np.concatenate(all_unscaled_obs))
+ return trajectories
+
+
+def build_train_data(trajectories, agent):
+ train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
+ for trajectory in trajectories:
+ pred_values = agent.value_predict(trajectory['obs'])
+
+ # scale rewards
+ scale_rewards = trajectory['rewards'] * (1 - args.gamma)
+
+ discount_sum_rewards = calc_discount_sum_rewards(
+ scale_rewards, args.gamma).astype('float32')
+
+ advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
+
+ # normalize advantages
+ advantages = (advantages - advantages.mean()) / (
+ advantages.std() + 1e-6)
+ advantages = advantages.astype('float32')
+
+ train_obs.append(trajectory['obs'])
+ train_actions.append(trajectory['actions'])
+ train_advantages.append(advantages)
+ train_discount_sum_rewards.append(discount_sum_rewards)
+
+ train_obs = np.concatenate(train_obs)
+ train_actions = np.concatenate(train_actions)
+ train_advantages = np.concatenate(train_advantages)
+ train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
+
+ return train_obs, train_actions, train_advantages, train_discount_sum_rewards
def main():
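
For context on the refactor above, here is a self-contained sketch of what the `calc_discount_sum_rewards` and `calc_gae` helpers used by `build_train_data` are assumed to compute (standard discounted returns and Generalized Advantage Estimation; the example's actual helpers may differ in detail):

```python
import numpy as np

def calc_discount_sum_rewards(rewards, gamma):
    # Discounted return per step, computed backwards: G_t = r_t + gamma * G_{t+1}
    returns = np.zeros(len(rewards), dtype='float64')
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def calc_gae(rewards, values, gamma, lam):
    # GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);  A_t = sum_k (gamma*lam)^k * delta_{t+k}
    next_values = np.append(values[1:], 0.0)  # bootstrap value of the terminal state taken as 0
    deltas = rewards + gamma * next_values - values
    return calc_discount_sum_rewards(deltas, gamma * lam)
```

Running these per trajectory, as `build_train_data` does, keeps the bootstrapping and discounting from crossing episode boundaries, which would happen if they were applied to arrays concatenated across episodes.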
@@ -123,33 +155,22 @@ def main():
test_flag = 0
total_steps = 0
while total_steps < args.train_total_steps:
- obs, actions, rewards = collect_trajectories(
+ trajectories = collect_trajectories(
env, agent, scaler, episodes=args.episodes_per_batch)
- total_steps += obs.shape[0]
-
- pred_values = agent.value_predict(obs)
+ total_steps += sum([t['obs'].shape[0] for t in trajectories])
+ total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
- # scale rewards
- scale_rewards = rewards * (1 - args.gamma)
-
- discount_sum_rewards = calc_discount_sum_rewards(
- scale_rewards, args.gamma)
- discount_sum_rewards = discount_sum_rewards.astype('float32')
-
- advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
- # normalize advantages
- advantages = (advantages - advantages.mean()) / (
- advantages.std() + 1e-6)
- advantages = advantages.astype('float32')
+ train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
+ trajectories, agent)
- policy_loss, kl = agent.policy_learn(obs, actions, advantages)
- value_loss = agent.value_learn(obs, discount_sum_rewards)
+ policy_loss, kl = agent.policy_learn(train_obs, train_actions,
+ train_advantages)
+ value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
logger.info(
'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
- .format(total_steps,
- np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
- value_loss))
+ .format(total_steps, total_train_rewards / args.episodes_per_batch,
+ policy_loss, kl, value_loss))
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1