diff --git a/README.md b/README.md
index fbdab33228e3468752f36f016856fddd2ae3290e..0d25da350b57a6926d405b512f041e6679f3e318 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ class AtariModel(parl.Model):
         return Q
 """
 three steps to build an agent
-    1. define a forward model which is critic_model is this example
+    1. define a forward model which is critic_model in this example
     2. a. to build a DQN algorithm, just pass the critic_model to `DQN`
        b. to build a DDQN algorithm, just replace DQN in following line with DDQN
     3. define the I/O part in AtariAgent so that it could update the algorithm based on the interactive data
diff --git a/examples/DDPG/.benchmark/DDPG_Hopper-v2.png b/examples/DDPG/.benchmark/DDPG_Hopper-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..28a2f93d4d3e93d581ce9bf2cfec4f856186fa79
Binary files /dev/null and b/examples/DDPG/.benchmark/DDPG_Hopper-v2.png differ
diff --git a/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png b/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..91d72ad1e438e053b6b28e941a514c1148b4e867
Binary files /dev/null and b/examples/DDPG/.benchmark/DDPG_Humanoid-v2.png differ
diff --git a/examples/DDPG/README.md b/examples/DDPG/README.md
index d62d062411a469e44f0ab55ca6d3a51d19f64aa2..f40d9858d4dcfcdb1ca07fe47fd2c3b57a3654d4 100644
--- a/examples/DDPG/README.md
+++ b/examples/DDPG/README.md
@@ -9,7 +9,8 @@
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
 
 ### Benchmark result
-DDPG_HalfCheetah-v2
+DDPG_HalfCheetah-v2 DDPG_Humanoid-v2
+DDPG_Hopper-v2
 
 ## How to use
 ### Dependencies:
diff --git a/examples/DQN/.benchmark/DQN_SpaceInvaders.png b/examples/DQN/.benchmark/DQN_SpaceInvaders.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc6888dd1f0a552931c07894bd2a766f0a6cfd12
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_SpaceInvaders.png differ
diff --git a/examples/DQN/README.md b/examples/DQN/README.md
index 9bba38ce7b63354b80b0fed2cd75a1f9ca82ab7c..dcf7cd40dd5a213fde9ed9a21602567782dafe8d 100644
--- a/examples/DQN/README.md
+++ b/examples/DQN/README.md
@@ -11,7 +11,7 @@
 Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games.
 
 DQN_Pong DQN_Breakout
-DQN_BeamRider
+DQN_BeamRider DQN_SpaceInvaders
 
 ## How to use
 ### Dependencies:
diff --git a/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png
index 7d0c031367fbb25e14f6ca0eb67bd4606a55ff67..773cc18a3d53875d91316019b84b9891ffbd28b4 100644
Binary files a/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png and b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png differ
diff --git a/examples/PPO/.benchmark/PPO_Hopper-v2.png b/examples/PPO/.benchmark/PPO_Hopper-v2.png
new file mode 100644
index 0000000000000000000000000000000000000000..8ad66b7a14c66027cb54cfa60f73869215040623
Binary files /dev/null and b/examples/PPO/.benchmark/PPO_Hopper-v2.png differ
diff --git a/examples/PPO/README.md b/examples/PPO/README.md
index 22c269017cf29cd94dd9a770cec9e228385ae0a8..6bc01d3f28c996d2b90d1b25b1a5212ee50bc0c4 100644
--- a/examples/PPO/README.md
+++ b/examples/PPO/README.md
@@ -12,7 +12,7 @@
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
 
 ### Benchmark result
-PPO_HalfCheetah-v2
+PPO_HalfCheetah-v2 PPO_Hopper-v2
 
 ## How to use
 ### Dependencies:
diff --git a/examples/PPO/train.py b/examples/PPO/train.py
index 7508f6fe52900d0aa5525c47b7209c987f4c21f0..492038b0b17f22b7af844eaa29f8831ff70b7e77 100755
--- a/examples/PPO/train.py
+++ b/examples/PPO/train.py
@@ -84,18 +84,50 @@ def run_evaluate_episode(env, agent, scaler):
 
 
 def collect_trajectories(env, agent, scaler, episodes):
-    all_obs, all_actions, all_rewards, all_unscaled_obs = [], [], [], []
+    trajectories, all_unscaled_obs = [], []
     for e in range(episodes):
         obs, actions, rewards, unscaled_obs = run_train_episode(
            env, agent, scaler)
-        all_obs.append(obs)
-        all_actions.append(actions)
-        all_rewards.append(rewards)
+        trajectories.append({
+            'obs': obs,
+            'actions': actions,
+            'rewards': rewards,
+        })
         all_unscaled_obs.append(unscaled_obs)
-    scaler.update(np.concatenate(all_unscaled_obs)
-                  )  # update running statistics for scaling observations
-    return np.concatenate(all_obs), np.concatenate(
-        all_actions), np.concatenate(all_rewards)
+    # update running statistics for scaling observations
+    scaler.update(np.concatenate(all_unscaled_obs))
+    return trajectories
+
+
+def build_train_data(trajectories, agent):
+    train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
+    for trajectory in trajectories:
+        pred_values = agent.value_predict(trajectory['obs'])
+
+        # scale rewards
+        scale_rewards = trajectory['rewards'] * (1 - args.gamma)
+
+        discount_sum_rewards = calc_discount_sum_rewards(
+            scale_rewards, args.gamma).astype('float32')
+
+        advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
+
+        # normalize advantages
+        advantages = (advantages - advantages.mean()) / (
+            advantages.std() + 1e-6)
+        advantages = advantages.astype('float32')
+
+        train_obs.append(trajectory['obs'])
+        train_actions.append(trajectory['actions'])
+        train_advantages.append(advantages)
+        train_discount_sum_rewards.append(discount_sum_rewards)
+
+    train_obs = np.concatenate(train_obs)
+    train_actions = np.concatenate(train_actions)
+    train_advantages = np.concatenate(train_advantages)
+    train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
+
+    return train_obs, train_actions, train_advantages, train_discount_sum_rewards
 
 
 def main():
@@ -123,33 +155,22 @@ def main():
     test_flag = 0
     total_steps = 0
     while total_steps < args.train_total_steps:
-        obs, actions, rewards = collect_trajectories(
+        trajectories = collect_trajectories(
             env, agent, scaler, episodes=args.episodes_per_batch)
-        total_steps += obs.shape[0]
-
-        pred_values = agent.value_predict(obs)
+        total_steps += sum([t['obs'].shape[0] for t in trajectories])
+        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
 
-        # scale rewards
-        scale_rewards = rewards * (1 - args.gamma)
-
-        discount_sum_rewards = calc_discount_sum_rewards(
-            scale_rewards, args.gamma)
-        discount_sum_rewards = discount_sum_rewards.astype('float32')
-
-        advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
-        # normalize advantages
-        advantages = (advantages - advantages.mean()) / (
-            advantages.std() + 1e-6)
-        advantages = advantages.astype('float32')
+        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
+            trajectories, agent)
 
-        policy_loss, kl = agent.policy_learn(obs, actions, advantages)
-        value_loss = agent.value_learn(obs, discount_sum_rewards)
+        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
+                                             train_advantages)
+        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
         logger.info(
             'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
-            .format(total_steps,
-                    np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
-                    value_loss))
+            .format(total_steps, total_train_rewards / args.episodes_per_batch,
+                    policy_loss, kl, value_loss))
 
         if total_steps // args.test_every_steps >= test_flag:
             while total_steps // args.test_every_steps >= test_flag:
                 test_flag += 1
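
Note on the `train.py` refactor above: `collect_trajectories` now returns one dict per episode, and the new `build_train_data` applies the reward scaling, discounted-return, and GAE computations to each trajectory before concatenating the results for `policy_learn` / `value_learn`; the old code applied them to the already-concatenated batch, so discounted sums and TD residuals could bleed across episode boundaries. The sketch below is a minimal, standalone illustration of that per-trajectory pipeline. `calc_discount_sum_rewards` and `calc_gae` here are simplified stand-ins for the helpers `train.py` uses (not the project's implementations), and the reward/value numbers are made-up toy data.

```python
import numpy as np


def calc_discount_sum_rewards(rewards, gamma):
    # discounted cumulative sum: out[t] = rewards[t] + gamma * out[t + 1]
    out = np.zeros_like(rewards, dtype='float32')
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out


def calc_gae(rewards, values, gamma, lam):
    # Generalized Advantage Estimation for one finished trajectory
    # (the value after the terminal step is taken to be 0)
    next_values = np.append(values[1:], 0.0)
    deltas = rewards + gamma * next_values - values  # TD residuals
    return calc_discount_sum_rewards(deltas, gamma * lam)


if __name__ == '__main__':
    gamma, lam = 0.995, 0.98
    # toy per-step rewards and critic predictions for a single episode
    rewards = np.array([1.0, 0.0, 2.0, 1.0], dtype='float32')
    pred_values = np.array([0.5, 0.4, 0.9, 0.3], dtype='float32')

    # same per-trajectory steps as build_train_data
    scale_rewards = rewards * (1 - gamma)
    discount_sum_rewards = calc_discount_sum_rewards(scale_rewards, gamma)
    advantages = calc_gae(scale_rewards, pred_values, gamma, lam)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    print(discount_sum_rewards)  # targets for the value update
    print(advantages)            # normalized weights for the policy update
```

Because each episode is processed independently and only then concatenated, the returned `train_*` arrays stay aligned row-for-row with the concatenated observations and actions, which is what the updated `policy_learn` / `value_learn` calls in `main()` rely on.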