diff --git a/examples/DDPG/README.md b/examples/DDPG/README.md
index f11a30aa852f0630acabf873de518f7b9b64f15f..d62d062411a469e44f0ab55ca6d3a51d19f64aa2 100644
--- a/examples/DDPG/README.md
+++ b/examples/DDPG/README.md
@@ -8,12 +8,12 @@ Based on PARL, the DDPG model of deep reinforcement learning is reproduced, and
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
 
 ### Benchmark result
-- HalfCheetah-v2
-
+
+DDPG_HalfCheetah-v2
 
 ## How to use
 ### Dependencies:
-+ python2.7 or python3.5+
++ python3.5+
 + [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
 + [parl](https://github.com/PaddlePaddle/PARL)
 + gym
diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py
index c8f7ce3f92b47c639942d3099cfec32535c6043f..ef02304782b5c8ef27ba381e87f54422e2ff6d9d 100644
--- a/examples/DDPG/train.py
+++ b/examples/DDPG/train.py
@@ -37,7 +37,9 @@ ENV_SEED = 1
 def run_train_episode(env, agent, rpm):
     obs = env.reset()
     total_reward = 0
+    steps = 0
     while True:
+        steps += 1
         batch_obs = np.expand_dims(obs, axis=0)
         action = agent.predict(batch_obs.astype('float32'))
         action = np.squeeze(action)
@@ -62,7 +64,7 @@ def run_train_episode(env, agent, rpm):
         if done:
             break
-    return total_reward
+    return total_reward, steps
 
 
 def run_evaluate_episode(env, agent):
@@ -105,18 +107,36 @@ def main():
     rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
 
-    for i in range(MAX_EPISODES):
-        train_reward = run_train_episode(env, agent, rpm)
-        logger.info('Episode: {} Reward: {}'.format(i, train_reward))
-        if (i + 1) % TEST_EVERY_EPISODES == 0:
+    test_flag = 0
+    total_steps = 0
+    while total_steps < args.train_total_steps:
+        train_reward, steps = run_train_episode(env, agent, rpm)
+        total_steps += steps
+        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
+
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             evaluate_reward = run_evaluate_episode(env, agent)
-            logger.info('Episode {}, Evaluate reward: {}'.format(
-                i, evaluate_reward))
+            logger.info('Steps {}, Evaluate reward: {}'.format(
+                total_steps, evaluate_reward))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--env', help='Mujoco environment name', default='HalfCheetah-v2')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e7),
+        help='maximum training steps')
+    parser.add_argument(
+        '--test_every_steps',
+        type=int,
+        default=int(1e4),
+        help='the step interval between two consecutive evaluations')
+    args = parser.parse_args()
+    main()
diff --git a/examples/DQN/.benchmark/DQN_BeamRider.png b/examples/DQN/.benchmark/DQN_BeamRider.png
new file mode 100644
index 0000000000000000000000000000000000000000..934d7aee81d45ee872fec27b60cdfacf7b366ed6
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_BeamRider.png differ
diff --git a/examples/DQN/.benchmark/DQN_Breakout.png b/examples/DQN/.benchmark/DQN_Breakout.png
new file mode 100644
index 0000000000000000000000000000000000000000..98f727f538caa64c94b103d7b4af17bd0afc6487
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_Breakout.png differ
diff --git a/examples/DQN/README.md b/examples/DQN/README.md
index 5de3d07629b6b10deb82c62c99381ece89b4f635..9bba38ce7b63354b80b0fed2cd75a1f9ca82ab7c 100644
--- a/examples/DQN/README.md
+++ b/examples/DQN/README.md
@@ -8,8 +8,10 @@ Based on PARL, the DQN model of deep reinforcement learning is reproduced, and t
 Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari game.
 
 ### Benchmark result
-- Pong
-
+
+DQN_Pong DQN_Breakout
+
+DQN_BeamRider
 
 ## How to use
 ### Dependencies:
diff --git a/examples/DQN/train.py b/examples/DQN/train.py
index 06cf8eb7ca76003dfb2fe9190551cc428956f6eb..669fac77b1973cc54ac13ea21fbd666efde6ee65 100644
--- a/examples/DQN/train.py
+++ b/examples/DQN/train.py
@@ -41,9 +41,9 @@ def run_train_episode(env, agent, rpm):
     total_reward = 0
     all_cost = []
     state = env.reset()
-    step = 0
+    steps = 0
     while True:
-        step += 1
+        steps += 1
         context = rpm.recent_state()
         context.append(state)
         context = np.stack(context, axis=0)
@@ -52,7 +52,7 @@ def run_train_episode(env, agent, rpm):
         rpm.append(Experience(state, action, reward, isOver))
         # start training
         if rpm.size() > MEMORY_WARMUP_SIZE:
-            if step % UPDATE_FREQ == 0:
+            if steps % UPDATE_FREQ == 0:
                 batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                     args.batch_size)
                 batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
@@ -67,7 +67,7 @@ def run_train_episode(env, agent, rpm):
     if all_cost:
         logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
             total_reward, np.mean(all_cost)))
-    return total_reward, step
+    return total_reward, steps
 
 
 def run_evaluate_episode(env, agent):
@@ -104,34 +104,33 @@ def main():
 
     with tqdm(total=MEMORY_WARMUP_SIZE) as pbar:
         while rpm.size() < MEMORY_WARMUP_SIZE:
-            total_reward, step = run_train_episode(env, agent, rpm)
-            pbar.update(step)
+            total_reward, steps = run_train_episode(env, agent, rpm)
+            pbar.update(steps)
 
     # train
     test_flag = 0
-    pbar = tqdm(total=1e8)
+    pbar = tqdm(total=args.train_total_steps)
     recent_100_reward = []
-    total_step = 0
+    total_steps = 0
     max_reward = None
-    while True:
+    while total_steps < args.train_total_steps:
         # start epoch
-        total_reward, step = run_train_episode(env, agent, rpm)
-        total_step += step
+        total_reward, steps = run_train_episode(env, agent, rpm)
+        total_steps += steps
         pbar.set_description('[train]exploration:{}'.format(agent.exploration))
-        pbar.update(step)
+        pbar.update(steps)
 
-        if total_step // args.test_every_steps == test_flag:
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             pbar.write("testing")
             eval_rewards = []
             for _ in tqdm(range(3), desc='eval agent'):
                 eval_reward = run_evaluate_episode(test_env, agent)
                 eval_rewards.append(eval_reward)
-            test_flag += 1
             logger.info(
                 "eval_agent done, (steps, eval_reward): ({}, {})".format(
-                    total_step, np.mean(eval_rewards)))
-            if total_step > 1e8:
-                break
+                    total_steps, np.mean(eval_rewards)))
 
     pbar.close()
@@ -141,10 +140,17 @@ if __name__ == '__main__':
     parser.add_argument('--rom', help='atari rom', required=True)
     parser.add_argument(
         '--batch_size', type=int, default=64, help='batch size for training')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e8),
+        help='maximum training steps')
     parser.add_argument(
         '--test_every_steps',
         type=int,
         default=100000,
-        help='every steps number to run test')
+        help='the step interval between two consecutive evaluations')
+    args = parser.parse_args()
+    main()
diff --git a/examples/PPO/README.md b/examples/PPO/README.md
index c78a2169135a4d0104355c10835cd8a1fe47b970..22c269017cf29cd94dd9a770cec9e228385ae0a8 100644
--- a/examples/PPO/README.md
+++ b/examples/PPO/README.md
@@ -11,8 +11,8 @@ Include following approach:
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
 
 ### Benchmark result
-- HalfCheetah-v2
-
+
+PPO_HalfCheetah-v2
 
 ## How to use
 ### Dependencies:
diff --git a/examples/PPO/train.py b/examples/PPO/train.py
index 4b9ead54097d8bbc524fabdd79ac945d6d6136d2..7508f6fe52900d0aa5525c47b7209c987f4c21f0 100755
--- a/examples/PPO/train.py
+++ b/examples/PPO/train.py
@@ -120,11 +120,12 @@ def main():
     # run a few episodes to initialize scaler
     collect_trajectories(env, agent, scaler, episodes=5)
 
-    episode = 0
-    while episode < args.num_episodes:
+    test_flag = 0
+    total_steps = 0
+    while total_steps < args.train_total_steps:
         obs, actions, rewards = collect_trajectories(
             env, agent, scaler, episodes=args.episodes_per_batch)
-        episode += args.episodes_per_batch
+        total_steps += obs.shape[0]
 
         pred_values = agent.value_predict(obs)
 
@@ -145,14 +146,16 @@
         value_loss = agent.value_learn(obs, discount_sum_rewards)
 
         logger.info(
-            'Episode {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
-            .format(episode,
+            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
+            .format(total_steps,
                     np.sum(rewards) / args.episodes_per_batch, policy_loss,
                     kl, value_loss))
-        if episode % (args.episodes_per_batch * 5) == 0:
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             eval_reward = run_evaluate_episode(env, agent, scaler)
-            logger.info('Episode {}, Evaluate reward: {}'.format(
-                episode, eval_reward))
+            logger.info('Steps {}, Evaluate reward: {}'.format(
+                total_steps, eval_reward))
 
 
 if __name__ == "__main__":
@@ -162,11 +165,6 @@ if __name__ == "__main__":
         type=str,
         help='Mujoco environment name',
         default='HalfCheetah-v2')
-    parser.add_argument(
-        '--num_episodes',
-        type=int,
-        help='Number of episodes to run',
-        default=10000)
     parser.add_argument(
         '--gamma', type=float, help='Discount factor', default=0.995)
     parser.add_argument(
@@ -186,6 +184,16 @@
         type=str,
         help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
         default='CLIP')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e7),
+        help='maximum training steps')
+    parser.add_argument(
+        '--test_every_steps',
+        type=int,
+        default=int(1e4),
+        help='the step interval between two consecutive evaluations')
 
     args = parser.parse_args()
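
Note on the change: all three `train.py` files switch from counting episodes to counting environment steps, and they schedule evaluation once per `test_every_steps` interval using an integer `test_flag` counter, so a single long episode that crosses several interval boundaries still triggers only one evaluation. Below is a minimal, self-contained sketch of that scheduling pattern; `run_train_episode` and `run_evaluate_episode` here are hypothetical stand-ins, not the functions from the patch.

```python
# Minimal sketch of the step-based evaluation scheduling used in this patch.
# The two episode callables are placeholders; only the bookkeeping is shown.
import random


def run_train_episode():
    """Placeholder: returns (episode_reward, episode_steps)."""
    return random.random(), random.randint(200, 2000)


def run_evaluate_episode():
    """Placeholder: returns an evaluation reward."""
    return random.random()


def train(train_total_steps=int(1e5), test_every_steps=int(1e4)):
    test_flag = 0    # number of evaluation intervals already handled
    total_steps = 0  # cumulative environment steps over all episodes

    while total_steps < train_total_steps:
        train_reward, steps = run_train_episode()
        total_steps += steps

        # Evaluate once per `test_every_steps` interval. If one episode jumps
        # across several interval boundaries, advance `test_flag` past all of
        # them first so only a single evaluation runs for that episode.
        if total_steps // test_every_steps >= test_flag:
            while total_steps // test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode()
            print('Steps {}, Evaluate reward: {}'.format(total_steps, eval_reward))


if __name__ == '__main__':
    train()
```

Measuring progress in steps rather than episodes keeps the evaluation cadence comparable across environments whose episode lengths differ widely, which is presumably why the episode-based `--num_episodes` / `TEST_EVERY_EPISODES` logic was replaced.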