Commit 6fdf4448 authored by Hongsheng Zeng, committed by Bo Zhou

Add more DQN benchmark results and unify train scripts (#46)

* add more DQN benchmark results; unify train scripts

* resize benchmark pictures

* resize benchmark pictures; refine comments of args

* change dependencies; mujoco-py only supports Python 3 now
Parent 7a7583ab
@@ -8,12 +8,12 @@ Based on PARL, the DDPG model of deep reinforcement learning is reproduced, and
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
 ### Benchmark result
-- HalfCheetah-v2
-<img src=".benchmark/DDPG_HalfCheetah-v2.png"/>
+<img src=".benchmark/DDPG_HalfCheetah-v2.png" width = "400" height ="300" alt="DDPG_HalfCheetah-v2"/>
 ## How to use
 ### Dependencies:
-+ python2.7 or python3.5+
++ python3.5+
 + [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
 + [parl](https://github.com/PaddlePaddle/PARL)
 + gym
...
@@ -37,7 +37,9 @@ ENV_SEED = 1
 def run_train_episode(env, agent, rpm):
     obs = env.reset()
     total_reward = 0
+    steps = 0
     while True:
+        steps += 1
         batch_obs = np.expand_dims(obs, axis=0)
         action = agent.predict(batch_obs.astype('float32'))
         action = np.squeeze(action)
@@ -62,7 +64,7 @@ def run_train_episode(env, agent, rpm):
         if done:
             break
-    return total_reward
+    return total_reward, steps
 
 
 def run_evaluate_episode(env, agent):
@@ -105,18 +107,36 @@ def main():
     rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
 
-    for i in range(MAX_EPISODES):
-        train_reward = run_train_episode(env, agent, rpm)
-        logger.info('Episode: {} Reward: {}'.format(i, train_reward))
-        if (i + 1) % TEST_EVERY_EPISODES == 0:
+    test_flag = 0
+    total_steps = 0
+    while total_steps < args.train_total_steps:
+        train_reward, steps = run_train_episode(env, agent, rpm)
+        total_steps += steps
+        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             evaluate_reward = run_evaluate_episode(env, agent)
-            logger.info('Episode {}, Evaluate reward: {}'.format(
-                i, evaluate_reward))
+            logger.info('Steps {}, Evaluate reward: {}'.format(
+                total_steps, evaluate_reward))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--env', help='Mujoco environment name', default='HalfCheetah-v2')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e7),
+        help='maximum training steps')
+    parser.add_argument(
+        '--test_every_steps',
+        type=int,
+        default=int(1e4),
+        help='the step interval between two consecutive evaluations')
     args = parser.parse_args()
 
     main()
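The DDPG loop above switches from a fixed episode count to a step budget: an evaluation fires whenever `total_steps` crosses the next multiple of `test_every_steps`, and the inner `while` advances `test_flag` past every boundary a long episode may have skipped, so each crossing triggers at most one evaluation. Below is a minimal, self-contained sketch of that scheduling pattern; `train_loop`, `run_episode`, and `evaluate` are toy stand-ins, not PARL's actual functions.

```python
import random


def train_loop(run_episode, evaluate, train_total_steps=10000, test_every_steps=1000):
    """Step-budgeted training with interval-based evaluation (sketch only)."""
    test_flag = 0
    total_steps = 0
    while total_steps < train_total_steps:
        train_reward, steps = run_episode()
        total_steps += steps
        print('Steps: {} Reward: {}'.format(total_steps, train_reward))
        if total_steps // test_every_steps >= test_flag:
            # Skip every interval boundary this episode crossed, so one long
            # episode triggers a single evaluation rather than several.
            while total_steps // test_every_steps >= test_flag:
                test_flag += 1
            print('Steps {}, Evaluate reward: {}'.format(total_steps, evaluate()))


if __name__ == '__main__':
    # Toy stand-ins: random episode length/reward and a random eval reward.
    train_loop(lambda: (random.random(), random.randint(100, 2500)),
               lambda: random.random())
```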
@@ -8,8 +8,10 @@ Based on PARL, the DQN model of deep reinforcement learning is reproduced, and t
 Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari game.
 ### Benchmark result
-- Pong
-<img src=".benchmark/DQN_Pong.png"/>
+<img src=".benchmark/DQN_Pong.png" width = "400" height ="300" alt="DQN_Pong" /> <img src=".benchmark/DQN_Breakout.png" width = "400" height ="300" alt="DQN_Breakout"/>
+<br>
+<img src=".benchmark/DQN_BeamRider.png" width = "400" height ="300" alt="DQN_BeamRider"/>
 ## How to use
 ### Dependencies:
...
@@ -41,9 +41,9 @@ def run_train_episode(env, agent, rpm):
     total_reward = 0
     all_cost = []
     state = env.reset()
-    step = 0
+    steps = 0
     while True:
-        step += 1
+        steps += 1
         context = rpm.recent_state()
         context.append(state)
         context = np.stack(context, axis=0)
@@ -52,7 +52,7 @@ def run_train_episode(env, agent, rpm):
         rpm.append(Experience(state, action, reward, isOver))
         # start training
         if rpm.size() > MEMORY_WARMUP_SIZE:
-            if step % UPDATE_FREQ == 0:
+            if steps % UPDATE_FREQ == 0:
                 batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                     args.batch_size)
                 batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
@@ -67,7 +67,7 @@ def run_train_episode(env, agent, rpm):
     if all_cost:
         logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
             total_reward, np.mean(all_cost)))
-    return total_reward, step
+    return total_reward, steps
 
 
 def run_evaluate_episode(env, agent):
@@ -104,34 +104,33 @@ def main():
     with tqdm(total=MEMORY_WARMUP_SIZE) as pbar:
         while rpm.size() < MEMORY_WARMUP_SIZE:
-            total_reward, step = run_train_episode(env, agent, rpm)
-            pbar.update(step)
+            total_reward, steps = run_train_episode(env, agent, rpm)
+            pbar.update(steps)
 
     # train
     test_flag = 0
-    pbar = tqdm(total=1e8)
+    pbar = tqdm(total=args.train_total_steps)
     recent_100_reward = []
-    total_step = 0
+    total_steps = 0
     max_reward = None
-    while True:
+    while total_steps < args.train_total_steps:
         # start epoch
-        total_reward, step = run_train_episode(env, agent, rpm)
-        total_step += step
+        total_reward, steps = run_train_episode(env, agent, rpm)
+        total_steps += steps
         pbar.set_description('[train]exploration:{}'.format(agent.exploration))
-        pbar.update(step)
+        pbar.update(steps)
 
-        if total_step // args.test_every_steps == test_flag:
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             pbar.write("testing")
             eval_rewards = []
             for _ in tqdm(range(3), desc='eval agent'):
                 eval_reward = run_evaluate_episode(test_env, agent)
                 eval_rewards.append(eval_reward)
-            test_flag += 1
             logger.info(
                 "eval_agent done, (steps, eval_reward): ({}, {})".format(
-                    total_step, np.mean(eval_rewards)))
-            if total_step > 1e8:
-                break
+                    total_steps, np.mean(eval_rewards)))
 
     pbar.close()
@@ -141,10 +140,17 @@ if __name__ == '__main__':
     parser.add_argument('--rom', help='atari rom', required=True)
     parser.add_argument(
         '--batch_size', type=int, default=64, help='batch size for training')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e8),
+        help='maximum training steps')
     parser.add_argument(
         '--test_every_steps',
         type=int,
         default=100000,
-        help='every steps number to run test')
+        help='the step interval between two consecutive evaluations')
     args = parser.parse_args()
 
     main()
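Before its main loop, the DQN script fills the replay memory up to `MEMORY_WARMUP_SIZE` while reporting progress in environment steps. The sketch below mirrors that warm-up pattern with a toy deque-based buffer and a fake episode generator; it is not PARL's `ReplayMemory` or `run_train_episode`, and it only assumes `tqdm` is installed (the script above already uses it).

```python
import random
from collections import deque

from tqdm import tqdm

MEMORY_WARMUP_SIZE = 5000
rpm = deque(maxlen=100000)  # toy replay memory, stands in for ReplayMemory


def run_warmup_episode(rpm):
    """Append one fake episode's transitions and return (reward, steps)."""
    steps = random.randint(50, 300)
    for _ in range(steps):
        rpm.append((None, 0, 0.0, False))  # placeholder (state, action, reward, isOver)
    return 0.0, steps


with tqdm(total=MEMORY_WARMUP_SIZE, desc='warmup') as pbar:
    while len(rpm) < MEMORY_WARMUP_SIZE:
        _, steps = run_warmup_episode(rpm)
        pbar.update(steps)  # progress advances by steps, matching the diff above
```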
@@ -11,8 +11,8 @@ Include following approach:
 Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
 ### Benchmark result
-- HalfCheetah-v2
-<img src=".benchmark/PPO_HalfCheetah-v2.png"/>
+<img src=".benchmark/PPO_HalfCheetah-v2.png" width = "400" height ="300" alt="PPO_HalfCheetah-v2" />
 ## How to use
 ### Dependencies:
...
@@ -120,11 +120,12 @@ def main():
     # run a few episodes to initialize scaler
     collect_trajectories(env, agent, scaler, episodes=5)
 
-    episode = 0
-    while episode < args.num_episodes:
+    test_flag = 0
+    total_steps = 0
+    while total_steps < args.train_total_steps:
         obs, actions, rewards = collect_trajectories(
             env, agent, scaler, episodes=args.episodes_per_batch)
-        episode += args.episodes_per_batch
+        total_steps += obs.shape[0]
 
         pred_values = agent.value_predict(obs)
@@ -145,14 +146,16 @@ def main():
         value_loss = agent.value_learn(obs, discount_sum_rewards)
 
         logger.info(
-            'Episode {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
-            .format(episode,
+            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
+            .format(total_steps,
                     np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
                     value_loss))
 
-        if episode % (args.episodes_per_batch * 5) == 0:
+        if total_steps // args.test_every_steps >= test_flag:
+            while total_steps // args.test_every_steps >= test_flag:
+                test_flag += 1
             eval_reward = run_evaluate_episode(env, agent, scaler)
-            logger.info('Episode {}, Evaluate reward: {}'.format(
-                episode, eval_reward))
+            logger.info('Steps {}, Evaluate reward: {}'.format(
+                total_steps, eval_reward))
 
 
 if __name__ == "__main__":
@@ -162,11 +165,6 @@ if __name__ == "__main__":
         type=str,
         help='Mujoco environment name',
         default='HalfCheetah-v2')
-    parser.add_argument(
-        '--num_episodes',
-        type=int,
-        help='Number of episodes to run',
-        default=10000)
     parser.add_argument(
         '--gamma', type=float, help='Discount factor', default=0.995)
     parser.add_argument(
@@ -186,6 +184,16 @@ if __name__ == "__main__":
         type=str,
         help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
         default='CLIP')
+    parser.add_argument(
+        '--train_total_steps',
+        type=int,
+        default=int(1e7),
+        help='maximum training steps')
+    parser.add_argument(
+        '--test_every_steps',
+        type=int,
+        default=int(1e4),
+        help='the step interval between two consecutive evaluations')
     args = parser.parse_args()
...
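In the PPO loop, `collect_trajectories` stacks all timesteps from a batch of episodes along the first axis, so `total_steps += obs.shape[0]` advances by environment steps rather than by episodes, matching the step-based bookkeeping of the other scripts. Below is a small NumPy sketch of that bookkeeping; the `collect_trajectories` stub and the array shapes are illustrative placeholders, not PARL's implementation.

```python
import numpy as np


def collect_trajectories(episode_lengths, obs_dim=17, act_dim=6):
    """Stack fake trajectories from several episodes along axis 0 (stub)."""
    n = sum(episode_lengths)
    obs = np.zeros((n, obs_dim))
    actions = np.zeros((n, act_dim))
    rewards = np.zeros(n)
    return obs, actions, rewards


total_steps = 0
obs, actions, rewards = collect_trajectories([1000, 950, 1000, 1000, 875])
total_steps += obs.shape[0]  # 4825 steps collected, regardless of episode count
print(total_steps)
```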