Commit 6fdf4448 authored by Hongsheng Zeng, committed by Bo Zhou

Add more DQN benchmark results and unify train scripts (#46)

* add more DQN benchmark results; unify train scripts

* resize benchmark picture

* resize benchmark picture, refine comments of args

* change dependency; mujoco-py now supports python3 only
Parent 7a7583ab
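The common thread of this change is that the DDPG, DQN, and PPO train scripts are now all driven by total environment steps and trigger evaluation at fixed step intervals rather than fixed episode counts. Below is a minimal, self-contained sketch of that shared pattern; the stub episode and evaluation functions are illustrative stand-ins for the real `run_train_episode` / `run_evaluate_episode` shown in the diffs, and the two constants stand in for the new `--train_total_steps` / `--test_every_steps` arguments.

```python
# Sketch (not repo code) of the step-based training/evaluation loop this
# commit unifies across the DDPG/DQN/PPO train scripts.
import random

TRAIN_TOTAL_STEPS = 100000  # stands in for args.train_total_steps
TEST_EVERY_STEPS = 10000    # stands in for args.test_every_steps


def run_train_episode():
    """Stub: pretend one training episode lasts a random number of steps."""
    steps = random.randint(500, 25000)
    return random.random(), steps  # (episode reward, steps taken)


def run_evaluate_episode():
    """Stub: pretend to evaluate the current policy."""
    return random.random()


test_flag = 0
total_steps = 0
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_train_episode()
    total_steps += steps
    # Trigger an evaluation whenever total_steps crosses one (or several)
    # TEST_EVERY_STEPS boundaries; the inner loop advances test_flag past
    # every boundary crossed by this episode so no interval fires twice.
    if total_steps // TEST_EVERY_STEPS >= test_flag:
        while total_steps // TEST_EVERY_STEPS >= test_flag:
            test_flag += 1
        evaluate_reward = run_evaluate_episode()
        print('Steps {}, Evaluate reward: {}'.format(total_steps, evaluate_reward))
```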
......@@ -8,12 +8,12 @@ Based on PARL, the DDPG model of deep reinforcement learning is reproduced, and
Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
### Benchmark result
- HalfCheetah-v2
<img src=".benchmark/DDPG_HalfCheetah-v2.png"/>
<img src=".benchmark/DDPG_HalfCheetah-v2.png" width = "400" height ="300" alt="DDPG_HalfCheetah-v2"/>
## How to use
### Dependencies:
+ python2.7 or python3.5+
+ python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -37,7 +37,9 @@ ENV_SEED = 1
def run_train_episode(env, agent, rpm):
obs = env.reset()
total_reward = 0
steps = 0
while True:
steps += 1
batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
......@@ -62,7 +64,7 @@ def run_train_episode(env, agent, rpm):
if done:
break
return total_reward
return total_reward, steps
def run_evaluate_episode(env, agent):
......@@ -105,18 +107,36 @@ def main():
rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
for i in range(MAX_EPISODES):
train_reward = run_train_episode(env, agent, rpm)
logger.info('Episode: {} Reward: {}'.format(i, train_reward))
if (i + 1) % TEST_EVERY_EPISODES == 0:
test_flag = 0
total_steps = 0
while total_steps < args.train_total_steps:
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Episode {}, Evaluate reward: {}'.format(
i, evaluate_reward))
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--env', help='Mujoco environment name', default='HalfCheetah-v2')
parser.add_argument(
'--train_total_steps',
type=int,
default=int(1e7),
help='maximum training steps')
parser.add_argument(
'--test_every_steps',
type=int,
default=int(1e4),
help='the step interval between two consecutive evaluations')
args = parser.parse_args()
main()
......@@ -8,8 +8,10 @@ Based on PARL, the DQN model of deep reinforcement learning is reproduced, and t
Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari game.
### Benchmark result
- Pong
<img src=".benchmark/DQN_Pong.png"/>
<img src=".benchmark/DQN_Pong.png" width = "400" height ="300" alt="DQN_Pong" /> <img src=".benchmark/DQN_Breakout.png" width = "400" height ="300" alt="DQN_Breakout"/>
<br>
<img src=".benchmark/DQN_BeamRider.png" width = "400" height ="300" alt="DQN_BeamRider"/>
## How to use
### Dependencies:
......
......@@ -41,9 +41,9 @@ def run_train_episode(env, agent, rpm):
total_reward = 0
all_cost = []
state = env.reset()
step = 0
steps = 0
while True:
step += 1
steps += 1
context = rpm.recent_state()
context.append(state)
context = np.stack(context, axis=0)
......@@ -52,7 +52,7 @@ def run_train_episode(env, agent, rpm):
rpm.append(Experience(state, action, reward, isOver))
# start training
if rpm.size() > MEMORY_WARMUP_SIZE:
if step % UPDATE_FREQ == 0:
if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
......@@ -67,7 +67,7 @@ def run_train_episode(env, agent, rpm):
if all_cost:
logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
total_reward, np.mean(all_cost)))
return total_reward, step
return total_reward, steps
def run_evaluate_episode(env, agent):
......@@ -104,34 +104,33 @@ def main():
with tqdm(total=MEMORY_WARMUP_SIZE) as pbar:
while rpm.size() < MEMORY_WARMUP_SIZE:
total_reward, step = run_train_episode(env, agent, rpm)
pbar.update(step)
total_reward, steps = run_train_episode(env, agent, rpm)
pbar.update(steps)
# train
test_flag = 0
pbar = tqdm(total=1e8)
pbar = tqdm(total=args.train_total_steps)
recent_100_reward = []
total_step = 0
total_steps = 0
max_reward = None
while True:
while total_steps < args.train_total_steps:
# start epoch
total_reward, step = run_train_episode(env, agent, rpm)
total_step += step
total_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
pbar.set_description('[train]exploration:{}'.format(agent.exploration))
pbar.update(step)
pbar.update(steps)
if total_step // args.test_every_steps == test_flag:
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
pbar.write("testing")
eval_rewards = []
for _ in tqdm(range(3), desc='eval agent'):
eval_reward = run_evaluate_episode(test_env, agent)
eval_rewards.append(eval_reward)
test_flag += 1
logger.info(
"eval_agent done, (steps, eval_reward): ({}, {})".format(
total_step, np.mean(eval_rewards)))
if total_step > 1e8:
break
total_steps, np.mean(eval_rewards)))
pbar.close()
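The DQN loop above also swaps the old equality check (`total_step // args.test_every_steps == test_flag`) for a `>=` check plus a catch-up loop. A hypothetical numeric walk-through (the step counts below are made up for illustration) shows why: a single long episode can jump past more than one evaluation boundary, which silently disabled evaluation under the old check.

```python
# Hypothetical walk-through of the evaluation-trigger change above
# (old: total_step // test_every_steps == test_flag,
#  new: total_steps // test_every_steps >= test_flag, plus a catch-up loop).
test_every_steps = 100000
test_flag = 0
total_steps = 215000  # suppose one long episode jumps past two boundaries

# Old check: 215000 // 100000 == 0 is False, so evaluation is skipped, and
# since the quotient only grows it can never equal 0 again -> no more evals.
assert not (total_steps // test_every_steps == test_flag)

# New check: 2 >= 0 is True, so evaluation runs; the inner loop then advances
# test_flag past every boundary already crossed, scheduling the next eval
# for the 300k boundary instead of re-firing for 100k and 200k.
if total_steps // test_every_steps >= test_flag:
    while total_steps // test_every_steps >= test_flag:
        test_flag += 1
assert test_flag == 3
```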
......@@ -141,10 +140,17 @@ if __name__ == '__main__':
parser.add_argument('--rom', help='atari rom', required=True)
parser.add_argument(
'--batch_size', type=int, default=64, help='batch size for training')
parser.add_argument(
'--train_total_steps',
type=int,
default=int(1e8),
help='maximum training steps')
parser.add_argument(
'--test_every_steps',
type=int,
default=100000,
help='every steps number to run test')
help='the step interval between two consecutive evaluations')
args = parser.parse_args()
main()
......@@ -11,8 +11,8 @@ Include following approach:
Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
### Benchmark result
- HalfCheetah-v2
<img src=".benchmark/PPO_HalfCheetah-v2.png"/>
<img src=".benchmark/PPO_HalfCheetah-v2.png" width = "400" height ="300" alt="PPO_HalfCheetah-v2" />
## How to use
### Dependencies:
......
......@@ -120,11 +120,12 @@ def main():
# run a few episodes to initialize scaler
collect_trajectories(env, agent, scaler, episodes=5)
episode = 0
while episode < args.num_episodes:
test_flag = 0
total_steps = 0
while total_steps < args.train_total_steps:
obs, actions, rewards = collect_trajectories(
env, agent, scaler, episodes=args.episodes_per_batch)
episode += args.episodes_per_batch
total_steps += obs.shape[0]
pred_values = agent.value_predict(obs)
......@@ -145,14 +146,16 @@ def main():
value_loss = agent.value_learn(obs, discount_sum_rewards)
logger.info(
'Episode {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
.format(episode,
'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
.format(total_steps,
np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
value_loss))
if episode % (args.episodes_per_batch * 5) == 0:
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
eval_reward = run_evaluate_episode(env, agent, scaler)
logger.info('Episode {}, Evaluate reward: {}'.format(
episode, eval_reward))
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, eval_reward))
if __name__ == "__main__":
......@@ -162,11 +165,6 @@ if __name__ == "__main__":
type=str,
help='Mujoco environment name',
default='HalfCheetah-v2')
parser.add_argument(
'--num_episodes',
type=int,
help='Number of episodes to run',
default=10000)
parser.add_argument(
'--gamma', type=float, help='Discount factor', default=0.995)
parser.add_argument(
......@@ -186,6 +184,16 @@ if __name__ == "__main__":
type=str,
help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
default='CLIP')
parser.add_argument(
'--train_total_steps',
type=int,
default=int(1e7),
help='maximum training steps')
parser.add_argument(
'--test_every_steps',
type=int,
default=int(1e4),
help='the step interval between two consecutive evaluations')
args = parser.parse_args()
......