diff --git a/examples/DDPG/README.md b/examples/DDPG/README.md
index f11a30aa852f0630acabf873de518f7b9b64f15f..d62d062411a469e44f0ab55ca6d3a51d19f64aa2 100644
--- a/examples/DDPG/README.md
+++ b/examples/DDPG/README.md
@@ -8,12 +8,12 @@ Based on PARL, the DDPG model of deep reinforcement learning is reproduced, and
Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
### Benchmark result
-- HalfCheetah-v2
-
+
+
## How to use
### Dependencies:
-+ python2.7 or python3.5+
++ python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py
index c8f7ce3f92b47c639942d3099cfec32535c6043f..ef02304782b5c8ef27ba381e87f54422e2ff6d9d 100644
--- a/examples/DDPG/train.py
+++ b/examples/DDPG/train.py
@@ -37,7 +37,9 @@ ENV_SEED = 1
def run_train_episode(env, agent, rpm):
obs = env.reset()
total_reward = 0
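+ # Count the environment steps taken in this episode so the caller can accumulate total_steps.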
+ steps = 0
while True:
+ steps += 1
batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
@@ -62,7 +64,7 @@ def run_train_episode(env, agent, rpm):
if done:
break
- return total_reward
+ return total_reward, steps
def run_evaluate_episode(env, agent):
@@ -105,18 +107,36 @@ def main():
rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
- for i in range(MAX_EPISODES):
- train_reward = run_train_episode(env, agent, rpm)
- logger.info('Episode: {} Reward: {}'.format(i, train_reward))
- if (i + 1) % TEST_EVERY_EPISODES == 0:
+ test_flag = 0
+ total_steps = 0
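+ # Train against a total step budget rather than a fixed number of episodes.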
+ while total_steps < args.train_total_steps:
+ train_reward, steps = run_train_episode(env, agent, rpm)
+ total_steps += steps
+ logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
+
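+ # Run an evaluation each time total_steps crosses a multiple of test_every_steps;
+ # the inner loop advances test_flag past any intervals skipped by a long episode.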
+ if total_steps // args.test_every_steps >= test_flag:
+ while total_steps // args.test_every_steps >= test_flag:
+ test_flag += 1
evaluate_reward = run_evaluate_episode(env, agent)
- logger.info('Episode {}, Evaluate reward: {}'.format(
- i, evaluate_reward))
+ logger.info('Steps {}, Evaluate reward: {}'.format(
+ total_steps, evaluate_reward))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--env', help='Mujoco environment name', default='HalfCheetah-v2')
+ parser.add_argument(
+ '--train_total_steps',
+ type=int,
+ default=int(1e7),
+ help='maximum number of training steps')
+ parser.add_argument(
+ '--test_every_steps',
+ type=int,
+ default=int(1e4),
+ help='the step interval between two consecutive evaluations')
+
args = parser.parse_args()
+
main()
diff --git a/examples/DQN/.benchmark/DQN_BeamRider.png b/examples/DQN/.benchmark/DQN_BeamRider.png
new file mode 100644
index 0000000000000000000000000000000000000000..934d7aee81d45ee872fec27b60cdfacf7b366ed6
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_BeamRider.png differ
diff --git a/examples/DQN/.benchmark/DQN_Breakout.png b/examples/DQN/.benchmark/DQN_Breakout.png
new file mode 100644
index 0000000000000000000000000000000000000000..98f727f538caa64c94b103d7b4af17bd0afc6487
Binary files /dev/null and b/examples/DQN/.benchmark/DQN_Breakout.png differ
diff --git a/examples/DQN/README.md b/examples/DQN/README.md
index 5de3d07629b6b10deb82c62c99381ece89b4f635..9bba38ce7b63354b80b0fed2cd75a1f9ca82ab7c 100644
--- a/examples/DQN/README.md
+++ b/examples/DQN/README.md
@@ -8,8 +8,10 @@ Based on PARL, the DQN model of deep reinforcement learning is reproduced, and t
Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari game.
### Benchmark result
-- Pong
-
+![DQN_BeamRider](.benchmark/DQN_BeamRider.png)
+
+![DQN_Breakout](.benchmark/DQN_Breakout.png)
+
## How to use
### Dependencies:
diff --git a/examples/DQN/train.py b/examples/DQN/train.py
index 06cf8eb7ca76003dfb2fe9190551cc428956f6eb..669fac77b1973cc54ac13ea21fbd666efde6ee65 100644
--- a/examples/DQN/train.py
+++ b/examples/DQN/train.py
@@ -41,9 +41,9 @@ def run_train_episode(env, agent, rpm):
total_reward = 0
all_cost = []
state = env.reset()
- step = 0
+ steps = 0
while True:
- step += 1
+ steps += 1
context = rpm.recent_state()
context.append(state)
context = np.stack(context, axis=0)
@@ -52,7 +52,7 @@ def run_train_episode(env, agent, rpm):
rpm.append(Experience(state, action, reward, isOver))
# start training
if rpm.size() > MEMORY_WARMUP_SIZE:
- if step % UPDATE_FREQ == 0:
+ if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
@@ -67,7 +67,7 @@ def run_train_episode(env, agent, rpm):
if all_cost:
logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
total_reward, np.mean(all_cost)))
- return total_reward, step
+ return total_reward, steps
def run_evaluate_episode(env, agent):
@@ -104,34 +104,33 @@ def main():
with tqdm(total=MEMORY_WARMUP_SIZE) as pbar:
while rpm.size() < MEMORY_WARMUP_SIZE:
- total_reward, step = run_train_episode(env, agent, rpm)
- pbar.update(step)
+ total_reward, steps = run_train_episode(env, agent, rpm)
+ pbar.update(steps)
# train
test_flag = 0
- pbar = tqdm(total=1e8)
+ pbar = tqdm(total=args.train_total_steps)
recent_100_reward = []
- total_step = 0
+ total_steps = 0
max_reward = None
- while True:
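+ # Stop once the configured step budget is exhausted instead of looping forever.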
+ while total_steps < args.train_total_steps:
# start epoch
- total_reward, step = run_train_episode(env, agent, rpm)
- total_step += step
+ total_reward, steps = run_train_episode(env, agent, rpm)
+ total_steps += steps
pbar.set_description('[train]exploration:{}'.format(agent.exploration))
- pbar.update(step)
+ pbar.update(steps)
- if total_step // args.test_every_steps == test_flag:
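+ # Compare with >= and catch test_flag up below, so an evaluation is not skipped
+ # when a single episode spans more than one test_every_steps interval.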
+ if total_steps // args.test_every_steps >= test_flag:
+ while total_steps // args.test_every_steps >= test_flag:
+ test_flag += 1
pbar.write("testing")
eval_rewards = []
for _ in tqdm(range(3), desc='eval agent'):
eval_reward = run_evaluate_episode(test_env, agent)
eval_rewards.append(eval_reward)
- test_flag += 1
logger.info(
"eval_agent done, (steps, eval_reward): ({}, {})".format(
- total_step, np.mean(eval_rewards)))
- if total_step > 1e8:
- break
+ total_steps, np.mean(eval_rewards)))
pbar.close()
@@ -141,10 +140,17 @@ if __name__ == '__main__':
parser.add_argument('--rom', help='atari rom', required=True)
parser.add_argument(
'--batch_size', type=int, default=64, help='batch size for training')
+ parser.add_argument(
+ '--train_total_steps',
+ type=int,
+ default=int(1e8),
+ help='maximum number of training steps')
parser.add_argument(
'--test_every_steps',
type=int,
default=100000,
- help='every steps number to run test')
+ help='the step interval between two consecutive evaluations')
+
args = parser.parse_args()
+
main()
diff --git a/examples/PPO/README.md b/examples/PPO/README.md
index c78a2169135a4d0104355c10835cd8a1fe47b970..22c269017cf29cd94dd9a770cec9e228385ae0a8 100644
--- a/examples/PPO/README.md
+++ b/examples/PPO/README.md
@@ -11,8 +11,8 @@ Include following approach:
Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco game.
### Benchmark result
-- HalfCheetah-v2
-
+
+
## How to use
### Dependencies:
diff --git a/examples/PPO/train.py b/examples/PPO/train.py
index 4b9ead54097d8bbc524fabdd79ac945d6d6136d2..7508f6fe52900d0aa5525c47b7209c987f4c21f0 100755
--- a/examples/PPO/train.py
+++ b/examples/PPO/train.py
@@ -120,11 +120,12 @@ def main():
# run a few episodes to initialize scaler
collect_trajectories(env, agent, scaler, episodes=5)
- episode = 0
- while episode < args.num_episodes:
+ test_flag = 0
+ total_steps = 0
+ while total_steps < args.train_total_steps:
obs, actions, rewards = collect_trajectories(
env, agent, scaler, episodes=args.episodes_per_batch)
- episode += args.episodes_per_batch
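+ # Each collected observation is one environment step, so obs.shape[0] is
+ # the number of steps gathered from this batch of episodes.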
+ total_steps += obs.shape[0]
pred_values = agent.value_predict(obs)
@@ -145,14 +146,16 @@ def main():
value_loss = agent.value_learn(obs, discount_sum_rewards)
logger.info(
- 'Episode {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
- .format(episode,
+ 'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
+ .format(total_steps,
np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
value_loss))
- if episode % (args.episodes_per_batch * 5) == 0:
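+ # Same step-based evaluation schedule as the DDPG and DQN examples: run one
+ # evaluation per test_every_steps interval crossed since the last check.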
+ if total_steps // args.test_every_steps >= test_flag:
+ while total_steps // args.test_every_steps >= test_flag:
+ test_flag += 1
eval_reward = run_evaluate_episode(env, agent, scaler)
- logger.info('Episode {}, Evaluate reward: {}'.format(
- episode, eval_reward))
+ logger.info('Steps {}, Evaluate reward: {}'.format(
+ total_steps, eval_reward))
if __name__ == "__main__":
@@ -162,11 +165,6 @@ if __name__ == "__main__":
type=str,
help='Mujoco environment name',
default='HalfCheetah-v2')
- parser.add_argument(
- '--num_episodes',
- type=int,
- help='Number of episodes to run',
- default=10000)
parser.add_argument(
'--gamma', type=float, help='Discount factor', default=0.995)
parser.add_argument(
@@ -186,6 +184,16 @@ if __name__ == "__main__":
type=str,
help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
default='CLIP')
+ parser.add_argument(
+ '--train_total_steps',
+ type=int,
+ default=int(1e7),
+ help='maximum number of training steps')
+ parser.add_argument(
+ '--test_every_steps',
+ type=int,
+ default=int(1e4),
+ help='the step interval between two consecutive evaluations')
args = parser.parse_args()