diff --git a/README.md b/README.md index 5285eddcd43b2dc2ddcd746e7b36724015ff592a..35190dfbbd658e805962fa2defb009f6142d63b6 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@
+> PARL is a flexible and highly efficient reinforcement learning framework based on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle). + # Features **Reproducible**. We provide algorithms that stably reproduce the result of many influential reinforcement learning algorithms @@ -76,5 +78,5 @@ pip install --upgrade git+https://github.com/PaddlePaddle/PARL.git - [QuickStart](examples/QuickStart/) - [DQN](examples/DQN/) - [DDPG](examples/DDPG/) -- PPO +- [PPO](examples/PPO/) - [Winning Solution for NIPS2018: AI for Prosthetics Challenge](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/) diff --git a/examples/DDPG/README.md b/examples/DDPG/README.md index 61d32403c4f54712478b462f084a4e75ded606e7..3f599716599430e71cf4d56879335f81077a0ecd 100644 --- a/examples/DDPG/README.md +++ b/examples/DDPG/README.md @@ -11,7 +11,6 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco ## How to use ### Dependencies: + python2.7 or python3.5+ -+ [PARL](https://github.com/PaddlePaddle/PARL) + [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle) + gym + tqdm diff --git a/examples/DDPG/mujoco_model.py b/examples/DDPG/mujoco_model.py index 991842d6ebb1d7e6150f61cadf35b55c931e38f0..ae7477b6e9f09df7af0ccf796a75ba84c2b5fd61 100644 --- a/examples/DDPG/mujoco_model.py +++ b/examples/DDPG/mujoco_model.py @@ -18,8 +18,8 @@ from parl.framework.model_base import Model class MujocoModel(Model): - def __init__(self, act_dim, act_bound): - self.actor_model = ActorModel(act_dim, act_bound) + def __init__(self, act_dim): + self.actor_model = ActorModel(act_dim) self.critic_model = CriticModel() def policy(self, obs): @@ -33,8 +33,7 @@ class ActorModel(Model): - def __init__(self, act_dim, act_bound): - self.act_bound = act_bound + def __init__(self, act_dim): hid1_size = 400 hid2_size = 300 @@ -46,7 +45,7 @@ class ActorModel(Model): hid1 = self.fc1(obs) hid2 = self.fc2(hid1) means = self.fc3(hid2) - means = means * self.act_bound + means = means return means diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py index 4220e05acd13ceeae6eb47a2cc6fb3ffad49749e..ba1a826e505441a489d6827b7c7ed9a3bdb200dd 100644 --- a/examples/DDPG/train.py +++ b/examples/DDPG/train.py @@ -19,7 +19,7 @@ import time from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from parl.algorithms import DDPG -from parl.utils import logger +from parl.utils import logger, action_mapping from replay_memory import ReplayMemory MAX_EPISODES = 5000 @@ -36,7 +36,7 @@ REWARD_SCALE = 0.1 ENV_SEED = 1 -def run_train_episode(env, agent, rpm, act_bound): +def run_train_episode(env, agent, rpm): obs = env.reset() total_reward = 0 for j in range(MAX_STEPS_EACH_EPISODE): @@ -44,9 +44,10 @@ def run_train_episode(env, agent, rpm): action = agent.predict(batch_obs.astype('float32')) action = np.squeeze(action) - # Add exploration noise - action = np.clip( - np.random.normal(action, act_bound), -act_bound, act_bound) + # Add exploration noise, and clip to [-1.0, 1.0] + action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0) + action = action_mapping(action, env.action_space.low[0], + env.action_space.high[0]) next_obs, reward, done, info = env.step(action) @@ -73,6 +74,8 @@ def run_evaluate_episode(env, agent): batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) action = np.squeeze(action) + action = action_mapping(action, env.action_space.low[0], + env.action_space.high[0]) next_obs, reward, done, info = env.step(action) @@
-90,9 +93,8 @@ def main(): obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] - act_bound = env.action_space.high[0] - model = MujocoModel(act_dim, act_bound) + model = MujocoModel(act_dim) algorithm = DDPG( model, hyperparas={ @@ -106,7 +108,7 @@ def main(): rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim) for i in range(MAX_EPISODES): - train_reward = run_train_episode(env, agent, rpm, act_bound) + train_reward = run_train_episode(env, agent, rpm) logger.info('Episode: {} Reward: {}'.format(i, train_reward)) if (i + 1) % TEST_EVERY_EPISODES == 0: evaluate_reward = run_evaluate_episode(env, agent) diff --git a/examples/DQN/README.md b/examples/DQN/README.md index 2901737ccf0de39add2e2e89e11a5de3202b24fc..3bf36b035bec88e0274ee73dc37a5d9493529498 100644 --- a/examples/DQN/README.md +++ b/examples/DQN/README.md @@ -11,7 +11,6 @@ Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari g ## How to use ### Dependencies: + python2.7 or python3.5+ -+ [PARL](https://github.com/PaddlePaddle/PARL) + [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle) + gym + tqdm diff --git a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md index 3f76e55b1318de2bc5da5846c7f59702212ba525..d7b1c9dcee0b6faad44d037dc6e7816b33f9f93b 100644 --- a/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md +++ b/examples/NeurIPS2018-AI-for-Prosthetics-Challenge/README.md @@ -5,7 +5,6 @@ This folder will contains the code used to train the winning models for the [Neu ### Dependencies - python3.6 - [paddlepaddle>=1.2.0](https://github.com/PaddlePaddle/Paddle) -- [PARL](https://github.com/PaddlePaddle/PARL) - [osim-rl](https://github.com/stanfordnmbl/osim-rl) ### Start Testing best models diff --git a/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png new file mode 100644 index 0000000000000000000000000000000000000000..7d0c031367fbb25e14f6ca0eb67bd4606a55ff67 Binary files /dev/null and b/examples/PPO/.benchmark/PPO_HalfCheetah-v2.png differ diff --git a/examples/PPO/README.md b/examples/PPO/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6190f71ff026e47d140fa930ceaae33bbbee1fe4 --- /dev/null +++ b/examples/PPO/README.md @@ -0,0 +1,31 @@ +## Reproduce PPO with PARL +Based on PARL, we reproduce the PPO algorithm and match the results reported in the paper on the classic MuJoCo benchmarks. +It includes the following approaches: ++ Clipped Surrogate Objective ++ Adaptive KL Penalty Coefficient + +> PPO in +[Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) + +### Mujoco games introduction +Please see [here](https://github.com/openai/mujoco-py) to learn more about MuJoCo games.
+ +### Benchmark result +- HalfCheetah-v2 + + +## How to use +### Dependencies: ++ python2.7 or python3.5+ ++ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle) ++ gym ++ tqdm ++ mujoco-py>=1.50.1.0 + +### Start Training: +``` +# To train an agent for HalfCheetah-v2 game (default: CLIP loss) +python train.py + +# To train for different game and different loss type +# python train.py --env [ENV_NAME] --loss_type [CLIP|KLPEN] diff --git a/examples/PPO/mujoco_agent.py b/examples/PPO/mujoco_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..a1221c6423a14bea8f731b612287db0051dc57bb --- /dev/null +++ b/examples/PPO/mujoco_agent.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl.layers as layers +from paddle import fluid +from sklearn.utils import shuffle +from parl.framework.agent_base import Agent +from parl.utils import logger + + +class MujocoAgent(Agent): + def __init__(self, + algorithm, + obs_dim, + act_dim, + kl_targ, + loss_type, + beta=1.0, + epsilon=0.2, + policy_learn_times=20, + value_learn_times=10, + value_batch_size=256): + self.alg = algorithm + self.obs_dim = obs_dim + self.act_dim = act_dim + assert loss_type == 'CLIP' or loss_type == 'KLPEN' + self.loss_type = loss_type + super(MujocoAgent, self).__init__(algorithm) + + self.policy_learn_times = policy_learn_times + # Adaptive kl penalty coefficient + self.beta = beta + self.kl_targ = kl_targ + + self.value_learn_times = value_learn_times + self.value_batch_size = value_batch_size + self.value_learn_buffer = None + + def build_program(self): + self.policy_predict_program = fluid.Program() + self.policy_sample_program = fluid.Program() + self.policy_learn_program = fluid.Program() + self.value_predict_program = fluid.Program() + self.value_learn_program = fluid.Program() + + with fluid.program_guard(self.policy_sample_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + sampled_act = self.alg.define_sample(obs) + self.policy_sample_output = [sampled_act] + + with fluid.program_guard(self.policy_predict_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + means = self.alg.define_predict(obs) + self.policy_predict_output = [means] + + with fluid.program_guard(self.policy_learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + actions = layers.data( + name='actions', shape=[self.act_dim], dtype='float32') + advantages = layers.data( + name='advantages', shape=[1], dtype='float32') + if self.loss_type == 'KLPEN': + beta = layers.data(name='beta', shape=[], dtype='float32') + loss, kl = self.alg.define_policy_learn( + obs, actions, advantages, beta) + else: + loss, kl = self.alg.define_policy_learn( + obs, actions, advantages) + + self.policy_learn_output = [loss, kl] + + with fluid.program_guard(self.value_predict_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], 
dtype='float32') + value = self.alg.define_value_predict(obs) + self.value_predict_output = [value] + + with fluid.program_guard(self.value_learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + val = layers.data(name='val', shape=[], dtype='float32') + value_loss = self.alg.define_value_learn(obs, val) + self.value_learn_output = [value_loss] + + def policy_sample(self, obs): + feed = {'obs': obs} + sampled_act = self.fluid_executor.run( + self.policy_sample_program, + feed=feed, + fetch_list=self.policy_sample_output)[0] + return sampled_act + + def policy_predict(self, obs): + feed = {'obs': obs} + means = self.fluid_executor.run( + self.policy_predict_program, + feed=feed, + fetch_list=self.policy_predict_output)[0] + return means + + def value_predict(self, obs): + feed = {'obs': obs} + value = self.fluid_executor.run( + self.value_predict_program, + feed=feed, + fetch_list=self.value_predict_output)[0] + return value + + def _batch_policy_learn(self, obs, actions, advantages): + if self.loss_type == 'KLPEN': + feed = { + 'obs': obs, + 'actions': actions, + 'advantages': advantages, + 'beta': self.beta + } + else: + feed = {'obs': obs, 'actions': actions, 'advantages': advantages} + [loss, kl] = self.fluid_executor.run( + self.policy_learn_program, + feed=feed, + fetch_list=self.policy_learn_output) + return loss, kl + + def _batch_value_learn(self, obs, val): + feed = {'obs': obs, 'val': val} + value_loss = self.fluid_executor.run( + self.value_learn_program, + feed=feed, + fetch_list=self.value_learn_output)[0] + return value_loss + + def policy_learn(self, obs, actions, advantages): + """ Learn policy: + + 1. Sync parameters of the policy model to the old policy model + 2. Fix the old policy model, and update the policy model multiple times + 3. If the KLPEN loss is used, adjust the KL penalty coefficient beta + """ + self.alg.sync_old_policy(self.gpu_id) + + all_loss, all_kl = [], [] + for _ in range(self.policy_learn_times): + loss, kl = self._batch_policy_learn(obs, actions, advantages) + all_loss.append(loss) + all_kl.append(kl) + + if self.loss_type == 'KLPEN': + # Adaptive KL penalty coefficient + if kl > self.kl_targ * 2: + self.beta = 1.5 * self.beta + elif kl < self.kl_targ / 2: + self.beta = self.beta / 1.5 + + return np.mean(all_loss), np.mean(all_kl) + + def value_learn(self, obs, value): + """ Fit model to current data batch + previous data batch + """ + data_size = obs.shape[0] + + if self.value_learn_buffer is None: + obs_train, value_train = obs, value + else: + obs_train = np.concatenate([obs, self.value_learn_buffer[0]]) + value_train = np.concatenate([value, self.value_learn_buffer[1]]) + self.value_learn_buffer = (obs, value) + + all_loss = [] + for _ in range(self.value_learn_times): + obs_train, value_train = shuffle(obs_train, value_train) + start = 0 + while start < data_size: + end = start + self.value_batch_size + value_loss = self._batch_value_learn(obs_train[start:end, :], + value_train[start:end]) + all_loss.append(value_loss) + start += self.value_batch_size + return np.mean(all_loss) diff --git a/examples/PPO/mujoco_model.py b/examples/PPO/mujoco_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c97525d8bfbb244f46991d01fdff4a38c5ffe2d --- /dev/null +++ b/examples/PPO/mujoco_model.py @@ -0,0 +1,96 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl.layers as layers +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from parl.framework.model_base import Model + + +class MujocoModel(Model): + def __init__(self, obs_dim, act_dim, init_logvar=-1.0): + self.policy_model = PolicyModel(obs_dim, act_dim, init_logvar) + self.value_model = ValueModel(obs_dim, act_dim) + self.policy_lr = self.policy_model.lr + self.value_lr = self.value_model.lr + + def policy(self, obs): + return self.policy_model.policy(obs) + + def policy_sample(self, obs): + return self.policy_model.sample(obs) + + def value(self, obs): + return self.value_model.value(obs) + + +class PolicyModel(Model): + def __init__(self, obs_dim, act_dim, init_logvar): + self.obs_dim = obs_dim + self.act_dim = act_dim + hid1_size = obs_dim * 10 + hid3_size = act_dim * 10 + hid2_size = int(np.sqrt(hid1_size * hid3_size)) + + self.lr = 9e-4 / np.sqrt(hid2_size) + + self.fc1 = layers.fc(size=hid1_size, act='tanh') + self.fc2 = layers.fc(size=hid2_size, act='tanh') + self.fc3 = layers.fc(size=hid3_size, act='tanh') + self.fc4 = layers.fc(size=act_dim, act='tanh') + + self.logvars = layers.create_parameter( + shape=[act_dim], + dtype='float32', + default_initializer=fluid.initializer.ConstantInitializer( + init_logvar)) + + def policy(self, obs): + hid1 = self.fc1(obs) + hid2 = self.fc2(hid1) + hid3 = self.fc3(hid2) + means = self.fc4(hid3) + logvars = self.logvars() + return means, logvars + + def sample(self, obs): + means, logvars = self.policy(obs) + sampled_act = means + ( + layers.exp(logvars / 2.0) * # stddev + layers.gaussian_random(shape=(self.act_dim, ), dtype='float32')) + return sampled_act + + +class ValueModel(Model): + def __init__(self, obs_dim, act_dim): + super(ValueModel, self).__init__() + hid1_size = obs_dim * 10 + hid3_size = 5 + hid2_size = int(np.sqrt(hid1_size * hid3_size)) + + self.lr = 1e-2 / np.sqrt(hid2_size) + + self.fc1 = layers.fc(size=hid1_size, act='tanh') + self.fc2 = layers.fc(size=hid2_size, act='tanh') + self.fc3 = layers.fc(size=hid3_size, act='tanh') + self.fc4 = layers.fc(size=1) + + def value(self, obs): + hid1 = self.fc1(obs) + hid2 = self.fc2(hid1) + hid3 = self.fc3(hid2) + V = self.fc4(hid3) + V = layers.squeeze(V, axes=[]) + return V diff --git a/examples/PPO/train.py b/examples/PPO/train.py new file mode 100755 index 0000000000000000000000000000000000000000..35d9aa7346daaf40ea179236ed66aceda39865a4 --- /dev/null +++ b/examples/PPO/train.py @@ -0,0 +1,191 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import gym +import numpy as np +from mujoco_agent import MujocoAgent +from mujoco_model import MujocoModel +from parl.algorithms import PPO +from parl.utils import logger, action_mapping +from utils import * + + +def run_train_episode(env, agent, scaler): + obs = env.reset() + observes, actions, rewards, unscaled_obs = [], [], [], [] + done = False + step = 0.0 + scale, offset = scaler.get() + scale[-1] = 1.0 # don't scale time step feature + offset[-1] = 0.0 # don't offset time step feature + while not done: + obs = obs.reshape((1, -1)) + obs = np.append(obs, [[step]], axis=1) # add time step feature + unscaled_obs.append(obs) + obs = (obs - offset) * scale # center and scale observations + obs = obs.astype('float32') + observes.append(obs) + + action = agent.policy_sample(obs) + action = np.clip(action, -1.0, 1.0) + action = action_mapping(action, env.action_space.low[0], + env.action_space.high[0]) + + action = action.reshape((1, -1)).astype('float32') + actions.append(action) + + obs, reward, done, _ = env.step(np.squeeze(action)) + rewards.append(reward) + step += 1e-3 # increment time step feature + + return (np.concatenate(observes), np.concatenate(actions), + np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs)) + + +def run_evaluate_episode(env, agent, scaler): + obs = env.reset() + rewards = [] + step = 0.0 + scale, offset = scaler.get() + scale[-1] = 1.0 # don't scale time step feature + offset[-1] = 0.0 # don't offset time step feature + while True: + obs = obs.reshape((1, -1)) + obs = np.append(obs, [[step]], axis=1) # add time step feature + obs = (obs - offset) * scale # center and scale observations + obs = obs.astype('float32') + + action = agent.policy_predict(obs) + action = action_mapping(action, env.action_space.low[0], + env.action_space.high[0]) + + obs, reward, done, _ = env.step(np.squeeze(action)) + rewards.append(reward) + + step += 1e-3 # increment time step feature + if done: + break + return np.sum(rewards) + + +def collect_trajectories(env, agent, scaler, episodes): + all_obs, all_actions, all_rewards, all_unscaled_obs = [], [], [], [] + for e in range(episodes): + obs, actions, rewards, unscaled_obs = run_train_episode( + env, agent, scaler) + all_obs.append(obs) + all_actions.append(actions) + all_rewards.append(rewards) + all_unscaled_obs.append(unscaled_obs) + scaler.update(np.concatenate(all_unscaled_obs) + ) # update running statistics for scaling observations + return np.concatenate(all_obs), np.concatenate( + all_actions), np.concatenate(all_rewards) + + +def main(): + env = gym.make(args.env) + + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + obs_dim += 1 # add 1 to obs dim for time step feature + + scaler = Scaler(obs_dim) + + model = MujocoModel(obs_dim, act_dim) + hyperparas = { + 'act_dim': act_dim, + 'policy_lr': model.policy_lr, + 'value_lr': model.value_lr + } + alg = PPO(model, hyperparas) + agent = MujocoAgent( + alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type) + + # run a few episodes to initialize scaler + collect_trajectories(env, agent, scaler, episodes=5) + + episode = 0 + while episode < args.num_episodes: + obs, actions, rewards = collect_trajectories( + env, agent, scaler, episodes=args.episodes_per_batch) + episode += args.episodes_per_batch + + pred_values = agent.value_predict(obs) + + # scale rewards + scale_rewards = rewards * (1 - args.gamma) + + discount_sum_rewards = calc_discount_sum_rewards( + scale_rewards, args.gamma) + discount_sum_rewards = 
discount_sum_rewards.astype('float32') + + advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam) + # normalize advantages + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-6) + advantages = advantages.astype('float32') + + policy_loss, kl = agent.policy_learn(obs, actions, advantages) + value_loss = agent.value_learn(obs, discount_sum_rewards) + + logger.info( + 'Episode {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}' + .format(episode, + np.sum(rewards) / args.episodes_per_batch, policy_loss, kl, + value_loss)) + if episode % (args.episodes_per_batch * 5) == 0: + eval_reward = run_evaluate_episode(env, agent, scaler) + logger.info('Episode {}, Evaluate reward: {}'.format( + episode, eval_reward)) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + '--env', + type=str, + help='Mujoco environment name', + default='HalfCheetah-v2') + parser.add_argument( + '--num_episodes', + type=int, + help='Number of episodes to run', + default=10000) + parser.add_argument( + '--gamma', type=float, help='Discount factor', default=0.995) + parser.add_argument( + '--lam', + type=float, + help='Lambda for Generalized Advantage Estimation', + default=0.98) + parser.add_argument( + '--kl_targ', type=float, help='D_KL target value', default=0.003) + parser.add_argument( + '--episodes_per_batch', + type=int, + help='Number of episodes per training batch', + default=5) + parser.add_argument( + '--loss_type', + type=str, + help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'", + default='CLIP') + + args = parser.parse_args() + import time + logger.set_dir('./log_dir/{}_{}'.format(args.loss_type, time.time())) + main() diff --git a/examples/PPO/utils.py b/examples/PPO/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..48de07220c0f52f29ff2206f93eccef68d9d383c --- /dev/null +++ b/examples/PPO/utils.py @@ -0,0 +1,88 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import scipy.signal + +__all__ = ['calc_discount_sum_rewards', 'calc_gae', 'Scaler'] +""" +The following code are copied or modified from: + https://github.com/pat-coady/trpo + Written by Patrick Coady (pat-coady.github.io) +""" + + +def calc_discount_sum_rewards(rewards, gamma): + """ Calculate discounted forward sum of a sequence at each point """ + return scipy.signal.lfilter([1.0], [1.0, -gamma], rewards[::-1])[::-1] + + +def calc_gae(rewards, values, gamma, lam): + """ Calculate generalized advantage estimator. + See: https://arxiv.org/pdf/1506.02438.pdf + """ + # temporal differences + tds = rewards - values + np.append(values[1:] * gamma, 0) + advantages = calc_discount_sum_rewards(tds, gamma * lam) + return advantages + + +class Scaler(object): + """ Generate scale and offset based on running mean and stddev along axis=0 + + offset = running mean + scale = 1 / (stddev + 0.1) / 3 (i.e. 
3x stddev = +/- 1.0) + """ + + def __init__(self, obs_dim): + """ + Args: + obs_dim: dimension of axis=1 + """ + self.vars = np.zeros(obs_dim) + self.means = np.zeros(obs_dim) + self.cnt = 0 + self.first_pass = True + + def update(self, x): + """ Update running mean and variance (this is an exact method) + Args: + x: NumPy array, shape = (N, obs_dim) + + see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled- + variance-of-two-groups-given-known-group-variances-mean + """ + if self.first_pass: + self.means = np.mean(x, axis=0) + self.vars = np.var(x, axis=0) + self.cnt = x.shape[0] + self.first_pass = False + else: + n = x.shape[0] + new_data_var = np.var(x, axis=0) + new_data_mean = np.mean(x, axis=0) + new_data_mean_sq = np.square(new_data_mean) + new_means = ( + (self.means * self.cnt) + (new_data_mean * n)) / (self.cnt + n) + self.vars = (((self.cnt * (self.vars + np.square(self.means))) + + (n * (new_data_var + new_data_mean_sq))) / + (self.cnt + n) - np.square(new_means)) + self.vars = np.maximum( + 0.0, self.vars) # occasionally goes negative, clip + self.means = new_means + self.cnt += n + + def get(self): + """ returns 2-tuple: (scale, offset) """ + return 1 / (np.sqrt(self.vars) + 0.1) / 3, self.means diff --git a/examples/QuickStart/README.md b/examples/QuickStart/README.md index c3558e42723a60abda0bfde6a27ce8fac8c3eb82..39cdd8c74c6a3f078392f68598c278cea3d62cc2 100644 --- a/examples/QuickStart/README.md +++ b/examples/QuickStart/README.md @@ -5,7 +5,6 @@ Based on PARL, train a agent to play CartPole game with policy gradient algorith ### Dependencies: + python2.7 or python3.5+ -+ [PARL](https://github.com/PaddlePaddle/PARL) + [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle) + gym diff --git a/parl/algorithms/__init__.py b/parl/algorithms/__init__.py index 182e401897ba73a7d2d104e09aa7f4954e6f5e04..16f032edfe429685a36d1f7eaef4c1390c62dd3f 100644 --- a/parl/algorithms/__init__.py +++ b/parl/algorithms/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from parl.algorithms.ddpg import * from parl.algorithms.dqn import * from parl.algorithms.policy_gradient import * -from parl.algorithms.ddpg import * +from parl.algorithms.ppo import * diff --git a/parl/algorithms/ppo.py b/parl/algorithms/ppo.py new file mode 100644 index 0000000000000000000000000000000000000000..cf7b40367bfbe6842672ec07f1d697d14cd14e42 --- /dev/null +++ b/parl/algorithms/ppo.py @@ -0,0 +1,154 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import parl.layers as layers +from copy import deepcopy +from paddle import fluid +from parl.framework.algorithm_base import Algorithm + +__all__ = ['PPO'] + + +class PPO(Algorithm): + def __init__(self, model, hyperparas): + Algorithm.__init__(self, model, hyperparas) + # Used to calculate probability of action in old policy + self.old_policy_model = deepcopy(model.policy_model) + + # fetch hyper parameters + self.act_dim = hyperparas['act_dim'] + self.policy_lr = hyperparas['policy_lr'] + self.value_lr = hyperparas['value_lr'] + if 'epsilon' in hyperparas: + self.epsilon = hyperparas['epsilon'] + else: + self.epsilon = 0.2 # default + + def _calc_logprob(self, actions, means, logvars): + """ Calculate log probabilities of actions, given the means and logvars + of a diagonal Gaussian distribution. + The constant sqrt(2 * pi) is omitted; it cancels out when log probabilities are subtracted later. + + Args: + actions: shape (batch_size, act_dim) + means: shape (batch_size, act_dim) + logvars: shape (act_dim) + + Returns: + logprob: shape (batch_size) + """ + exp_item = layers.elementwise_div( + layers.square(actions - means), layers.exp(logvars), axis=1) + exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1) + + vars_item = -0.5 * layers.reduce_sum(logvars) + logprob = exp_item + vars_item + return logprob + + def _calc_kl(self, means, logvars, old_means, old_logvars): + """ Calculate KL divergence between old and new distributions + See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence + + Args: + means: shape (batch_size, act_dim) + logvars: shape (act_dim) + old_means: shape (batch_size, act_dim) + old_logvars: shape (act_dim) + + Returns: + kl: shape (batch_size) + """ + log_det_cov_old = layers.reduce_sum(old_logvars) + log_det_cov_new = layers.reduce_sum(logvars) + tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars)) + kl = 0.5 * (layers.reduce_sum( + layers.square(means - old_means) / layers.exp(logvars), dim=1) + ( + log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim) + return kl + + def define_predict(self, obs): + """ Use the policy model of self.model to predict the means of actions + """ + means, logvars = self.model.policy(obs) + return means + + def define_sample(self, obs): + """ Use the policy model of self.model to sample actions + """ + sampled_act = self.model.policy_sample(obs) + return sampled_act + + def define_policy_learn(self, obs, actions, advantages, beta=None): + """ Learn the policy model with: + 1. CLIP loss: Clipped Surrogate Objective + 2. KLPEN loss: Adaptive KL Penalty Objective + See: https://arxiv.org/pdf/1707.02286.pdf + + Args: + obs: Tensor, (batch_size, obs_dim) + actions: Tensor, (batch_size, act_dim) + advantages: Tensor (batch_size, ) + beta: Tensor (1) or None + if None, use the CLIP loss; else, use the KLPEN loss.
+ """ + old_means, old_logvars = self.old_policy_model.policy(obs) + old_means.stop_gradient = True + old_logvars.stop_gradient = True + old_logprob = self._calc_logprob(actions, old_means, old_logvars) + + means, logvars = self.model.policy(obs) + logprob = self._calc_logprob(actions, means, logvars) + + kl = self._calc_kl(means, logvars, old_means, old_logvars) + kl = layers.reduce_mean(kl) + + if beta is None: # Clipped Surrogate Objective + pg_ratio = layers.exp(logprob - old_logprob) + clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon, + 1 + self.epsilon) + surrogate_loss = layers.elementwise_min( + advantages * pg_ratio, advantages * clipped_pg_ratio) + loss = 0 - layers.reduce_mean(surrogate_loss) + else: # Adaptive KL Penalty Objective + # policy gradient loss + loss1 = 0 - layers.reduce_mean( + advantages * layers.exp(logprob - old_logprob)) + # adaptive kl loss + loss2 = kl * beta + loss = loss1 + loss2 + optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr) + optimizer.minimize(loss) + return loss, kl + + def define_value_predict(self, obs): + """ Use value model of self.model to predict value of obs + """ + return self.model.value(obs) + + def define_value_learn(self, obs, val): + """ Learn value model with square error cost + """ + predict_val = self.model.value(obs) + loss = layers.square_error_cost(predict_val, val) + loss = layers.reduce_mean(loss) + optimizer = fluid.optimizer.AdamOptimizer(self.value_lr) + optimizer.minimize(loss) + return loss + + def sync_old_policy(self, gpu_id): + """ Synchronize parameters of self.model.policy_model to self.old_policy_model + """ + self.model.policy_model.sync_params_to( + self.old_policy_model, gpu_id=gpu_id) diff --git a/parl/utils/tests/utils_test.py b/parl/utils/tests/utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d5811b2f1488dc15b8aa22e73c227a836d1d0f10 --- /dev/null +++ b/parl/utils/tests/utils_test.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +from parl.utils import action_mapping + + +class TestUtils(unittest.TestCase): + def test_action_mapping(self): + origin_act = np.array([-1.0, 0.0, 1.0]) + + mapped_act = action_mapping(origin_act, 0.0, 1.0) + self.assertListEqual(list(mapped_act), [0.0, 0.5, 1.0]) + + mapped_act = action_mapping(origin_act, -2.0, 2.0) + self.assertListEqual(list(mapped_act), [-2.0, 0.0, 2.0]) + + mapped_act = action_mapping(origin_act, -5.0, 10.0) + self.assertListEqual(list(mapped_act), [-5.0, 2.5, 10.0]) + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/utils/utils.py b/parl/utils/utils.py index b9e502625b65b844677b5f4867234b05cc3dd413..94e1ab6bb7caed564f9398848a40eb860b3c2077 100644 --- a/parl/utils/utils.py +++ b/parl/utils/utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__all__ = ['has_func'] +__all__ = ['has_func', 'action_mapping'] def has_func(obj, fun): @@ -26,3 +26,21 @@ """ check_fun = getattr(obj, fun, None) return callable(check_fun) + + +def action_mapping(model_output_act, low_bound, high_bound): + """ Map actions from the model output range [-1, 1] + to the environment action range [low_bound, high_bound]. + + Args: + model_output_act: np.array whose values are in [-1, 1] + low_bound: float, low bound of env action space + high_bound: float, high bound of env action space + + Returns: + action: np.array whose values are in [low_bound, high_bound] + """ + assert high_bound > low_bound + action = low_bound + (model_output_act - (-1.0)) * ( + (high_bound - low_bound) / 2.0) + return action
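
For quick reference, below is a minimal NumPy sketch (not part of the patch) of how the training scripts in this diff use the new `action_mapping` helper: the model output is clipped to [-1, 1] (after exploration noise, in the DDPG case) and then rescaled to the environment's action bounds before `env.step`. The local `map_to_bounds` function only mirrors the formula of `parl.utils.action_mapping` so the snippet runs on its own; the bounds and shapes are made up for illustration.

```python
import numpy as np


def map_to_bounds(model_output_act, low_bound, high_bound):
    # Mirrors parl.utils.action_mapping: shift [-1, 1] to [0, 2],
    # then rescale linearly into [low_bound, high_bound].
    assert high_bound > low_bound
    return low_bound + (model_output_act - (-1.0)) * (
        (high_bound - low_bound) / 2.0)


rng = np.random.RandomState(0)
raw_action = np.tanh(rng.randn(3))  # stand-in for a model output in [-1, 1]
# Add exploration noise (as in the DDPG training loop), then clip back to [-1, 1]
noisy_action = np.clip(raw_action + rng.normal(0.0, 1.0, size=3), -1.0, 1.0)
# Hypothetical environment bounds, i.e. env.action_space.low[0] / high[0]
env_action = map_to_bounds(noisy_action, -2.0, 2.0)
print(env_action)  # values now lie within [-2.0, 2.0]
```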