diff --git a/dygraph/reinforcement_learning/actor_critic.py b/dygraph/reinforcement_learning/actor_critic.py
index acadf496d4abe8d1234165df180a20b59db6777c..c94b0a7432ea4d633a1aa8931f8954e63c08769f 100644
--- a/dygraph/reinforcement_learning/actor_critic.py
+++ b/dygraph/reinforcement_learning/actor_critic.py
@@ -24,6 +24,7 @@ parser.add_argument(
     help='random seed (default: 543)')
 parser.add_argument(
     '--render', action='store_true', help='render the environment')
+parser.add_argument('--save_dir', type=str, default="./saved_models_ac")
 parser.add_argument(
     '--log-interval',
     type=int,
@@ -61,6 +62,9 @@ class Policy(fluid.dygraph.Layer):
 
 
 with fluid.dygraph.guard():
+    fluid.default_startup_program().random_seed = args.seed
+    fluid.default_main_program().random_seed = args.seed
+    np.random.seed(args.seed)
     policy = Policy("PolicyModel")
 
     eps = np.finfo(np.float32).eps.item()
@@ -196,4 +200,5 @@ with fluid.dygraph.guard():
             print("Solved! Running reward is now {} and "
                   "the last episode runs to {} time steps!".format(
                       running_reward, t))
+            fluid.dygraph.save_persistables(policy.state_dict(), args.save_dir)
             break
diff --git a/dygraph/reinforcement_learning/reinforce.py b/dygraph/reinforcement_learning/reinforce.py
index 2b02135b6cef527f287209c3420a6ecf7f0bd0db..84f9ac0246f31b4018b48d43fd9c90ad68a738f0 100644
--- a/dygraph/reinforcement_learning/reinforce.py
+++ b/dygraph/reinforcement_learning/reinforce.py
@@ -23,6 +23,7 @@ parser.add_argument(
     help='random seed (default: 543)')
 parser.add_argument(
     '--render', action='store_true', help='render the environment')
+parser.add_argument('--save_dir', type=str, default="./saved_models")
 parser.add_argument(
     '--log-interval',
     type=int,
@@ -59,6 +60,10 @@ class Policy(fluid.dygraph.Layer):
 
 
 with fluid.dygraph.guard():
+    fluid.default_startup_program().random_seed = args.seed
+    fluid.default_main_program().random_seed = args.seed
+    np.random.seed(args.seed)
+
     policy = Policy("PolicyModel")
 
     eps = np.finfo(np.float32).eps.item()
@@ -176,9 +181,10 @@ with fluid.dygraph.guard():
         if i_episode % args.log_interval == 0:
             print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                   format(i_episode, ep_reward, running_reward))
-            #print(returns)
+
         if running_reward > env.spec.reward_threshold:
             print("Solved! Running reward is now {} and "
                   "the last episode runs to {} time steps!".format(
                       running_reward, t))
+            fluid.dygraph.save_persistables(policy.state_dict(), args.save_dir)
             break
diff --git a/dygraph/reinforcement_learning/test_actor_critic_load.py b/dygraph/reinforcement_learning/test_actor_critic_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..e052b6dde12503767e8de7c0195cd80afec6ac98
--- /dev/null
+++ b/dygraph/reinforcement_learning/test_actor_critic_load.py
@@ -0,0 +1,194 @@
+import argparse
+import gym
+import numpy as np
+from itertools import count
+from collections import namedtuple
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+import paddle.fluid.framework as framework
+
+parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
+parser.add_argument(
+    '--gamma',
+    type=float,
+    default=0.99,
+    metavar='G',
+    help='discount factor (default: 0.99)')
+parser.add_argument(
+    '--seed',
+    type=int,
+    default=543,
+    metavar='N',
+    help='random seed (default: 543)')
+parser.add_argument(
+    '--render', action='store_true', help='render the environment')
+parser.add_argument('--save_dir', type=str, default="./saved_models_ac")
+parser.add_argument(
+    '--log-interval',
+    type=int,
+    default=10,
+    metavar='N',
+    help='interval between training status logs (default: 10)')
+args = parser.parse_args()
+
+env = gym.make('CartPole-v0')
+env.seed(args.seed)
+
+SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
+
+
+class Policy(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Policy, self).__init__(name_scope)
+
+        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.action_head = nn.FC(self.full_name(), size=2)
+        self.value_head = nn.FC(self.full_name(), size=1)
+
+        self.saved_actions = []
+        self.rewards = []
+
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.relu(x)
+
+        action_scores = self.action_head(x)
+        state_values = self.value_head(x)
+
+        return fluid.layers.softmax(action_scores, axis=-1), state_values
+
+
+with fluid.dygraph.guard():
+    fluid.default_startup_program().random_seed = args.seed
+    fluid.default_main_program().random_seed = args.seed
+    np.random.seed(args.seed)
+    policy = Policy("PolicyModel")
+
+    eps = np.finfo(np.float32).eps.item()
+    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=3e-2)
+
+    def get_mean_and_std(values=[]):
+        n = 0.
+        s = 0.
+        for val in values:
+            s += val
+            n += 1
+        mean = s / n
+
+        std = 0.
+        for val in values:
+            std += (val - mean) * (val - mean)
+        std /= n
+        std = math.sqrt(std)
+
+        return mean, std
+
+    def sample_action(probs):
+        sample = np.random.random()
+        idx = 0
+
+        while idx < len(probs) and sample > probs[idx]:
+            sample -= probs[idx]
+            idx += 1
+        mask = [0.] * len(probs)
+        mask[idx] = 1.
+
+        return idx, np.array([mask]).astype("float32")
+
+    def choose_best_action(probs):
+        idx = 0 if probs[0] > probs[1] else 1
+        mask = [1., 0.] if idx == 0 else [0., 1.]
+
+        return idx, np.array([mask]).astype("float32")
+
+    def select_action(state):
+        state = fluid.dygraph.base.to_variable(state)
+        state.stop_gradient = True
+        probs, state_value = policy(state)
+        np_probs = probs.numpy()
+
+        action, _mask = sample_action(np_probs[0])
+
+        mask = fluid.dygraph.base.to_variable(_mask)
+        mask.stop_gradient = True
+
+        loss_probs = fluid.layers.log(probs)
+        loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
+        loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+        policy.saved_actions.append(SavedAction(loss_probs, state_value))
+
+        return action
+
+    def finish_episode():
+        R = 0
+        saved_actions = policy.saved_actions
+        policy_losses = []
+        value_losses = []
+        returns = []
+        for r in policy.rewards[::-1]:
+            R = r + args.gamma * R
+            returns.insert(0, R)
+
+        mean, std = get_mean_and_std(returns)
+        returns = np.array(returns).astype("float32")
+        returns = (returns - mean) / (std + eps)
+
+        for (log_prob, value), R in zip(saved_actions, returns):
+            advantage = R - value[0][0]
+
+            log_prob_numpy = log_prob.numpy()
+            R_numpy = np.ones_like(log_prob_numpy).astype("float32")
+            _R = -1 * advantage * R_numpy
+            _R = fluid.dygraph.base.to_variable(_R)
+            _R.stop_gradient = True
+
+            policy_loss = fluid.layers.elementwise_mul(_R, log_prob)
+            policy_losses.append(policy_loss)
+
+            _R2 = np.ones_like(value.numpy()).astype("float32") * R
+            _R2 = fluid.dygraph.base.to_variable(_R2)
+            _R2.stop_gradient = True
+
+            value_loss = fluid.layers.smooth_l1(value, _R2, sigma=1.0)
+            value_losses.append(value_loss)
+
+        all_policy_loss = fluid.layers.concat(policy_losses)
+        all_policy_loss = fluid.layers.reduce_sum(all_policy_loss)
+
+        all_value_loss = fluid.layers.concat(value_losses)
+        all_value_loss = fluid.layers.reduce_sum(all_value_loss)
+
+        loss = all_policy_loss + all_value_loss
+
+        loss.backward()
+        optimizer.minimize(loss)
+
+        policy.clear_gradients()
+        del policy.rewards[:]
+        del policy.saved_actions[:]
+
+        return returns
+
+    running_reward = 10
+    policy.load_dict(fluid.dygraph.load_persistables(args.save_dir))
+
+    state, ep_reward = env.reset(), 0
+    for t in range(1, 10000):  # Don't infinite loop while learning
+        state = np.array(state).astype("float32")
+        action = select_action(state)
+        state, reward, done, _ = env.step(action)
+
+        if args.render:
+            env.render()
+
+        policy.rewards.append(reward)
+        ep_reward += reward
+
+        if done:
+            break
+
+    print('Last reward: {:.2f}'.format(ep_reward))
diff --git a/dygraph/reinforcement_learning/test_reinforce_load.py b/dygraph/reinforcement_learning/test_reinforce_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ecc1a8e1b8affb2ced51e18c5d5650dcccc07d5
--- /dev/null
+++ b/dygraph/reinforcement_learning/test_reinforce_load.py
@@ -0,0 +1,180 @@
+import argparse
+import gym
+import numpy as np
+from itertools import count
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+import paddle.fluid.framework as framework
+
+parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
+parser.add_argument(
+    '--gamma',
+    type=float,
+    default=0.99,
+    metavar='G',
+    help='discount factor (default: 0.99)')
+parser.add_argument(
+    '--seed',
+    type=int,
+    default=543,
+    metavar='N',
+    help='random seed (default: 543)')
+parser.add_argument(
+    '--render', action='store_true', help='render the environment')
+parser.add_argument('--save_dir', type=str, default="./saved_models")
+parser.add_argument(
+    '--log-interval',
+    type=int,
+    default=10,
+    metavar='N',
+    help='interval between training status logs (default: 10)')
+args = parser.parse_args()
+
+env = gym.make('CartPole-v0')
+env.seed(args.seed)
+
+
+class Policy(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Policy, self).__init__(name_scope)
+
+        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.affine2 = nn.FC(self.full_name(), size=2)
+        self.dropout_ratio = 0.6
+
+        self.saved_log_probs = []
+        self.rewards = []
+
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.dropout(x, self.dropout_ratio)
+        x = fluid.layers.relu(x)
+        action_scores = self.affine2(x)
+
+        self._x_for_debug = x
+
+        return fluid.layers.softmax(action_scores, axis=1)
+
+
+with fluid.dygraph.guard():
+    fluid.default_startup_program().random_seed = args.seed
+    fluid.default_main_program().random_seed = args.seed
+    np.random.seed(args.seed)
+
+    policy = Policy("PolicyModel")
+
+    eps = np.finfo(np.float32).eps.item()
+    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=1e-2)
+
+    def get_mean_and_std(values=[]):
+        n = 0.
+        s = 0.
+        for val in values:
+            s += val
+            n += 1
+        mean = s / n
+
+        std = 0.
+        for val in values:
+            std += (val - mean) * (val - mean)
+        std /= n
+        std = math.sqrt(std)
+
+        return mean, std
+
+    def sample_action(probs):
+        sample = np.random.random()
+        idx = 0
+
+        while idx < len(probs) and sample > probs[idx]:
+            sample -= probs[idx]
+            idx += 1
+        mask = [0.] * len(probs)
+        mask[idx] = 1.
+
+        return idx, np.array([mask]).astype("float32")
+
+    def choose_best_action(probs):
+        idx = 0 if probs[0] > probs[1] else 1
+        mask = [1., 0.] if idx == 0 else [0., 1.]
+
+        return idx, np.array([mask]).astype("float32")
+
+    def select_action(state):
+        state = fluid.dygraph.base.to_variable(state)
+        state.stop_gradient = True
+        loss_probs = policy(state)
+        probs = loss_probs.numpy()
+
+        action, _mask = sample_action(probs[0])
+
+        mask = fluid.dygraph.base.to_variable(_mask)
+        mask.stop_gradient = True
+
+        loss_probs = fluid.layers.log(loss_probs)
+        loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
+        loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+        policy.saved_log_probs.append(loss_probs)
+
+        return action
+
+    def finish_episode():
+        R = 0
+        policy_loss = []
+        returns = []
+        for r in policy.rewards[::-1]:
+            R = r + args.gamma * R
+            returns.insert(0, R)
+
+        mean, std = get_mean_and_std(returns)
+
+        returns = np.array(returns).astype("float32")
+        returns = (returns - mean) / (std + eps)
+
+        for log_prob, R in zip(policy.saved_log_probs, returns):
+            log_prob_numpy = log_prob.numpy()
+
+            R_numpy = np.ones_like(log_prob_numpy).astype("float32")
+            _R = -1 * R * R_numpy
+            _R = fluid.dygraph.base.to_variable(_R)
+            _R.stop_gradient = True
+            curr_loss = fluid.layers.elementwise_mul(_R, log_prob)
+            policy_loss.append(curr_loss)
+
+        policy_loss = fluid.layers.concat(policy_loss)
+        policy_loss = fluid.layers.reduce_sum(policy_loss)
+
+        policy_loss.backward()
+        optimizer.minimize(policy_loss)
+
+        dy_grad = policy._x_for_debug.gradient()
+
+        policy.clear_gradients()
+        del policy.rewards[:]
+        del policy.saved_log_probs[:]
+
+        return returns
+
+    running_reward = 10
+    state, ep_reward = env.reset(), 0
+    policy.load_dict(fluid.dygraph.load_persistables(args.save_dir))
+
+    for t in range(1, 10000):  # Don't infinite loop while learning
+        state = np.array(state).astype("float32")
+        action = select_action(state)
+        state, reward, done, _ = env.step(action)
+
+        if args.render:
+            env.render()
+
+        policy.rewards.append(reward)
+        ep_reward += reward
+
+        if done:
+            break
+
+    print('Test reward: {:.2f}'.format(ep_reward))
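
Usage note: the new test_actor_critic_load.py and test_reinforce_load.py scripts expect that actor_critic.py or reinforce.py has already been run, so that fluid.dygraph.save_persistables has written a checkpoint into --save_dir (./saved_models_ac and ./saved_models by default). A minimal sketch of the save/load round-trip the patch relies on, using only the calls that appear above (Policy is the class defined in these scripts; the directory name is illustrative):

    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        policy = Policy("PolicyModel")
        # ... training episodes ...
        # persist the trained parameters to disk
        fluid.dygraph.save_persistables(policy.state_dict(), "./saved_models")

    with fluid.dygraph.guard():
        # rebuild the model under the same name scope, then restore its parameters
        policy = Policy("PolicyModel")
        policy.load_dict(fluid.dygraph.load_persistables("./saved_models"))
        # policy now carries the trained weights and can be evaluated without further training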