diff --git a/dygraph/reinforcement_learning/README.md b/dygraph/reinforcement_learning/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc7f3449a9469527e353971206d62b129604111f
--- /dev/null
+++ b/dygraph/reinforcement_learning/README.md
@@ -0,0 +1,105 @@
+# Reinforcement Learning
+This page describes how to implement classic reinforcement learning algorithms with PaddlePaddle in DyGraph mode, covering [Installation](#installation), [Training](#training), [Output](#output), and [Evaluation](#evaluation).
+
+---
+## Contents
+- [Installation](#installation)
+- [Training](#training)
+- [Output](#output)
+- [Evaluation](#evaluation)
+
+## Installation
+
+Running the sample code in this directory requires PaddlePaddle Fluid v1.4.0 or later. If the PaddlePaddle in your environment is older than this version, please update it by following the installation documentation.
+
+Besides paddle, please install the gym toolkit (https://gym.openai.com/ ), which provides a rich collection of simulation environments and greatly facilitates reinforcement learning research.
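+
+For example, gym can usually be installed with pip (adjust the command to your own Python environment and gym version):
+```
+pip install gym
+```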
+
+## Training
+This tutorial uses `CartPole` as the simulation environment (about CartPole: https://gym.openai.com/envs/CartPole-v0 ) and includes two classic reinforcement learning algorithms, reinforce.py and actor_critic.py. Training can be started as follows:
+```
+env CUDA_VISIBLE_DEVICES=0 python reinforce.py
+```
+or
+```
+env CUDA_VISIBLE_DEVICES=0 python actor_critic.py
+```
+
+## Output
+Once training starts, you will see output similar to the following.
+
+```
+env CUDA_VISIBLE_DEVICES=0 python reinforce.py
+
+Episode 10 Last reward: 20.00 Average reward: 14.96
+Episode 20 Last reward: 35.00 Average reward: 20.56
+Episode 30 Last reward: 26.00 Average reward: 23.18
+Episode 40 Last reward: 21.00 Average reward: 28.68
+Episode 50 Last reward: 21.00 Average reward: 30.06
+Episode 60 Last reward: 27.00 Average reward: 37.21
+Episode 70 Last reward: 67.00 Average reward: 47.69
+Episode 80 Last reward: 46.00 Average reward: 55.25
+Episode 90 Last reward: 113.00 Average reward: 80.11
+Episode 100 Last reward: 124.00 Average reward: 89.36
+Episode 110 Last reward: 97.00 Average reward: 98.29
+Episode 120 Last reward: 200.00 Average reward: 110.29
+Episode 130 Last reward: 200.00 Average reward: 142.01
+Episode 140 Last reward: 157.00 Average reward: 162.18
+Episode 150 Last reward: 101.00 Average reward: 165.37
+Episode 160 Last reward: 119.00 Average reward: 156.74
+Episode 170 Last reward: 114.00 Average reward: 146.62
+Episode 180 Last reward: 149.00 Average reward: 140.74
+Episode 190 Last reward: 114.00 Average reward: 149.52
+Episode 200 Last reward: 124.00 Average reward: 130.40
+Episode 210 Last reward: 103.00 Average reward: 119.44
+Episode 220 Last reward: 200.00 Average reward: 120.50
+Episode 230 Last reward: 172.00 Average reward: 126.33
+Episode 240 Last reward: 187.00 Average reward: 139.02
+Episode 250 Last reward: 170.00 Average reward: 154.12
+Episode 260 Last reward: 172.00 Average reward: 167.44
+Episode 270 Last reward: 195.00 Average reward: 175.00
+Episode 280 Last reward: 200.00 Average reward: 178.56
+Episode 290 Last reward: 200.00 Average reward: 187.16
+Episode 300 Last reward: 200.00 Average reward: 192.32
+Solved! Running reward is now 195.156645521 and the last episode runs to 200 time steps!
+```
+or
+```
+env CUDA_VISIBLE_DEVICES=0 python actor_critic.py
+
+Episode 10 Last reward: 131.00 Average reward: 23.54
+Episode 20 Last reward: 89.00 Average reward: 31.96
+Episode 30 Last reward: 108.00 Average reward: 76.43
+Episode 40 Last reward: 20.00 Average reward: 83.57
+Episode 50 Last reward: 19.00 Average reward: 56.94
+Episode 60 Last reward: 53.00 Average reward: 48.44
+Episode 70 Last reward: 147.00 Average reward: 82.04
+Episode 80 Last reward: 90.00 Average reward: 94.94
+Episode 90 Last reward: 144.00 Average reward: 97.71
+Episode 100 Last reward: 200.00 Average reward: 133.73
+Episode 110 Last reward: 200.00 Average reward: 158.69
+Episode 120 Last reward: 159.00 Average reward: 162.60
+Episode 130 Last reward: 150.00 Average reward: 159.57
+Episode 140 Last reward: 195.00 Average reward: 163.27
+Episode 150 Last reward: 143.00 Average reward: 157.88
+Episode 160 Last reward: 113.00 Average reward: 151.82
+Episode 170 Last reward: 147.00 Average reward: 146.14
+Episode 180 Last reward: 199.00 Average reward: 150.11
+Episode 190 Last reward: 200.00 Average reward: 168.77
+Episode 200 Last reward: 200.00 Average reward: 177.60
+Episode 210 Last reward: 102.00 Average reward: 174.29
+Episode 220 Last reward: 189.00 Average reward: 171.91
+Episode 230 Last reward: 200.00 Average reward: 169.92
+Episode 240 Last reward: 200.00 Average reward: 181.99
+Episode 250 Last reward: 200.00 Average reward: 189.22
+Episode 260 Last reward: 200.00 Average reward: 188.75
+Episode 270 Last reward: 180.00 Average reward: 192.27
+Episode 280 Last reward: 200.00 Average reward: 175.83
+Episode 290 Last reward: 200.00 Average reward: 185.53
+Episode 300 Last reward: 200.00 Average reward: 191.33
+Episode 310 Last reward: 200.00 Average reward: 194.81
+Solved! Running reward is now 195.071295316 and the last episode runs to 200 time steps!
+```
+
+## Evaluation
+Reinforcement learning models are generally trained while interacting with the environment, continuously improving from the feedback they receive, so the quality of a reinforcement learning algorithm can be judged simply by watching how the reward evolves during training.
+In gym, each game usually defines its own solve threshold; once the model reaches this threshold, training is considered complete.
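+
+As a minimal sketch (assuming the installed gym version exposes the threshold on `env.spec`, which both training scripts in this directory already rely on), the threshold can be inspected and used like this:
+```
+import gym
+
+env = gym.make('CartPole-v0')
+# The registered solve threshold (195.0 for CartPole-v0).
+print(env.spec.reward_threshold)
+
+# Both scripts keep an exponentially averaged reward and stop training
+# once it passes the threshold:
+#   running_reward = 0.05 * ep_reward + 0.95 * running_reward
+#   if running_reward > env.spec.reward_threshold:
+#       print("Solved!")
+```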
diff --git a/dygraph/reinforcement_learning/actor_critic.py b/dygraph/reinforcement_learning/actor_critic.py
new file mode 100644
index 0000000000000000000000000000000000000000..acadf496d4abe8d1234165df180a20b59db6777c
--- /dev/null
+++ b/dygraph/reinforcement_learning/actor_critic.py
@@ -0,0 +1,199 @@
+import argparse
+import gym
+import numpy as np
+from itertools import count
+from collections import namedtuple
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+import paddle.fluid.framework as framework
+
+parser = argparse.ArgumentParser(description='PaddlePaddle actor-critic example')
+parser.add_argument(
+    '--gamma',
+    type=float,
+    default=0.99,
+    metavar='G',
+    help='discount factor (default: 0.99)')
+parser.add_argument(
+    '--seed',
+    type=int,
+    default=543,
+    metavar='N',
+    help='random seed (default: 543)')
+parser.add_argument(
+    '--render', action='store_true', help='render the environment')
+parser.add_argument(
+    '--log-interval',
+    type=int,
+    default=10,
+    metavar='N',
+    help='interval between training status logs (default: 10)')
+args = parser.parse_args()
+
+env = gym.make('CartPole-v0')
+env.seed(args.seed)
+
+SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
+
+
+class Policy(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Policy, self).__init__(name_scope)
+
+        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.action_head = nn.FC(self.full_name(), size=2)
+        self.value_head = nn.FC(self.full_name(), size=1)
+
+        self.saved_actions = []
+        self.rewards = []
+
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.relu(x)
+
+        action_scores = self.action_head(x)
+        state_values = self.value_head(x)
+
+        return fluid.layers.softmax(action_scores, axis=-1), state_values
+
+
+with fluid.dygraph.guard():
+    policy = Policy("PolicyModel")
+
+    eps = np.finfo(np.float32).eps.item()
+    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=3e-2)
+
+    def get_mean_and_std(values=[]):
+        n = 0.
+        s = 0.
+        for val in values:
+            s += val
+            n += 1
+        mean = s / n
+
+        std = 0.
+        for val in values:
+            std += (val - mean) * (val - mean)
+        std /= n
+        std = math.sqrt(std)
+
+        return mean, std
+
+    # Sample an action index from the softmax probabilities and return it
+    # together with a one-hot mask over the action space.
+    def sample_action(probs):
+        sample = np.random.random()
+        idx = 0
+
+        while idx < len(probs) and sample > probs[idx]:
+            sample -= probs[idx]
+            idx += 1
+        mask = [0.] * len(probs)
+        mask[idx] = 1.
+
+        return idx, np.array([mask]).astype("float32")
+
+    # Greedy action selection (defined for evaluation; not called during training).
+    def choose_best_action(probs):
+        idx = 0 if probs[0] > probs[1] else 1
+        mask = [1., 0.] if idx == 0 else [0., 1.]
+
+        return idx, np.array([mask]).astype("float32")
+
+    def select_action(state):
+        state = fluid.dygraph.base.to_variable(state)
+        state.stop_gradient = True
+        probs, state_value = policy(state)
+        np_probs = probs.numpy()
+
+        action, _mask = sample_action(np_probs[0])
+
+        mask = fluid.dygraph.base.to_variable(_mask)
+        mask.stop_gradient = True
+
+        # Keep only the sampled action's log-probability via the one-hot mask.
+        loss_probs = fluid.layers.log(probs)
+        loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
+        loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+        policy.saved_actions.append(SavedAction(loss_probs, state_value))
+
+        return action
+
+    # Compute discounted, normalized returns, build the actor (policy) and
+    # critic (value) losses, and take one optimizer step.
+    def finish_episode():
+        R = 0
+        saved_actions = policy.saved_actions
+        policy_losses = []
+        value_losses = []
+        returns = []
+        for r in policy.rewards[::-1]:
+            R = r + args.gamma * R
+            returns.insert(0, R)
+
+        mean, std = get_mean_and_std(returns)
+        returns = np.array(returns).astype("float32")
+        returns = (returns - mean) / (std + eps)
+
+        for (log_prob, value), R in zip(saved_actions, returns):
+            advantage = R - value[0][0]
+
+            log_prob_numpy = log_prob.numpy()
+            R_numpy = np.ones_like(log_prob_numpy).astype("float32")
+            _R = -1 * advantage * R_numpy
+            _R = fluid.dygraph.base.to_variable(_R)
+            _R.stop_gradient = True
+
+            policy_loss = fluid.layers.elementwise_mul(_R, log_prob)
+            policy_losses.append(policy_loss)
+
+            _R2 = np.ones_like(value.numpy()).astype("float32") * R
+            _R2 = fluid.dygraph.base.to_variable(_R2)
+            _R2.stop_gradient = True
+
+            value_loss = fluid.layers.smooth_l1(value, _R2, sigma=1.0)
+            value_losses.append(value_loss)
+
+        all_policy_loss = fluid.layers.concat(policy_losses)
+        all_policy_loss = fluid.layers.reduce_sum(all_policy_loss)
+
+        all_value_loss = fluid.layers.concat(value_losses)
+        all_value_loss = fluid.layers.reduce_sum(all_value_loss)
+
+        loss = all_policy_loss + all_value_loss
+
+        loss.backward()
+        optimizer.minimize(loss)
+
+        policy.clear_gradients()
+        del policy.rewards[:]
+        del policy.saved_actions[:]
+
+        return returns
+
+    running_reward = 10
+    for i_episode in count(1):
+        state, ep_reward = env.reset(), 0
+        for t in range(1, 10000):  # Don't infinite loop while learning
+            state = np.array(state).astype("float32")
+            action = select_action(state)
+            state, reward, done, _ = env.step(action)
+
+            if args.render:
+                env.render()
+
+            policy.rewards.append(reward)
+            ep_reward += reward
+
+            if done:
+                break
+
+        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
+        returns = finish_episode()
+        if i_episode % args.log_interval == 0:
+            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
+                  format(i_episode, ep_reward, running_reward))
+            #print(returns)
+        if running_reward > env.spec.reward_threshold:
+            print("Solved! Running reward is now {} and "
+                  "the last episode runs to {} time steps!".format(
+                      running_reward, t))
+            break
diff --git a/dygraph/reinforcement_learning/reinforce.py b/dygraph/reinforcement_learning/reinforce.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b02135b6cef527f287209c3420a6ecf7f0bd0db
--- /dev/null
+++ b/dygraph/reinforcement_learning/reinforce.py
@@ -0,0 +1,184 @@
+import argparse
+import gym
+import numpy as np
+from itertools import count
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+import paddle.fluid.framework as framework
+
+parser = argparse.ArgumentParser(description='PaddlePaddle REINFORCE example')
+parser.add_argument(
+    '--gamma',
+    type=float,
+    default=0.99,
+    metavar='G',
+    help='discount factor (default: 0.99)')
+parser.add_argument(
+    '--seed',
+    type=int,
+    default=543,
+    metavar='N',
+    help='random seed (default: 543)')
+parser.add_argument(
+    '--render', action='store_true', help='render the environment')
+parser.add_argument(
+    '--log-interval',
+    type=int,
+    default=10,
+    metavar='N',
+    help='interval between training status logs (default: 10)')
+args = parser.parse_args()
+
+env = gym.make('CartPole-v0')
+env.seed(args.seed)
+
+
+class Policy(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Policy, self).__init__(name_scope)
+
+        self.affine1 = nn.FC(self.full_name(), size=128)
+        self.affine2 = nn.FC(self.full_name(), size=2)
+        self.dropout_ratio = 0.6
+
+        self.saved_log_probs = []
+        self.rewards = []
+
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.dropout(x, self.dropout_ratio)
+        x = fluid.layers.relu(x)
+        action_scores = self.affine2(x)
+
+        self._x_for_debug = x
+
+        return fluid.layers.softmax(action_scores, axis=1)
+
+
+with fluid.dygraph.guard():
+    policy = Policy("PolicyModel")
+
+    eps = np.finfo(np.float32).eps.item()
+    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=1e-2)
+
+    def get_mean_and_std(values=[]):
+        n = 0.
+        s = 0.
+        for val in values:
+            s += val
+            n += 1
+        mean = s / n
+
+        std = 0.
+        for val in values:
+            std += (val - mean) * (val - mean)
+        std /= n
+        std = math.sqrt(std)
+
+        return mean, std
+
+    # Sample an action index from the softmax probabilities and return it
+    # together with a one-hot mask over the action space.
+    def sample_action(probs):
+        sample = np.random.random()
+        idx = 0
+
+        while idx < len(probs) and sample > probs[idx]:
+            sample -= probs[idx]
+            idx += 1
+        mask = [0.] * len(probs)
+        mask[idx] = 1.
+
+        return idx, np.array([mask]).astype("float32")
+
+    # Greedy action selection (defined for evaluation; not called during training).
+    def choose_best_action(probs):
+        idx = 0 if probs[0] > probs[1] else 1
+        mask = [1., 0.] if idx == 0 else [0., 1.]
+
+        return idx, np.array([mask]).astype("float32")
+
+    def select_action(state):
+        state = fluid.dygraph.base.to_variable(state)
+        state.stop_gradient = True
+        loss_probs = policy(state)
+        probs = loss_probs.numpy()
+
+        action, _mask = sample_action(probs[0])
+
+        mask = fluid.dygraph.base.to_variable(_mask)
+        mask.stop_gradient = True
+
+        # Keep only the sampled action's log-probability via the one-hot mask.
+        loss_probs = fluid.layers.log(loss_probs)
+        loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
+        loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+        policy.saved_log_probs.append(loss_probs)
+
+        return action
+
+    # Compute discounted, normalized returns and the REINFORCE loss, then take
+    # one optimizer step.
+    def finish_episode():
+        R = 0
+        policy_loss = []
+        returns = []
+        for r in policy.rewards[::-1]:
+            R = r + args.gamma * R
+            returns.insert(0, R)
+
+        mean, std = get_mean_and_std(returns)
+
+        returns = np.array(returns).astype("float32")
+        returns = (returns - mean) / (std + eps)
+
+        for log_prob, R in zip(policy.saved_log_probs, returns):
+            log_prob_numpy = log_prob.numpy()
+
+            R_numpy = np.ones_like(log_prob_numpy).astype("float32")
+            _R = -1 * R * R_numpy
+            _R = fluid.dygraph.base.to_variable(_R)
+            _R.stop_gradient = True
+            curr_loss = fluid.layers.elementwise_mul(_R, log_prob)
+            policy_loss.append(curr_loss)
+
+        policy_loss = fluid.layers.concat(policy_loss)
+        policy_loss = fluid.layers.reduce_sum(policy_loss)
+
+        policy_loss.backward()
+        optimizer.minimize(policy_loss)
+
+        dy_grad = policy._x_for_debug.gradient()
+
+        policy.clear_gradients()
+        del policy.rewards[:]
+        del policy.saved_log_probs[:]
+
+        return returns
+
+    running_reward = 10
+    for i_episode in count(1):
+        state, ep_reward = env.reset(), 0
+        for t in range(1, 10000):  # Don't infinite loop while learning
+            state = np.array(state).astype("float32")
+            action = select_action(state)
+            state, reward, done, _ = env.step(action)
+
+            if args.render:
+                env.render()
+
+            policy.rewards.append(reward)
+            ep_reward += reward
+
+            if done:
+                break
+
+        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
+        returns = finish_episode()
+        if i_episode % args.log_interval == 0:
+            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
+                  format(i_episode, ep_reward, running_reward))
+            #print(returns)
+        if running_reward > env.spec.reward_threshold:
+            print("Solved! Running reward is now {} and "
+                  "the last episode runs to {} time steps!".format(
+                      running_reward, t))
+            break