Unverified commit 1a5d3def, authored by Aurelius84, committed by GitHub

[Dy2stat] Add Reinforcement learning unittest (#25445)

* add reinforcement learning model test=develop

* align backward test=develop

* add gym in paddle_build.sh test=develop

* rm pip install in script test=develop

* refine paddle_build.sh test=develop

* fix sed error in macOS test=develop

* polish code test=develop
Parent 5a2d15a1
paddle_build.sh
@@ -64,6 +64,9 @@ function cmake_base() {
        # Delete previous built whl packages
        rm -rf python/dist 2>/dev/null || true
+       # `gym` is only used in unittest, it's not suitable to add in requirements.txt.
+       # Add it dynamically.
+       echo "gym" >> ${PADDLE_ROOT}/python/requirements.txt
        # Support build for all python versions, currently
        # including cp27-cp27m and cp27-cp27mu.
        PYTHON_FLAGS=""
@@ -119,6 +122,8 @@ function cmake_base() {
                exit 1
            fi
        fi
+       # delete `gym` to avoid modifying requirements.txt in *.whl
+       sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
    else
        if [ "$1" != "" ]; then
            echo "using python abi: $1"
@@ -175,6 +180,8 @@ function cmake_base() {
        else
            pip install -r ${PADDLE_ROOT}/python/requirements.txt
        fi
+       # delete `gym` to avoid modifying requirements.txt in *.whl
+       sed -i "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
    fi
    if [ "$SYSTEM" == "Darwin" ]; then
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gym
import math
import itertools
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph.nn as nn
from paddle.fluid.dygraph import to_variable, Layer
from paddle.fluid.dygraph import declarative, ProgramTranslator
import unittest
SEED = 2020
program_translator = ProgramTranslator()


class Policy(Layer):
    def __init__(self):
        super(Policy, self).__init__()

        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)
        self.dropout_ratio = 0.6

        self.saved_log_probs = []
        self.rewards = []
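
    # `@declarative` marks `forward` for dygraph-to-static (dy2stat) translation:
    # when ProgramTranslator is enabled it runs as a static graph program;
    # otherwise it executes imperatively in dygraph mode.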
    @declarative
    def forward(self, x):
        x = fluid.layers.reshape(x, shape=[1, 4])
        x = self.affine1(x)
        x = fluid.layers.dropout(x, self.dropout_ratio)
        x = fluid.layers.relu(x)
        action_scores = self.affine2(x)

        log_prob = fluid.layers.softmax(action_scores, axis=1)

        return log_prob


class Args(object):
    gamma = 0.99
    log_interval = 1
    train_step = 10


def train(args, place, to_static):
    program_translator.enable(to_static)
    env = gym.make('CartPole-v0')
    env.seed(SEED)

    with fluid.dygraph.guard(place):
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED
        local_random = np.random.RandomState(SEED)

        policy = Policy()

        eps = np.finfo(np.float32).eps.item()
        optimizer = fluid.optimizer.AdamaxOptimizer(
            learning_rate=1e-2, parameter_list=policy.parameters())

        def get_mean_and_std(values=[]):
            n = 0.
            s = 0.
            for val in values:
                s += val
                n += 1
            mean = s / n

            std = 0.
            for val in values:
                std += (val - mean) * (val - mean)
            std /= n
            std = math.sqrt(std)

            return mean, std
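
        # Sample an action index from the categorical distribution `probs` using
        # the seeded NumPy RandomState, so dygraph and static-graph runs draw the
        # same actions; also build a one-hot mask marking the chosen action.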
        def sample_action(probs):
            sample = local_random.random_sample()
            idx = 0

            while idx < len(probs) and sample > probs[idx]:
                sample -= probs[idx]
                idx += 1
            mask = [0.] * len(probs)
            mask[idx] = 1.

            return idx, np.array([mask]).astype("float32")

        def choose_best_action(probs):
            idx = 0 if probs[0] > probs[1] else 1
            mask = [1., 0.] if idx == 0 else [0., 1.]

            return idx, np.array([mask]).astype("float32")
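
        # Run the policy on the current state, sample an action, and keep
        # log(pi(a|s)) of the chosen action by masking the log of the softmax
        # output with the one-hot action mask.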
        def select_action(state):
            state = to_variable(state)
            state.stop_gradient = True
            loss_probs = policy(state)
            # print(loss_probs.name)
            probs = loss_probs.numpy()

            action, _mask = sample_action(probs[0])
            mask = to_variable(_mask)
            mask.stop_gradient = True

            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

            policy.saved_log_probs.append(loss_probs)
            return action, loss_probs
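
        # REINFORCE-style update: accumulate discounted returns backwards through
        # the episode, normalize them, weight each saved log-prob by its negated
        # return, and take one optimizer step on the summed policy loss.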
        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
            for r in policy.rewards[::-1]:
                R = r + args.gamma * R
                returns.insert(0, R)

            mean, std = get_mean_and_std(returns)
            returns = np.array(returns).astype("float32")
            returns = (returns - mean) / (std + eps)

            # calculate policy loss of each step.
            for log_prob, R in zip(policy.saved_log_probs, returns):
                log_prob_numpy = log_prob.numpy()

                R_numpy = np.ones_like(log_prob_numpy).astype("float32")
                _R = -1 * R * R_numpy
                _R = to_variable(_R)
                _R.stop_gradient = True
                cur_loss = fluid.layers.elementwise_mul(_R, log_prob)
                policy_loss.append(cur_loss)

            policy_loss = fluid.layers.concat(policy_loss)
            policy_loss = fluid.layers.reduce_sum(policy_loss)

            policy_loss.backward()
            optimizer.minimize(policy_loss)
            policy.clear_gradients()

            del policy.rewards[:]
            del policy.saved_log_probs[:]

            return returns
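
        # Training loop: run a fixed number of short episodes (a single env step
        # each, see the TODO below) and record the per-step policy loss so the
        # dygraph and static-graph outputs can be compared in the unittest.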
        loss_data = []
        running_reward = 10
        for i_episode in itertools.count(1):
            state, ep_reward = env.reset(), 0
            # TODO(Aurelius84): In RL, we continuously select actions over multiple steps,
            # then accumulate the loss to apply optimization. But currently all vars are
            # shared within the same inner scope, which causes a problem in backward.
            # I will fix it in the next PR.
            for t in range(1, 2):  # default 1000
                state = np.array(state).astype("float32")
                action, loss = select_action(state)
                state, reward, done, _ = env.step(action)

                # log loss_probs
                loss_data.append(loss.numpy()[0])

                policy.rewards.append(reward)
                ep_reward += reward

                if done:
                    break

            # sum loss and apply optimization
            returns = finish_episode()

            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            if i_episode % args.log_interval == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.
                    format(i_episode, ep_reward, running_reward,
                           loss.numpy()[0]))

            if i_episode > args.train_step:
                break

        return np.array(loss_data)


class TestDeclarative(unittest.TestCase):
    def setUp(self):
        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        self.args = Args()

    def test_train(self):
        st_out = train(self.args, self.place, to_static=True)
        dy_out = train(self.args, self.place, to_static=False)
        self.assertTrue(
            np.allclose(st_out, dy_out),
            msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out))


if __name__ == '__main__':
    unittest.main()
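
For reference, here is a minimal standalone sketch (not part of the test above; the reward values are made up for illustration) of the discounted-return computation and normalization that finish_episode() performs:

import numpy as np

gamma = 0.99                               # same discount factor as Args.gamma
rewards = [1.0, 1.0, 1.0]                  # hypothetical per-step rewards
R, returns = 0.0, []
for r in reversed(rewards):
    R = r + gamma * R                      # discounted return, accumulated back-to-front
    returns.insert(0, R)

returns = np.array(returns, dtype="float32")
eps = np.finfo(np.float32).eps.item()
# normalize so the policy-gradient weights have a comparable scale across episodes
returns = (returns - returns.mean()) / (returns.std() + eps)
print(returns)                             # roughly [ 1.22  0.00 -1.23]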