Commit 8001db66 authored by Hongsheng Zeng, committed by TomorrowIsAnOtherDay

fix the dynamic_lstm wrapper so it supports the h_0 and c_0 parameters (#17)

* fix the dynamic_lstm wrapper so it supports h_0 and c_0 initialization; fix a bug in the dynamic_gru wrapper

* use fluid's sampling_id to sample ids

* remove the simple-games unittest to avoid timeouts

* change pip source
Parent 21a9efed
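For orientation, a minimal sketch of the call pattern this commit enables: the parl.layers wrappers now accept initial states at call time, mirroring fluid.layers.dynamic_lstm and dynamic_gru. The tensor names (`x`, `h0`, `c0`, `gx`) are illustrative, not from the repo, and the factory is assumed to accept only `size` with the remaining arguments left at their defaults.

```python
import paddle.fluid as fluid
import parl.layers as layers

hidden_dim = 32

# fluid's dynamic_lstm expects the projected input and `size` to be 4 * hidden_dim,
# while h_0 / c_0 carry one hidden_dim-wide state row per sequence in the batch.
x = fluid.layers.data(name='x', shape=[4 * hidden_dim], dtype='float32', lod_level=1)
h0 = fluid.layers.data(name='h0', shape=[hidden_dim], dtype='float32')
c0 = fluid.layers.data(name='c0', shape=[hidden_dim], dtype='float32')

lstm_fn = layers.dynamic_lstm(size=4 * hidden_dim)  # wrapper built once so parameters can be shared
hidden, cell = lstm_fn(x, h_0=h0, c_0=c0)            # initial states now passed per call

gx = fluid.layers.data(name='gx', shape=[3 * hidden_dim], dtype='float32', lod_level=1)
gru_fn = layers.dynamic_gru(size=hidden_dim)
gru_hidden = gru_fn(gx, h_0=h0)                       # h_0 moved from the factory to __call__
```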
......@@ -18,5 +18,5 @@
FROM paddlepaddle/paddle:latest-gpu
RUN apt-get install -y cmake
RUN pip install gym
RUN pip install details
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gym
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple details
......@@ -69,7 +69,7 @@ class CategoricalDistribution(PolicyDistribution):
assert isinstance(dist, Variable)
def __call__(self):
return comf.categorical_random(self.dist)
return layers.sampling_id(self.dist)
@property
def dim(self):
......
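A hedged sketch of the replacement sampler: fluid.layers.sampling_id draws one class index per row from a batch of categorical distributions. The variable names (`probs`, `act`) and the two example rows are illustrative only.

```python
import numpy as np
import paddle.fluid as fluid

probs = fluid.layers.data(name='probs', shape=[3], dtype='float32')  # [batch, num_actions]
act = fluid.layers.sampling_id(probs)                                # one sampled id per row

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
ids, = exe.run(feed={'probs': np.array([[0.1, 0.2, 0.7],
                                        [0.8, 0.1, 0.1]], dtype='float32')},
               fetch_list=[act])
print(ids)  # e.g. [2 0]; a rank-1 result, which the test edits in the next hunks account for
```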
......@@ -74,7 +74,7 @@ class TestComputationTask(unittest.TestCase):
assert not states, "states should be empty"
## actions["action"] is a batch of actions
for a in actions["action"]:
action_counter[a[0]] += 1
action_counter[a] += 1
if max:
### if max, the first action will always be chosen
......@@ -219,6 +219,7 @@ class TestComputationTask(unittest.TestCase):
if on_policy:
outputs, _ = ct.predict(inputs=dict(sensor=sensor))
actions = outputs["action"]
actions = np.expand_dims(actions, 1)
else:
## randomly assemble a batch
actions = np.random.choice(
......@@ -238,7 +239,7 @@ class TestComputationTask(unittest.TestCase):
### the policy should bias towards the first action
outputs, _ = ct.predict(inputs=dict(sensor=sensor))
for a in outputs["action"]:
self.assertEqual(a[0], 0)
self.assertEqual(a, 0)
if __name__ == "__main__":
......
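The two test hunks above are consistent with the sampled actions coming back as a rank-1 array of ids, so the training path restores the trailing axis before reassembling the batch. A tiny NumPy illustration with made-up values:

```python
import numpy as np

actions = np.array([2, 0, 1])           # rank-1 predictions, matching the updated assertions
actions = np.expand_dims(actions, 1)    # shape (3, 1), restoring the (batch, 1) layout used downstream
print(actions.shape)                    # (3, 1)
```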
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.computation_task import ComputationTask
from parl.algorithm_zoo.simple_algorithms import SimpleAC, SimpleQ
from parl.model_zoo.simple_models import SimpleModelAC, SimpleModelQ
import numpy as np
import unittest
import math
import gym
def unpack_exps(exps):
return [np.array(l).astype('int' if i==2 else 'float32') \
for i, l in enumerate(zip(*exps))]
def sample(past_exps, n):
indices = np.random.choice(len(past_exps), n)
return [past_exps[i] for i in indices]
class TestGymGame(unittest.TestCase):
def test_gym_games(self):
"""
Test games in OpenAI gym.
"""
games = ["MountainCar-v0", "CartPole-v0"]
final_rewards_thresholds = [
-1.8, ## drive to the right top in 180 steps (timeout is -2.0)
1.5 ## hold the pole for at least 150 steps
]
mlp_layer_confs = [
dict(
size=128, act="relu"),
dict(
size=128, act="relu"),
dict(
size=128, act="relu"),
]
for game, threshold in zip(games, final_rewards_thresholds):
for on_policy in [False, True]:
if on_policy and game != "CartPole-v0":
## SimpleAC has difficulty training mountain-car and acrobot
continue
env = gym.make(game)
state_shape = env.observation_space.shape[0]
num_actions = env.action_space.n
if on_policy:
alg = SimpleAC(
model=SimpleModelAC(
dims=state_shape,
num_actions=num_actions,
mlp_layer_confs=mlp_layer_confs +
[dict(
size=num_actions, act="softmax")]),
hyperparas=dict(lr=1e-3))
else:
alg = SimpleQ(
model=SimpleModelQ(
dims=state_shape,
num_actions=num_actions,
mlp_layer_confs=mlp_layer_confs +
[dict(size=num_actions)]),
hyperparas=dict(lr=1e-4),
exploration_end_batches=25000,
update_ref_interval=100)
print "algorithm: " + alg.__class__.__name__
ct = ComputationTask(algorithm=alg)
batch_size = 16
if not on_policy:
train_every_steps = batch_size / 4
buffer_size_limit = 100000
max_episode = 5000
average_episode_reward = []
past_exps = []
max_steps = env._max_episode_steps
for n in range(max_episode):
ob = env.reset()
episode_reward = 0
for t in range(max_steps):
res, _ = ct.predict(inputs=dict(sensor=np.array(
[ob]).astype("float32")))
pred_action = res["action"][0][0]
next_ob, reward, next_is_over, _ = env.step(
pred_action)
reward /= 100
episode_reward += reward
past_exps.append((ob, next_ob, [pred_action],
[reward], [not next_is_over]))
## a circular buffer is used only for off-policy training
if (not on_policy
) and len(past_exps) > buffer_size_limit:
past_exps.pop(0)
## compute the learning condition
learn_cond = False
if on_policy:
learn_cond = (len(past_exps) >= batch_size)
exps = past_exps ## directly use all exps in the buffer
else:
learn_cond = (
t % train_every_steps == train_every_steps - 1)
exps = sample(past_exps,
batch_size) ## sample some exps
if learn_cond:
sensor, next_sensor, action, reward, next_episode_end \
= unpack_exps(exps)
cost = ct.learn(
inputs=dict(sensor=sensor),
next_inputs=dict(next_sensor=next_sensor),
next_episode_end=dict(
next_episode_end=next_episode_end),
actions=dict(action=action),
rewards=dict(reward=reward))
## we clear the exp buffer for on-policy
if on_policy:
past_exps = []
ob = next_ob
## stop one step early so that Gym's timeout is not wrongly treated as game_over=True
if t == max_steps - 2 or next_is_over:
break
if n % 50 == 0:
print("episode reward: %f" % episode_reward)
average_episode_reward.append(episode_reward)
if len(average_episode_reward) > 20:
average_episode_reward.pop(0)
### compare the average episode reward to reduce variance
self.assertGreater(
sum(average_episode_reward) / len(average_episode_reward),
threshold)
if __name__ == "__main__":
unittest.main()
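A quick usage illustration (made-up numbers) of the two helpers defined at the top of this test, operating on the (ob, next_ob, [action], [reward], [not next_is_over]) tuples collected in the episode loop:

```python
import numpy as np

exps = [
    ([0.1, 0.2], [0.2, 0.3], [1], [0.01], [True]),
    ([0.2, 0.3], [0.3, 0.4], [0], [0.00], [False]),
]
sensor, next_sensor, action, reward, next_episode_end = unpack_exps(exps)
print(sensor.shape, sensor.dtype)   # (2, 2) float32
print(action.shape, action.dtype)   # (2, 1) and an integer dtype (field index 2 is cast to int)
print(len(sample(exps, 2)))         # 2 experiences drawn at random (with replacement)
```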
......@@ -54,18 +54,6 @@ class CNN(Feedforward):
[layers.conv2d(**c) for c in multi_conv_layers])
def categorical_random(prob):
"""
Sample an id based on categorical distribution prob
"""
cumsum = layers.cumsum(x=prob)
r = layers.uniform_random_batch_size_like(
input=prob, min=0., max=1., shape=[-1])
index = layers.reduce_sum(layers.cast(cumsum < r, 'int'), dim=-1)
index = layers.reshape(index, index.shape + (1, ))
return index
def argmax_layer(input):
"""
Get the id of the max val of an input vector
......
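For reference, a NumPy sketch of the inverse-CDF trick that the removed categorical_random implemented (count how many cumulative-probability bins fall below a uniform draw); this is the sampling logic that layers.sampling_id now provides directly. The `_np` helper name is mine, not the repo's.

```python
import numpy as np

def categorical_random_np(prob, rng=np.random):
    """prob: [batch, num_classes] with rows summing to 1; returns sampled ids of shape [batch, 1]."""
    cumsum = np.cumsum(prob, axis=-1)             # per-row running CDF
    r = rng.uniform(size=(prob.shape[0], 1))      # one uniform draw per row
    index = np.sum(cumsum < r, axis=-1)           # how many CDF entries lie below the draw
    return index.reshape(-1, 1)                   # keep the old [batch, 1] shape

print(categorical_random_np(np.array([[0.1, 0.2, 0.7]])))  # usually [[2]]
```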
......@@ -15,7 +15,7 @@
Wrappers for fluid.layers so that the layers can share parameters conveniently.
"""
from paddle.fluid.executor import fetch_var
from paddle.fluid.executor import _fetch_var
import paddle.fluid as fluid
from paddle.fluid.layers import *
from paddle.fluid.param_attr import ParamAttr
......@@ -79,8 +79,8 @@ class LayerFunc(object):
or (not src_attr and not target_attr)
if not src_attr:
continue
src_var = fetch_var(src_attr.name)
target_var = fetch_var(target_attr.name, return_numpy=False)
src_var = _fetch_var(src_attr.name)
target_var = _fetch_var(target_attr.name, return_numpy=False)
target_var.set(src_var, place)
def __deepcopy__(self, memo):
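A hedged, self-contained sketch of the pattern this hunk relies on: the commit switches to the private `_fetch_var` helper, which newer paddle.fluid releases expose in place of the older public `fetch_var`, and uses it to copy one parameter's value onto another. The parameter names 'src_w' and 'target_w' below are hypothetical.

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.executor import _fetch_var
from paddle.fluid.param_attr import ParamAttr

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
fluid.layers.fc(input=x, size=8, param_attr=ParamAttr(name='src_w'), bias_attr=False)
fluid.layers.fc(input=x, size=8, param_attr=ParamAttr(name='target_w'), bias_attr=False)

place = fluid.CPUPlace()
fluid.Executor(place).run(fluid.default_startup_program())

src_value = _fetch_var('src_w')                             # numpy copy of the source parameter
target_tensor = _fetch_var('target_w', return_numpy=False)  # the target's LoDTensor itself
target_tensor.set(src_value, place)                         # overwrite the target in place
print(np.allclose(_fetch_var('target_w'), src_value))       # True
```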
......@@ -259,9 +259,11 @@ def dynamic_lstm(size,
def __init__(self):
super(DynamicLstm_, self).__init__(param_attr, bias_attr)
def __call__(self, input):
def __call__(self, input, h_0=None, c_0=None):
return layers.dynamic_lstm(
input=input,
h_0=h_0,
c_0=c_0,
size=size,
param_attr=self.param_attr,
bias_attr=self.bias_attr,
......@@ -323,7 +325,6 @@ def dynamic_gru(size,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
name=None):
"""
Return a function that creates a paddle.fluid.layers.dynamic_gru.
......@@ -337,7 +338,7 @@ def dynamic_gru(size,
def __init__(self):
super(DynamicGru_, self).__init__(param_attr, bias_attr)
def __call__(self, input):
def __call__(self, input, h_0=None):
return layers.dynamic_gru(
input=input,
size=size,
......