diff --git a/Dockerfile b/Dockerfile
index d5ad6292e075fe5479245908322774fc4a82c8dd..f89aa2069e8e3d4d3f991b3f33dae59620b8752a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,5 +18,5 @@ FROM paddlepaddle/paddle:latest-gpu
 
 RUN apt-get install -y cmake
 
-RUN pip install gym
-RUN pip install details
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gym
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple details
diff --git a/parl/framework/policy_distribution.py b/parl/framework/policy_distribution.py
index 35d682e2d96b89f7aad7ad4e11b5c4cea82d92bb..d0a68c740d71a3102579d13d57d6d24acc4bfd64 100644
--- a/parl/framework/policy_distribution.py
+++ b/parl/framework/policy_distribution.py
@@ -69,7 +69,7 @@ class CategoricalDistribution(PolicyDistribution):
         assert isinstance(dist, Variable)
 
     def __call__(self):
-        return comf.categorical_random(self.dist)
+        return layers.sampling_id(self.dist)
 
     @property
     def dim(self):
diff --git a/parl/framework/tests/test_computation_task.py b/parl/framework/tests/test_computation_task.py
index 470d2ba09cbe3980fe78a09d7302a64e9acebec6..53cf2e73a2dc0afe477e01981d959875c41ebb4e 100644
--- a/parl/framework/tests/test_computation_task.py
+++ b/parl/framework/tests/test_computation_task.py
@@ -74,7 +74,7 @@ class TestComputationTask(unittest.TestCase):
         assert not states, "states should be empty"
         ## actions["action"] is a batch of actions
         for a in actions["action"]:
-            action_counter[a[0]] += 1
+            action_counter[a] += 1
 
         if max:
             ### if max, the first action will always be chosen
@@ -219,6 +219,7 @@ class TestComputationTask(unittest.TestCase):
         if on_policy:
             outputs, _ = ct.predict(inputs=dict(sensor=sensor))
             actions = outputs["action"]
+            actions = np.expand_dims(actions, 1)
         else:
             ## randomly assemble a batch
             actions = np.random.choice(
@@ -238,7 +239,7 @@ class TestComputationTask(unittest.TestCase):
         ### the policy should bias towards the first action
         outputs, _ = ct.predict(inputs=dict(sensor=sensor))
         for a in outputs["action"]:
-            self.assertEqual(a[0], 0)
+            self.assertEqual(a, 0)
 
 
 if __name__ == "__main__":
diff --git a/parl/framework/tests/test_simple_games.py b/parl/framework/tests/test_simple_games.py
deleted file mode 100644
index c37c69d149177a127e21c2d55c2441efe94d693b..0000000000000000000000000000000000000000
--- a/parl/framework/tests/test_simple_games.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import parl.layers as layers
-from parl.framework.computation_task import ComputationTask
-from parl.algorithm_zoo.simple_algorithms import SimpleAC, SimpleQ
-from parl.model_zoo.simple_models import SimpleModelAC, SimpleModelQ
-import numpy as np
-import unittest
-import math
-import gym
-
-
-def unpack_exps(exps):
-    return [np.array(l).astype('int' if i==2 else 'float32') \
-            for i, l in enumerate(zip(*exps))]
-
-
-def sample(past_exps, n):
-    indices = np.random.choice(len(past_exps), n)
-    return [past_exps[i] for i in indices]
-
-
-class TestGymGame(unittest.TestCase):
-    def test_gym_games(self):
-        """
-        Test games in OpenAI gym.
-        """
-
-        games = ["MountainCar-v0", "CartPole-v0"]
-        final_rewards_thresholds = [
-            -1.8,  ## drive to the right top in 180 steps (timeout is -2.0)
-            1.5  ## hold the pole for at least 150 steps
-        ]
-
-        mlp_layer_confs = [
-            dict(
-                size=128, act="relu"),
-            dict(
-                size=128, act="relu"),
-            dict(
-                size=128, act="relu"),
-        ]
-
-        for game, threshold in zip(games, final_rewards_thresholds):
-            for on_policy in [False, True]:
-
-                if on_policy and game != "CartPole-v0":
-                    ## SimpleAC has difficulty training mountain-car and acrobot
-                    continue
-
-                env = gym.make(game)
-                state_shape = env.observation_space.shape[0]
-                num_actions = env.action_space.n
-
-                if on_policy:
-                    alg = SimpleAC(
-                        model=SimpleModelAC(
-                            dims=state_shape,
-                            num_actions=num_actions,
-                            mlp_layer_confs=mlp_layer_confs +
-                            [dict(
-                                size=num_actions, act="softmax")]),
-                        hyperparas=dict(lr=1e-3))
-                else:
-                    alg = SimpleQ(
-                        model=SimpleModelQ(
-                            dims=state_shape,
-                            num_actions=num_actions,
-                            mlp_layer_confs=mlp_layer_confs +
-                            [dict(size=num_actions)]),
-                        hyperparas=dict(lr=1e-4),
-                        exploration_end_batches=25000,
-                        update_ref_interval=100)
-
-                print "algorithm: " + alg.__class__.__name__
-
-                ct = ComputationTask(algorithm=alg)
-                batch_size = 16
-                if not on_policy:
-                    train_every_steps = batch_size / 4
-                    buffer_size_limit = 100000
-
-                max_episode = 5000
-
-                average_episode_reward = []
-                past_exps = []
-                max_steps = env._max_episode_steps
-                for n in range(max_episode):
-                    ob = env.reset()
-                    episode_reward = 0
-                    for t in range(max_steps):
-                        res, _ = ct.predict(inputs=dict(sensor=np.array(
-                            [ob]).astype("float32")))
-                        pred_action = res["action"][0][0]
-
-                        next_ob, reward, next_is_over, _ = env.step(
-                            pred_action)
-                        reward /= 100
-                        episode_reward += reward
-
-                        past_exps.append((ob, next_ob, [pred_action],
-                                          [reward], [not next_is_over]))
-                        ## only for off-policy training we use a circular buffer
-                        if (not on_policy
-                            ) and len(past_exps) > buffer_size_limit:
-                            past_exps.pop(0)
-
-                        ## compute the learning condition
-                        learn_cond = False
-                        if on_policy:
-                            learn_cond = (len(past_exps) >= batch_size)
-                            exps = past_exps  ## directly use all exps in the buffer
-                        else:
-                            learn_cond = (
-                                t % train_every_steps == train_every_steps - 1)
-                            exps = sample(past_exps,
-                                          batch_size)  ## sample some exps
-
-                        if learn_cond:
-                            sensor, next_sensor, action, reward, next_episode_end \
-                                = unpack_exps(exps)
-                            cost = ct.learn(
-                                inputs=dict(sensor=sensor),
-                                next_inputs=dict(next_sensor=next_sensor),
-                                next_episode_end=dict(
-                                    next_episode_end=next_episode_end),
-                                actions=dict(action=action),
-                                rewards=dict(reward=reward))
-                            ## we clear the exp buffer for on-policy
-                            if on_policy:
-                                past_exps = []
-
-                        ob = next_ob
-
-                        ## end before the Gym wrongly gives game_over=True for a timeout case
-                        if t == max_steps - 2 or next_is_over:
-                            break
-
-                    if n % 50 == 0:
-                        print("episode reward: %f" % episode_reward)
-
-                    average_episode_reward.append(episode_reward)
-                    if len(average_episode_reward) > 20:
-                        average_episode_reward.pop(0)
-
-                ### compuare the average episode reward to reduce variance
-                self.assertGreater(
-                    sum(average_episode_reward) / len(average_episode_reward),
-                    threshold)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/parl/layers/common_functions.py b/parl/layers/common_functions.py
index 3f2061353fd4704f507fa7de5336af2dbfae4d00..e4fa50aff60dcd2721a965bb24e0d0f20e62ef29 100644
--- a/parl/layers/common_functions.py
+++ b/parl/layers/common_functions.py
@@ -54,18 +54,6 @@ class CNN(Feedforward):
             [layers.conv2d(**c) for c in multi_conv_layers])
 
 
-def categorical_random(prob):
-    """
-    Sample an id based on categorical distribution prob
-    """
-    cumsum = layers.cumsum(x=prob)
-    r = layers.uniform_random_batch_size_like(
-        input=prob, min=0., max=1., shape=[-1])
-    index = layers.reduce_sum(layers.cast(cumsum < r, 'int'), dim=-1)
-    index = layers.reshape(index, index.shape + (1, ))
-    return index
-
-
 def argmax_layer(input):
     """
     Get the id of the max val of an input vector
diff --git a/parl/layers/layer_wrappers.py b/parl/layers/layer_wrappers.py
index 0a8959f9d543e9596ad6b84901f608954becc221..db293a65912078082dfc07d7c88f81ef465b3a97 100644
--- a/parl/layers/layer_wrappers.py
+++ b/parl/layers/layer_wrappers.py
@@ -15,7 +15,7 @@
 Wrappers for fluid.layers so that the layers can share parameters conveniently.
 """
 
-from paddle.fluid.executor import fetch_var
+from paddle.fluid.executor import _fetch_var
 import paddle.fluid as fluid
 from paddle.fluid.layers import *
 from paddle.fluid.param_attr import ParamAttr
@@ -79,8 +79,8 @@ class LayerFunc(object):
                 or (not src_attr and not target_attr)
             if not src_attr:
                 continue
-            src_var = fetch_var(src_attr.name)
-            target_var = fetch_var(target_attr.name, return_numpy=False)
+            src_var = _fetch_var(src_attr.name)
+            target_var = _fetch_var(target_attr.name, return_numpy=False)
             target_var.set(src_var, place)
 
     def __deepcopy__(self, memo):
@@ -259,9 +259,11 @@ def dynamic_lstm(size,
         def __init__(self):
             super(DynamicLstm_, self).__init__(param_attr, bias_attr)
 
-        def __call__(self, input):
+        def __call__(self, input, h_0=None, c_0=None):
             return layers.dynamic_lstm(
                 input=input,
+                h_0=h_0,
+                c_0=c_0,
                 size=size,
                 param_attr=self.param_attr,
                 bias_attr=self.bias_attr,
@@ -323,7 +325,6 @@ def dynamic_gru(size,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
-                h_0=None,
                 name=None):
     """
     Return a function that creates a paddle.fluid.layers.dynamic_gru.
@@ -337,7 +338,7 @@ def dynamic_gru(size,
         def __init__(self):
             super(DynamicGru_, self).__init__(param_attr, bias_attr)
 
-        def __call__(self, input):
+        def __call__(self, input, h_0=None):
             return layers.dynamic_gru(
                 input=input,
                 size=size,