# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
warnings.simplefilter('default')

import paddle.fluid as fluid
from parl.core.fluid import layers
from parl.core.fluid.algorithm import Algorithm
from parl.core.fluid.policy_distribution import CategoricalDistribution

__all__ = ['A3C']


class A3C(Algorithm):
    def __init__(self, model, hyperparas=None, vf_loss_coeff=None):
        """ A3C/A2C algorithm.

        Args:
            model (parl.Model): forward network of the policy and value function.
            hyperparas (dict): (deprecated) dict of hyper parameters.
            vf_loss_coeff (float): coefficient of the value function loss.
        """
        self.model = model
        if hyperparas is not None:
            warnings.warn(
                "the `hyperparas` argument of `__init__` function in `parl.Algorithms.A3C` is deprecated since version 1.2 and will be removed in version 1.3.",
                DeprecationWarning,
                stacklevel=2)
            self.vf_loss_coeff = hyperparas['vf_loss_coeff']
        else:
            assert isinstance(vf_loss_coeff, (int, float))
            self.vf_loss_coeff = vf_loss_coeff

    def learn(self, obs, actions, advantages, target_values, learning_rate,
              entropy_coeff):
        """ Build the learning program: policy gradient loss, value function
        loss and entropy bonus, optimized jointly with Adam.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.
            actions: An int64 tensor of shape [B].
            advantages: A float32 tensor of shape [B].
            target_values: A float32 tensor of shape [B].
            learning_rate: float scalar of the learning rate.
            entropy_coeff: float scalar of the entropy coefficient.

        Returns:
            total_loss, pi_loss, vf_loss, entropy
        """
        logits = self.model.policy(obs)
        policy_distribution = CategoricalDistribution(logits)
        actions_log_probs = policy_distribution.logp(actions)

        # The policy gradient loss
        pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)

        # The value function loss
        values = self.model.value(obs)
        delta = values - target_values
        vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        policy_entropy = policy_distribution.entropy()
        entropy = layers.reduce_sum(policy_entropy)

        total_loss = (
            pi_loss + vf_loss * self.vf_loss_coeff + entropy * entropy_coeff)

        # Clip gradients by global norm to stabilize training.
        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(total_loss)
        return total_loss, pi_loss, vf_loss, entropy

    def sample(self, obs):
        """ Sample actions from the current policy, together with value estimates.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.

        Returns:
            sample_actions, values
        """
        logits, values = self.model.policy_and_value(obs)
        policy_dist = CategoricalDistribution(logits)
        sample_actions = policy_dist.sample()
        return sample_actions, values

    def predict(self, obs):
        """ Predict greedy (argmax) actions from the current policy.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.

        Returns:
            predict_actions
        """
        logits = self.model.policy(obs)
        probs = layers.softmax(logits)
        predict_actions = layers.argmax(probs, 1)
        return predict_actions

    def value(self, obs):
        """ Estimate state values with the value network.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space),
                 e.g. [B, C, H, W] for Atari.

        Returns:
            values
        """
        values = self.model.value(obs)
        return values
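
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, kept as comments so this module stays
# import-safe). It assumes a hypothetical user-defined `AtariModel(parl.Model)`
# that implements `policy(obs)`, `value(obs)` and `policy_and_value(obs)`;
# the hyperparameter values below are examples, not recommendations from
# this file.
#
#     model = AtariModel(act_dim=6)              # hypothetical model class
#     algorithm = A3C(model, vf_loss_coeff=0.5)
#
#     # Inside an Agent's learn program, with `obs`, `actions`, `advantages`
#     # and `target_values` declared as fluid data layers:
#     total_loss, pi_loss, vf_loss, entropy = algorithm.learn(
#         obs, actions, advantages, target_values,
#         learning_rate=0.001, entropy_coeff=-0.01)
#
#     # Inside a sample program, for collecting rollouts:
#     sample_actions, values = algorithm.sample(obs)
# ---------------------------------------------------------------------------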