maddpg.py

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
warnings.simplefilter('default')

from parl.core.fluid import layers
from copy import deepcopy
from paddle import fluid
from parl.core.fluid.algorithm import Algorithm

__all__ = ['MADDPG']

from parl.core.fluid.policy_distribution import SoftCategoricalDistribution
from parl.core.fluid.policy_distribution import SoftMultiCategoricalDistribution


def SoftPDistribution(logits, act_space):
    """Args:
            logits: the output of policy model
            act_space: action space, must be gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete

        Return:
            instance of SoftCategoricalDistribution or SoftMultiCategoricalDistribution
    """
    # is instance of gym.spaces.Discrete
    if (hasattr(act_space, 'n')):
        return SoftCategoricalDistribution(logits)
    # is instance of multiagent.multi_discrete.MultiDiscrete
    elif (hasattr(act_space, 'num_discrete_space')):
        return SoftMultiCategoricalDistribution(logits, act_space.low,
                                                act_space.high)
    else:
        raise AssertionError("act_space must be instance of \
            gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete")


class MADDPG(Algorithm):
    def __init__(self,
                 model,
                 agent_index=None,
                 act_space=None,
                 gamma=None,
                 tau=None,
                 lr=None,
                 actor_lr=None,
                 critic_lr=None):
        """  MADDPG algorithm
        
        Args:
            model (parl.Model): forward network of actor and critic.
                                The function get_actor_params() of model should be implemented.
            agent_index: index of agent, in multiagent env
            act_space: action_space, gym space
            gamma (float): discounted factor for reward computation.
            tau (float): decay coefficient when updating the weights of self.target_model with self.model
            lr (float): learning rate, lr will be assigned to both critic_lr and actor_lr
            critic_lr (float): learning rate of the critic model
            actor_lr (float): learning rate of the actor model
        """

        assert isinstance(agent_index, int)
        assert isinstance(act_space, list)
        assert isinstance(gamma, float)
        assert isinstance(tau, float)
        # compatible upgrade of lr
        if lr is None:
            assert isinstance(actor_lr, float)
            assert isinstance(critic_lr, float)
        else:
            assert isinstance(lr, float)
            assert actor_lr is None, 'no need to set `actor_lr` if `lr` is not None'
            assert critic_lr is None, 'no need to set `critic_lr` if `lr` is not None'
            critic_lr = lr
            actor_lr = lr
            warnings.warn(
                "the `lr` argument of `__init__` function in `parl.Algorithms.MADDPG` is deprecated \
                    since version 1.4 and will be removed in version 2.0. \
                    Recommend to use `actor_lr` and `critic_lr`. ",
                DeprecationWarning,
                stacklevel=2)
        self.agent_index = agent_index
        self.act_space = act_space
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.model = model
        self.target_model = deepcopy(model)

    def predict(self, obs):
        """ input:  
                obs: observation, shape([B] + shape of obs_n[agent_index])
            output: 
                act: action, shape([B] + shape of act_n[agent_index])
        """
        this_policy = self.model.policy(obs)
        this_action = SoftPDistribution(
            logits=this_policy,
            act_space=self.act_space[self.agent_index]).sample()
        return this_action

    def predict_next(self, obs):
        """ input:  observation, shape([B] + shape of obs_n[agent_index])
            output: action, shape([B] + shape of act_n[agent_index])
        """
        next_policy = self.target_model.policy(obs)
        next_action = SoftPDistribution(
            logits=next_policy,
            act_space=self.act_space[self.agent_index]).sample()
        return next_action

    def Q(self, obs_n, act_n):
        """ input:  
                obs_n: all agents' observation, shape([B] + shape of obs_n)
            output: 
                act_n: all agents' action, shape([B] + shape of act_n)
        """
        return self.model.value(obs_n, act_n)

    def Q_next(self, obs_n, act_n):
        """ input:  
                obs_n: all agents' observation, shape([B] + shape of obs_n)
            output: 
                act_n: all agents' action, shape([B] + shape of act_n)
        """
        return self.target_model.value(obs_n, act_n)

    def learn(self, obs_n, act_n, target_q):
        """ update actor and critic model with MADDPG algorithm
        """
        actor_cost = self._actor_learn(obs_n, act_n)
        critic_cost = self._critic_learn(obs_n, act_n, target_q)
        return critic_cost

    def _actor_learn(self, obs_n, act_n):
        i = self.agent_index
        this_policy = self.model.policy(obs_n[i])
        sample_this_action = SoftPDistribution(
            logits=this_policy,
            act_space=self.act_space[self.agent_index]).sample()

        action_input_n = act_n + []
        action_input_n[i] = sample_this_action
        eval_q = self.Q(obs_n, action_input_n)
        act_cost = layers.reduce_mean(-1.0 * eval_q)

        act_reg = layers.reduce_mean(layers.square(this_policy))

        cost = act_cost + act_reg * 1e-3

        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
            param_list=self.model.get_actor_params())

        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
        return cost

    def _critic_learn(self, obs_n, act_n, target_q):
        pred_q = self.Q(obs_n, act_n)
        cost = layers.reduce_mean(layers.square_error_cost(pred_q, target_q))

        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
            param_list=self.model.get_critic_params())

        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost, parameter_list=self.model.get_critic_params())
        return cost

    def sync_target(self, decay=None):
        if decay is None:
            decay = 1.0 - self.tau
        self.model.sync_weights_to(self.target_model, decay=decay)