#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy

from paddle import fluid
from paddle.fluid.layers import Normal
from parl.core.fluid import layers
from parl.core.fluid.algorithm import Algorithm

epsilon = 1e-6  # small constant that keeps log() arguments strictly positive

__all__ = ['SAC']


class SAC(Algorithm):
    def __init__(self,
                 actor,
                 critic,
                 max_action,
                 alpha=0.2,
                 gamma=None,
                 tau=None,
                 actor_lr=None,
                 critic_lr=None):
        """ SAC algorithm

        Args:
            actor (parl.Model): forward network of actor.
            critic (patl.Model): forward network of the critic.
            max_action (float): the largest value that an action can be, env.action_space.high[0]
            alpha (float): Temperature parameter determines the relative importance of the entropy against the reward
            gamma (float): discounted factor for reward computation.
            tau (float): decay coefficient when updating the weights of self.target_model with self.model
            actor_lr (float): learning rate of the actor model
            critic_lr (float): learning rate of the critic model
        """
        assert isinstance(gamma, float)
        assert isinstance(tau, float)
        assert isinstance(actor_lr, float)
        assert isinstance(critic_lr, float)
        assert isinstance(alpha, float)
        self.max_action = max_action
        self.gamma = gamma
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.alpha = alpha

        self.actor = actor
        self.critic = critic
        self.target_critic = deepcopy(critic)
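
    # A minimal construction sketch (hypothetical `ActorModel` / `CriticModel`
    # parl.Model subclasses, a Gym-style `env`, and illustrative hyperparameter
    # values; none of these come from this file):
    #
    #   actor = ActorModel(act_dim)
    #   critic = CriticModel()
    #   algorithm = SAC(actor, critic,
    #                   max_action=float(env.action_space.high[0]),
    #                   alpha=0.2, gamma=0.99, tau=0.005,
    #                   actor_lr=3e-4, critic_lr=3e-4)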

    def predict(self, obs):
        """ use actor model of self.policy to predict the action
        """
        mean, _ = self.actor.policy(obs)
        mean = layers.tanh(mean) * self.max_action
        return mean

    def sample(self, obs):
        """ Sample a tanh-squashed Gaussian action and return it together
            with its log probability (used for the entropy term).
        """
        mean, log_std = self.actor.policy(obs)
        std = layers.exp(log_std)
        normal = Normal(mean, std)
        # Sample the pre-squash action, squash it with tanh and scale it to
        # the action range.
        x_t = normal.sample([1])[0]
        y_t = layers.tanh(x_t)
        action = y_t * self.max_action
        # Correct the Gaussian log probability for the tanh squashing.
        log_prob = normal.log_prob(x_t)
        log_prob -= layers.log(self.max_action * (1 - layers.pow(y_t, 2)) +
                               epsilon)
        log_prob = layers.reduce_sum(log_prob, dim=1, keep_dim=True)
        log_prob = layers.squeeze(log_prob, axes=[1])
        return action, log_prob
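
    # Note on `sample`: with u ~ Normal(mean, std) and a = max_action * tanh(u),
    # the change-of-variables formula gives
    #     log pi(a|s) = log N(u | mean, std)
    #                   - sum_i log(max_action * (1 - tanh(u_i)^2)),
    # which is the correction applied to `log_prob` above; epsilon is added
    # inside the log purely for numerical stability.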

    def learn(self, obs, action, reward, next_obs, terminal):
        """ Update the critic and the actor with one batch of transitions. """
        actor_cost = self.actor_learn(obs)
        critic_cost = self.critic_learn(obs, action, reward, next_obs,
                                        terminal)
        return critic_cost, actor_cost

    def actor_learn(self, obs):
        """ Minimize the actor loss  E[alpha * log_pi(a|s) - min(Q1, Q2)]. """
        action, log_pi = self.sample(obs)
        qf1_pi, qf2_pi = self.critic.value(obs, action)
        min_qf_pi = layers.elementwise_min(qf1_pi, qf2_pi)
        cost = log_pi * self.alpha - min_qf_pi
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.actor.parameters())

        return cost

    def critic_learn(self, obs, action, reward, next_obs, terminal):
        """ Minimize the critic loss against the soft Bellman target
            reward + gamma * (min(Q1', Q2') - alpha * log_pi(a'|s')).
        """
        next_state_action, next_state_log_pi = self.sample(next_obs)
        qf1_next_target, qf2_next_target = self.target_critic.value(
            next_obs, next_state_action)
        min_qf_next_target = layers.elementwise_min(
            qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
        # The target is treated as a constant: no gradient flows back into the
        # target critic or into the sampled next action.
        target_Q.stop_gradient = True

        current_Q1, current_Q2 = self.critic.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost

    def sync_target(self, decay=None):
        """ Softly update the weights of self.target_critic with those of
            self.critic.
        """
        if decay is None:
            decay = 1.0 - self.tau
        self.critic.sync_weights_to(self.target_critic, decay=decay)
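
    # Note on `sync_target` (assuming the parl `sync_weights_to` convention in
    # which `decay` weights the target's own parameters):
    #     target_param <- decay * target_param + (1 - decay) * param
    # so the default decay = 1 - tau yields the usual Polyak averaging
    #     target_param <- (1 - tau) * target_param + tau * param.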