# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy

import paddle.fluid as fluid
from paddle.fluid.initializer import ConstantInitializer

import parl.framework.policy_distribution as pd
import parl.layers as layers
from parl.framework.algorithm import Algorithm
from parl.layers import common_functions as comf


class SimpleAC(Algorithm):
    """
    A simple Actor-Critic algorithm with a feedforward policy network and
    a single discrete action.

    learn() requires keywords: "action", "reward", "v_value"
    """

    def __init__(self,
                 model,
                 hyperparas=dict(lr=1e-4),
                 gpu_id=-1,
                 discount_factor=0.99):

        super(SimpleAC, self).__init__(model, hyperparas, gpu_id)
        self.discount_factor = discount_factor

    def learn(self, inputs, next_inputs, states, next_states,
              next_episode_end, actions, rewards):

        action = actions["action"]
        reward = rewards["reward"]

        values = self.model.value(inputs, states)
        next_values = self.model.value(next_inputs, next_states)
        value = values["v_value"]
        ## mask the bootstrapped value with the episode-end flag
        next_value = next_values["v_value"] * next_episode_end[
            "next_episode_end"]
        next_value.stop_gradient = True
        assert value.shape[1] == next_value.shape[1]

        ## one-step TD target and error for the critic
        critic_value = reward + self.discount_factor * next_value
        td_error = critic_value - value
        value_cost = layers.square(td_error)

        ## policy-gradient cost: negative log-likelihood of the taken action,
        ## weighted by the TD error in the combined objective below
        dist, _ = self.model.policy(inputs, states)
        dist = dist["action"]
        assert isinstance(dist, pd.CategoricalDistribution)
        pg_cost = 0 - dist.loglikelihood(action)
        avg_cost = layers.mean(x=value_cost + pg_cost * td_error)

        optimizer = fluid.optimizer.DecayedAdagradOptimizer(
            learning_rate=self.hp["lr"])
        optimizer.minimize(avg_cost)
        return dict(cost=avg_cost)

    def predict(self, inputs, states):
        return self._rl_predict(self.model, inputs, states)
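
# Usage sketch (hedged): SimpleAC is normally driven by the surrounding PARL
# framework rather than called directly. The names "MyACModel", "obs_dim" and
# "num_actions" below are hypothetical placeholders, not part of this module;
# only the keyword conventions ("action", "reward", "v_value") come from the
# class itself.
#
#   model = MyACModel(obs_dim, num_actions)   # a Model exposing policy()/value()
#   alg = SimpleAC(model=model, hyperparas=dict(lr=1e-4), gpu_id=-1)
#   ## the framework then calls, per batch:
#   ##   alg.predict(inputs, states)                   -> sampled "action"
#   ##   alg.learn(inputs, next_inputs, states, next_states,
#   ##             next_episode_end, actions, rewards) -> dict(cost=...)
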

class SimpleQ(Algorithm):
    """
    A simple Q-learning algorithm with a feedforward Q-network and a single
    discrete action.

    learn() requires keywords: "action", "reward", "q_value"
    """

    def __init__(self,
                 model,
                 hyperparas=dict(lr=1e-4),
                 gpu_id=-1,
                 discount_factor=0.99,
                 exploration_end_batches=0,
                 exploration_end_rate=0.1,
                 update_ref_interval=100):

        super(SimpleQ, self).__init__(model, hyperparas, gpu_id)
        self.discount_factor = discount_factor
        self.gpu_id = gpu_id
        assert update_ref_interval > 0
        self.update_ref_interval = update_ref_interval
        self.total_batches = 0
        ## create a reference model
        self.ref_model = deepcopy(model)
        ## setup exploration
        self.explore = (exploration_end_batches > 0)
        if self.explore:
            self.exploration_counter = layers.create_persistable_variable(
                dtype="float32",
                shape=[1],
                is_bias=True,
                default_initializer=ConstantInitializer(0.))
            ### after exploration_end_batches, the rate stays at exploration_end_rate
            self.total_exploration_batches = exploration_end_batches
            self.exploration_rate_delta \
                = (1 - exploration_end_rate) / self.total_exploration_batches

    def before_every_batch(self):
        if self.total_batches % self.update_ref_interval == 0:
            self.model.sync_paras_to(self.ref_model, self.gpu_id)
        self.total_batches += 1

    def predict(self, inputs, states):
        """
        Override the base predict() function to mix a uniform exploration
        rate into the action distributions.
        """
        rate = 0
        if self.explore:
            counter = self.exploration_counter()
            ## first compute the current exploration rate
            rate = 1 - counter * self.exploration_rate_delta

        distributions, states = self.model.policy(inputs, states)
        for dist in distributions.values():
            assert dist.__class__.__name__ == "CategoricalDistribution"
            dist.add_uniform_exploration(rate)

        actions = {}
        for key, dist in distributions.items():
            actions[key] = dist()
        return actions, states

    def learn(self, inputs, next_inputs, states, next_states,
              next_episode_end, actions, rewards):

        action = actions["action"]
        reward = rewards["reward"]

        values = self.model.value(inputs, states)
        ## bootstrap from the reference (target) model
        next_values = self.ref_model.value(next_inputs, next_states)
        q_value = values["q_value"]
        ## mask the bootstrapped values with the episode-end flag
        next_q_value = next_values["q_value"] * next_episode_end[
            "next_episode_end"]
        next_q_value.stop_gradient = True
        ## max over next actions
        next_value = layers.reduce_max(next_q_value, dim=-1)
        assert q_value.shape[1] == next_q_value.shape[1]
        num_actions = q_value.shape[1]
        ## Q-value of the taken action
        value = comf.idx_select(input=q_value, idx=action)

        critic_value = reward + self.discount_factor * next_value
        td_error = critic_value - value
        avg_cost = layers.mean(x=layers.square(td_error))

        optimizer = fluid.optimizer.DecayedAdagradOptimizer(
            learning_rate=self.hp["lr"])
        optimizer.minimize(avg_cost)

        self._increment_exploration_counter()
        return dict(cost=avg_cost)

    def _increment_exploration_counter(self):
        if self.explore:
            counter = self.exploration_counter()
            exploration_counter_ = counter + 1
            switch = layers.cast(
                x=(exploration_counter_ > self.total_exploration_batches),
                dtype="float32")
            ## if the counter already hits the limit, we do not change the counter
            layers.assign(
                switch * counter + (1 - switch) * exploration_counter_,
                counter)
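
# Exploration-schedule sketch (hedged, derived from the code above): with the
# assumed settings exploration_end_batches=1000 and exploration_end_rate=0.1,
# the per-batch delta is (1 - 0.1) / 1000 = 9e-4, so after k batches predict()
# uses a uniform-exploration rate of 1 - k * 9e-4. Once the persistable counter
# reaches 1000, _increment_exploration_counter() stops incrementing it, which
# pins the rate at 0.1 for the rest of training.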