# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.optim as optim
from torch.distributions import Categorical

import parl
from parl.utils.scheduler import PiecewiseScheduler, LinearDecayScheduler

__all__ = ['A2C']


class A2C(parl.Algorithm):
    def __init__(self, model, config):
        """A2C (Advantage Actor-Critic) algorithm.

        Args:
            model (parl.Model): forward network providing the `policy`,
                `value` and `policy_and_value` methods used below.
            config (dict): hyperparameters, including `vf_loss_coeff`,
                `learning_rate`, `start_lr`, `max_sample_steps` and
                `entropy_coeff_scheduler`.
        """
        assert isinstance(config['vf_loss_coeff'], (int, float))
        self.model = model
        self.vf_loss_coeff = config['vf_loss_coeff']
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=config['learning_rate'])
        self.config = config

        self.lr_scheduler = LinearDecayScheduler(config['start_lr'],
                                                 config['max_sample_steps'])
        self.entropy_coeff_scheduler = PiecewiseScheduler(
            config['entropy_coeff_scheduler'])

    def learn(self, obs, actions, advantages, target_values):
        """Update the model with one batch of sampled data."""
        prob = self.model.policy(obs, softmax_dim=1)
        policy_distri = Categorical(prob)
        actions_log_probs = policy_distri.log_prob(actions)

        # The policy gradient loss
        pi_loss = -((actions_log_probs * advantages).sum())

        # The value function loss
        values = self.model.value(obs).reshape(-1)
        delta = values - target_values
        vf_loss = 0.5 * torch.mul(delta, delta).sum()

        # The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
        policy_entropy = policy_distri.entropy()
        entropy = policy_entropy.sum()

        # Anneal the learning rate and entropy coefficient with the number
        # of consumed samples.
        lr = self.lr_scheduler.step(step_num=obs.shape[0])
        entropy_coeff = self.entropy_coeff_scheduler.step()

        total_loss = (
            pi_loss + vf_loss * self.vf_loss_coeff + entropy * entropy_coeff)

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

        total_loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff

    def sample(self, obs):
        """Sample actions from the policy distribution, with value estimates."""
        prob, values = self.model.policy_and_value(obs)
        sample_actions = Categorical(prob).sample()
        return sample_actions, values

    def predict(self, obs):
        """Return the greedy (argmax) actions of the policy."""
        prob = self.model.policy(obs)
        _, predict_actions = prob.max(-1)
        return predict_actions

    def value(self, obs):
        """Return value estimates for the given observations."""
        values = self.model.value(obs)
        return values
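

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal example of wiring a
# model and config dict into this A2C class. The `CartPoleModel` class and all
# hyperparameter values below are illustrative assumptions, not the official
# PARL example; the model only needs to expose `policy`, `value` and
# `policy_and_value` with the signatures used by A2C above. The
# `entropy_coeff_scheduler` entry is assumed to be a list of (step, value)
# pairs, which is the format PiecewiseScheduler consumes.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import torch.nn as nn
    import torch.nn.functional as F

    class CartPoleModel(parl.Model):
        """Hypothetical actor-critic network with one shared hidden layer."""

        def __init__(self, obs_dim=4, act_dim=2):
            super(CartPoleModel, self).__init__()
            self.fc = nn.Linear(obs_dim, 64)
            self.policy_fc = nn.Linear(64, act_dim)
            self.value_fc = nn.Linear(64, 1)

        def policy(self, obs, softmax_dim=-1):
            h = F.relu(self.fc(obs))
            return F.softmax(self.policy_fc(h), dim=softmax_dim)

        def value(self, obs):
            h = F.relu(self.fc(obs))
            return self.value_fc(h)

        def policy_and_value(self, obs):
            h = F.relu(self.fc(obs))
            prob = F.softmax(self.policy_fc(h), dim=-1)
            values = self.value_fc(h).reshape(-1)
            return prob, values

    # Illustrative hyperparameters only.
    config = {
        'vf_loss_coeff': 0.5,
        'learning_rate': 1e-3,
        'start_lr': 1e-3,
        'max_sample_steps': int(1e6),
        'entropy_coeff_scheduler': [(0, -0.01)],
    }

    alg = A2C(CartPoleModel(), config)

    # One dummy learning step on random data, just to show the interface.
    batch_size, obs_dim = 32, 4
    obs = torch.randn(batch_size, obs_dim)
    actions = torch.randint(0, 2, (batch_size, ))
    advantages = torch.randn(batch_size)
    target_values = torch.randn(batch_size)

    total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff = alg.learn(
        obs, actions, advantages, target_values)
    print('total_loss:', float(total_loss), 'lr:', lr)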