# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.layers.dist_algo as dist_algo
from paddle.fluid.initializer import NumpyArrayInitializer
from dist_classification_base import DistClassificationRunner
from test_dist_collective_base import runtime_main


# TODO(gavin1332) check whether it is necessary to transpose weight
class DistArcfaceClassificationRunner(DistClassificationRunner):
    """Runner that builds an ArcFace-style classification loss two ways.

    ``local_classify_subnet`` composes the loss from basic fluid layers on
    the full ``[class_num, feature_size]`` weight matrix, while
    ``parall_classify_subnet`` delegates to
    ``dist_algo._distributed_arcface_classify`` using only this rank's rows
    of the same matrix. Both take their initial weights from
    ``self.param_value``.

    NOTE(review): presumably the base class compares the two costs; confirm
    in ``dist_classification_base``.
    """

    @classmethod
    def add_other_arguments(cls, parser):
        """Register the ArcFace-specific command-line options on ``parser``.

        Args:
            parser: an argparse-style parser exposing ``add_argument``.
        """
        # Additive angular margin applied to the target-class angle.
        parser.add_argument('--arcface_margin', type=float, default=0.0)
        # Multiplier applied to the cosine logits before softmax.
        parser.add_argument('--arcface_scale', type=float, default=1.0)

    def __init__(self, args):
        """Create the runner and the shared initial weight matrix.

        Args:
            args: parsed arguments; ``class_num`` and ``feature_size`` are
                read here through ``self.args`` after the base initializer
                has run.
        """
        super(DistArcfaceClassificationRunner, self).__init__(args)
        # Fixed seed so the generated weights are reproducible across runs.
        np.random.seed(1024)
        # np.random.rand returns float64, but NumpyArrayInitializer selects
        # the assign_value attribute from the array's own dtype and only
        # handles float32/int32, so cast once here for both subnets.
        # NOTE(review): assumes the feature/parameter dtype is float32.
        self.param_value = np.random.rand(
            self.args.class_num, self.args.feature_size).astype('float32')

    def local_classify_subnet(self, feature, label):
        """Build the single-process ArcFace loss from basic layers.

        Args:
            feature: input feature variable; assumed to be
                ``[batch, feature_size]`` (TODO confirm against the caller).
            label: class-id variable fed to ``one_hot`` and
                ``softmax_with_cross_entropy``.

        Returns:
            The mean softmax cross-entropy over the margin-adjusted, scaled
            cosine logits.
        """
        args = self.args

        weight = layers.create_parameter(
            dtype=feature.dtype,
            shape=[args.class_num, args.feature_size],
            default_initializer=NumpyArrayInitializer(self.param_value),
            is_bias=False)

        # normalize feature: divide every row by its L2 norm
        feature_l2 = layers.sqrt(
            layers.reduce_sum(
                layers.square(feature), dim=1))
        norm_feature = layers.elementwise_div(feature, feature_l2, axis=0)

        # normalize weight row-wise, then transpose so it can be the
        # right-hand operand of the matrix product below
        weight_l2 = layers.sqrt(layers.reduce_sum(layers.square(weight), dim=1))
        norm_weight = layers.elementwise_div(weight, weight_l2, axis=0)
        norm_weight = layers.transpose(norm_weight, perm=[1, 0])

        # product of unit-length rows/columns: cosine similarities
        cos = layers.mul(norm_feature, norm_weight)

        # cos(theta + margin), computed for every entry
        theta = layers.acos(cos)
        margin_cos = layers.cos(theta + args.arcface_margin)

        one_hot = layers.one_hot(label, depth=args.class_num)

        # the one-hot mask keeps the margin version only at the label's
        # position; every other entry keeps the plain cosine
        diff = (margin_cos - cos) * one_hot
        target_cos = cos + diff
        logit = layers.scale(target_cos, scale=args.arcface_scale)

        loss = layers.softmax_with_cross_entropy(logit, label)
        cost = layers.mean(loss)

        return cost

    def parall_classify_subnet(self, feature, label):
        """Build the loss via ``_distributed_arcface_classify``.

        Args:
            feature: input feature variable, passed through as ``x``.
            label: class-id variable, passed through as ``label``.

        Returns:
            Whatever ``_distributed_arcface_classify`` returns (used as the
            cost by the caller).
        """
        args = self.args
        # Ceil division: rows of the weight matrix assigned to each rank.
        shard_dim = (args.class_num + args.nranks - 1) // args.nranks
        shard_start = shard_dim * args.rank
        # NumPy slicing truncates at the array end, so the last rank simply
        # gets fewer rows when class_num is not a multiple of nranks.
        rank_param_value = self.param_value[shard_start:(shard_start + shard_dim
                                                         ), :]
        cost = layers.dist_algo._distributed_arcface_classify(
            x=feature,
            label=label,
            class_num=args.class_num,
            nranks=args.nranks,
            rank_id=args.rank,
            margin=args.arcface_margin,
            logit_scale=args.arcface_scale,
            param_attr=NumpyArrayInitializer(rank_param_value))
        return cost


# Script entry point: pass the runner class to runtime_main, imported from
# test_dist_collective_base.
if __name__ == "__main__":
    runtime_main(DistArcfaceClassificationRunner)