# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import numpy as np
from utils import extra_cc_args, extra_nvcc_args, paddle_includes

import paddle
from paddle import static
from paddle.utils.cpp_extension import get_build_directory, load
from paddle.utils.cpp_extension.extension_utils import run_cmd

# Because Windows does not use docker, the shared lib already exists in the
# cache dir; it will not be recompiled unless the shared lib is removed.
file = '{}\\custom_inplace\\custom_inplace.pyd'.format(get_build_directory())
if os.name == 'nt' and os.path.isfile(file):
    cmd = 'del {}'.format(file)
    run_cmd(cmd, True)

# Compile and load custom op Just-In-Time.
custom_inplace = load(
    name='custom_inplace',
    sources=['custom_inplace.cc'],
    extra_include_paths=paddle_includes,  # add for Coverage CI
    extra_cxx_cflags=extra_cc_args,  # test for cflags
    extra_cuda_cflags=extra_nvcc_args,  # test for cflags
    verbose=True,
)


def inplace_dynamic_add(phi_func, device, dtype, np_x, np_y):
    paddle.set_device(device)
    x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=True)
    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
    if phi_func:
        out = custom_inplace.custom_add(x, y)
    else:
        out = x.add_(y)

    out.backward()
    return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()


def inplace_static_add(func, device, dtype, np_x, np_y):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
            x.stop_gradient = False
            y.stop_gradient = False
            out = func(x, y)
            mean_out = paddle.mean(out)
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            x_v, out_v, x_grad_v, y_grad_v, out_grad_v = exe.run(
                static.default_main_program(),
                feed={
                    "x": np_x.astype(dtype),
                    "y": np_y.astype(dtype),
                },
                fetch_list=[
                    x.name,
                    out.name,
                    x.name + "@GRAD",
                    y.name + "@GRAD",
                    out.name + "@GRAD",
                ],
            )
    paddle.disable_static()
    return x_v, out_v, x_grad_v, y_grad_v, out_grad_v


def inplace_dynamic_add_vector(phi_func, device, dtype, np_inputs, np_y):
    paddle.set_device(device)
    inputs = [
        paddle.to_tensor(np_input, dtype=dtype, stop_gradient=True)
        for np_input in np_inputs
    ]
    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
    if phi_func:
        out = custom_inplace.custom_add_vec(inputs, y)
    else:
        out = [x.add_(y) for x in inputs]

    mean_out = paddle.mean(paddle.concat(out))
    mean_out.backward()
    return (
        np.concatenate([input.numpy() for input in inputs]),
        y.numpy(),
        np.concatenate([o.numpy() for o in out]),
        np.concatenate([input.grad.numpy() for input in inputs]),
        y.grad.numpy(),
    )
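

# Note on the static-graph helpers in this file: static.append_backward adds
# the backward ops for a program and names the gradient variables it creates
# with the "@GRAD" suffix, which is why the fetch lists request entries such
# as x.name + "@GRAD" to read gradients back out of the executor.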


def inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x1 = static.data(
                name="x1", shape=[None, np_inputs[0].shape[1]], dtype=dtype
            )
            x2 = static.data(
                name="x2", shape=[None, np_inputs[1].shape[1]], dtype=dtype
            )
            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
            x1.stop_gradient = False
            x2.stop_gradient = False
            y.stop_gradient = False
            if phi_func:
                out = custom_inplace.custom_add_vec([x1, x2], y)
            else:
                out = [paddle.add(x1, y), paddle.add(x2, y)]
            mean_out = paddle.mean(paddle.concat(out))
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            (
                out0_v,
                out1_v,
                x1_grad_v,
                x2_grad_v,
                y_grad_v,
                out0_grad_v,
                out1_grad_v,
            ) = exe.run(
                static.default_main_program(),
                feed={
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype),
                    "y": np_y.astype(dtype),
                },
                fetch_list=[
                    out[0].name,
                    out[1].name,
                    x1.name + "@GRAD",
                    x2.name + "@GRAD",
                    y.name + "@GRAD",
                    out[0].name + "@GRAD",
                    out[1].name + "@GRAD",
                ],
            )
    paddle.disable_static()
    return (
        [out0_v, out1_v],
        [x1_grad_v, x2_grad_v],
        y_grad_v,
        [out0_grad_v, out1_grad_v],
    )


def inplace_dynamic_relu_net(phi_func, device, dtype, np_x, np_y, np_z):
    paddle.set_device(device)
    x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
    z = paddle.to_tensor(np_z, dtype=dtype, stop_gradient=False)
    out_xy = x + y
    if phi_func:
        out_xy = custom_inplace.custom_relu_inplace(out_xy)
        out_xyz = out_xy + z
        out = custom_inplace.custom_relu_inplace(out_xyz)
    else:
        out_xy = paddle.nn.functional.relu_(out_xy)
        out_xyz = out_xy + z
        out = paddle.nn.functional.relu_(out_xyz)

    out.backward()
    return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()


def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
            z = static.data(name="z", shape=[None, np_z.shape[1]], dtype=dtype)
            x.stop_gradient = False
            y.stop_gradient = False
            z.stop_gradient = False
            out_xy = x + y
            out_xy = func(out_xy)
            out_xyz = out_xy + z
            out = func(out_xyz)
            mean_out = paddle.mean(out)
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            x_v, y_v, out_v, x_grad_v, y_grad_v = exe.run(
                static.default_main_program(),
                feed={
                    "x": np_x.astype(dtype),
                    "y": np_y.astype(dtype),
                    "z": np_z.astype(dtype),
                },
                fetch_list=[
                    x.name,
                    y.name,
                    out.name,
                    x.name + "@GRAD",
                    y.name + "@GRAD",
                ],
            )
    paddle.disable_static()
    return x_v, y_v, out_v, x_grad_v, y_grad_v


def dynamic_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b):
    paddle.set_device(device)
    x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=True)
    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
    a = paddle.to_tensor(np_a, dtype=dtype, stop_gradient=True)
    b = paddle.to_tensor(np_b, dtype=dtype, stop_gradient=False)
    if phi_func:
        out_xy, out_ab = custom_inplace.custom_multi_inplace(x, y, a, b)
    else:
        out_xy = x.add_(y)
        out_ab = a.add_(b)

    out = out_xy + out_ab
    out.backward()
    return (
        x.numpy(),
        y.numpy(),
        out_xy.numpy(),
        x.grad.numpy(),
        y.grad.numpy(),
        a.numpy(),
        b.numpy(),
        out_ab.numpy(),
        a.grad.numpy(),
        b.grad.numpy(),
    )
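

# For context, a minimal sketch of how the multi-inplace op is assumed to be
# registered in custom_inplace.cc (illustrative only, not verbatim from that
# file; the kernel name MultiInplaceForward is hypothetical). The key piece
# is SetInplaceMap, which tells Paddle each output reuses its input's buffer:
#
#   PD_BUILD_OP(custom_multi_inplace)
#       .Inputs({"X", "Y", "A", "B"})
#       .Outputs({"OutXY", "OutAB"})
#       .SetInplaceMap({{"X", "OutXY"}, {"A", "OutAB"}})
#       .SetKernelFn(PD_KERNEL(MultiInplaceForward));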


def static_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b):
    paddle.enable_static()
    paddle.set_device(device)
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
            a = static.data(name="a", shape=[None, np_a.shape[1]], dtype=dtype)
            b = static.data(name="b", shape=[None, np_b.shape[1]], dtype=dtype)
            x.stop_gradient = False
            y.stop_gradient = False
            a.stop_gradient = False
            b.stop_gradient = False
            if phi_func:
                out_xy, out_ab = custom_inplace.custom_multi_inplace(
                    x, y, a, b
                )
            else:
                out_xy = paddle.add(x, y)
                out_ab = paddle.add(a, b)
            mean_out = paddle.mean(paddle.add(out_xy, out_ab))
            static.append_backward(mean_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            (
                x_v,
                out_xy_v,
                x_grad_v,
                y_grad_v,
                out_xy_grad_v,
                a_v,
                out_ab_v,
                a_grad_v,
                b_grad_v,
                out_ab_grad_v,
            ) = exe.run(
                static.default_main_program(),
                feed={
                    "x": np_x.astype(dtype),
                    "y": np_y.astype(dtype),
                    "a": np_a.astype(dtype),
                    "b": np_b.astype(dtype),
                },
                fetch_list=[
                    x.name,
                    out_xy.name,
                    x.name + "@GRAD",
                    y.name + "@GRAD",
                    out_xy.name + "@GRAD",
                    a.name,
                    out_ab.name,
                    a.name + "@GRAD",
                    b.name + "@GRAD",
                    out_ab.name + "@GRAD",
                ],
            )
    paddle.disable_static()
    return (
        x_v,
        out_xy_v,
        x_grad_v,
        y_grad_v,
        out_xy_grad_v,
        a_v,
        out_ab_v,
        a_grad_v,
        b_grad_v,
        out_ab_grad_v,
    )


class TestCustomInplaceJit(unittest.TestCase):
    def setUp(self):
        self.dtypes = ['float32', 'float64']
        self.devices = ['cpu']
        self.np_x = np.random.random((3, 2)).astype("float32")
        self.np_y = np.random.random((3, 2)).astype("float32")
        self.np_z = np.random.random((3, 2)).astype("float32")
        self.np_a = np.random.random((3, 2)).astype("float32")
        self.np_b = np.random.random((3, 2)).astype("float32")
        self.np_inputs = [
            np.random.random((3, 2)).astype("float32"),
            np.random.random((3, 2)).astype("float32"),
        ]

    def check_output(self, out, pd_out, name):
        np.testing.assert_array_equal(
            out,
            pd_out,
            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
                name, out, name, pd_out
            ),
        )

    def check_output_allclose(self, out, pd_out, name):
        np.testing.assert_allclose(
            out,
            pd_out,
            rtol=5e-5,
            atol=1e-2,
            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
                name, out, name, pd_out
            ),
        )

    def test_static_add(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                    pd_out_grad,
                ) = inplace_static_add(
                    paddle.add,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                )
                (
                    phi_x,
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                    phi_out_grad,
                ) = inplace_static_add(
                    custom_inplace.custom_add,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                )
                self.check_output(phi_x, phi_out, "inplace_phi_x")
                self.check_output(
                    phi_x_grad, phi_out_grad, "inplace_phi_x_grad"
                )

                self.check_output(phi_out, pd_out, "out")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
                self.check_output(phi_out_grad, pd_out_grad, "out_grad")

    def test_dynamic_add(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_y,
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                ) = inplace_dynamic_add(
                    False,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                )
                (
                    phi_x,
                    phi_y,
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                ) = inplace_dynamic_add(
                    True,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                )

                self.check_output(phi_x, phi_out, "inplace_phi_x")
                self.check_output(pd_x, pd_out, "inplace_pd_x")

                self.check_output(phi_x, pd_x, "x")
                self.check_output(phi_y, pd_y, "y")
                self.check_output(phi_out, pd_out, "out")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
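
    # Each test pairs a reference run through the native Paddle API (pd_*)
    # with a run through the JIT-compiled custom op (phi_*), then compares
    # outputs and gradients elementwise. The "inplace_*" checks additionally
    # assert that the op wrote through to its input buffer, i.e. the input
    # now holds the same values as the output.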

    def test_static_add_vector(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                    pd_out_grad,
                ) = inplace_static_add_vector(
                    False,
                    device,
                    dtype,
                    self.np_inputs,
                    self.np_y,
                )
                (
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                    phi_out_grad,
                ) = inplace_static_add_vector(
                    True,
                    device,
                    dtype,
                    self.np_inputs,
                    self.np_y,
                )

                self.check_output(phi_out, pd_out, "out")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
                self.check_output(phi_out_grad, pd_out_grad, "out_grad")

    def test_dynamic_add_vector(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_y,
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                ) = inplace_dynamic_add_vector(
                    False,
                    device,
                    dtype,
                    self.np_inputs,
                    self.np_y,
                )
                (
                    phi_x,
                    phi_y,
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                ) = inplace_dynamic_add_vector(
                    True,
                    device,
                    dtype,
                    self.np_inputs,
                    self.np_y,
                )

                self.check_output(phi_x, phi_out, "inplace_phi_x")
                self.check_output(pd_x, pd_out, "inplace_pd_x")

                self.check_output(phi_x, pd_x, "x")
                self.check_output(phi_y, pd_y, "y")
                self.check_output(phi_out, pd_out, "out")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")

    def test_static_relu_net(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_y,
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                ) = inplace_static_relu_net(
                    paddle.nn.functional.relu,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_z,
                )
                (
                    phi_x,
                    phi_y,
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                ) = inplace_static_relu_net(
                    custom_inplace.custom_relu_inplace,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_z,
                )
                self.check_output_allclose(phi_x, pd_x, "x")
                self.check_output_allclose(phi_y, pd_y, "y")
                self.check_output_allclose(phi_out, pd_out, "out")
                self.check_output_allclose(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output_allclose(phi_y_grad, pd_y_grad, "y_grad")

    def test_dynamic_relu_net(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_y,
                    pd_out,
                    pd_x_grad,
                    pd_y_grad,
                ) = inplace_dynamic_relu_net(
                    False,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_z,
                )
                (
                    phi_x,
                    phi_y,
                    phi_out,
                    phi_x_grad,
                    phi_y_grad,
                ) = inplace_dynamic_relu_net(
                    True,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_z,
                )

                self.check_output(phi_x, pd_x, "x")
                self.check_output(phi_y, pd_y, "y")
                self.check_output(phi_out, pd_out, "out")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")

    def test_static_multi_inplace(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_out_xy,
                    pd_x_grad,
                    pd_y_grad,
                    pd_out_xy_grad,
                    pd_a,
                    pd_out_ab,
                    pd_a_grad,
                    pd_b_grad,
                    pd_out_ab_grad,
                ) = static_multi_inplace(
                    False,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_a,
                    self.np_b,
                )
                (
                    phi_x,
                    phi_out_xy,
                    phi_x_grad,
                    phi_y_grad,
                    phi_out_xy_grad,
                    phi_a,
                    phi_out_ab,
                    phi_a_grad,
                    phi_b_grad,
                    phi_out_ab_grad,
                ) = static_multi_inplace(
                    True,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_a,
                    self.np_b,
                )
                self.check_output(phi_x, pd_out_xy, "inplace_phi_x")
                self.check_output(
                    phi_x_grad, phi_out_xy_grad, "inplace_phi_x_grad"
                )
                self.check_output(phi_a, pd_out_ab, "inplace_phi_a")
                self.check_output(
                    phi_a_grad, phi_out_ab_grad, "inplace_phi_a_grad"
                )

                self.check_output(phi_out_xy, pd_out_xy, "outxy")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
                self.check_output(
                    phi_out_xy_grad, pd_out_xy_grad, "outxy_grad"
                )
                self.check_output(phi_out_ab, pd_out_ab, "outab")
                self.check_output(phi_a_grad, pd_a_grad, "a_grad")
                self.check_output(phi_b_grad, pd_b_grad, "b_grad")
                self.check_output(
                    phi_out_ab_grad, pd_out_ab_grad, "outab_grad"
                )
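
    # In eager mode x.add_(y) returns x itself, so pd_x and pd_out_xy share
    # storage; the custom op path is expected to alias its inputs the same
    # way, which is what the inplace_* checks below assert.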
    def test_dynamic_multi_inplace(self):
        for device in self.devices:
            for dtype in self.dtypes:
                (
                    pd_x,
                    pd_y,
                    pd_out_xy,
                    pd_x_grad,
                    pd_y_grad,
                    pd_a,
                    pd_b,
                    pd_out_ab,
                    pd_a_grad,
                    pd_b_grad,
                ) = dynamic_multi_inplace(
                    False,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_a,
                    self.np_b,
                )
                (
                    phi_x,
                    phi_y,
                    phi_out_xy,
                    phi_x_grad,
                    phi_y_grad,
                    phi_a,
                    phi_b,
                    phi_out_ab,
                    phi_a_grad,
                    phi_b_grad,
                ) = dynamic_multi_inplace(
                    True,
                    device,
                    dtype,
                    self.np_x,
                    self.np_y,
                    self.np_a,
                    self.np_b,
                )

                self.check_output(phi_x, phi_out_xy, "inplace_phi_x")
                self.check_output(pd_x, pd_out_xy, "inplace_pd_x")
                self.check_output(phi_a, phi_out_ab, "inplace_phi_a")
                self.check_output(pd_a, pd_out_ab, "inplace_pd_a")

                self.check_output(phi_x, pd_x, "x")
                self.check_output(phi_y, pd_y, "y")
                self.check_output(phi_out_xy, pd_out_xy, "outxy")
                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
                self.check_output(phi_a, pd_a, "a")
                self.check_output(phi_b, pd_b, "b")
                self.check_output(phi_out_ab, pd_out_ab, "outab")
                self.check_output(phi_a_grad, pd_a_grad, "a_grad")
                self.check_output(phi_b_grad, pd_b_grad, "b_grad")


if __name__ == "__main__":
    unittest.main()