From 5235ec53fd9f224c358ee8ffe4b71d1cc4fc4853 Mon Sep 17 00:00:00 2001
From: ronnywang
Date: Thu, 1 Sep 2022 17:19:55 +0800
Subject: [PATCH] [NPU] add npu ops: instance_norm, conv3d_transpose (#45636)

* [NPU] add conv3d_transpose, instance_norm, instance_norm_grad
* add ut
* remove instance_norm_grad
---
 .../fluid/operators/conv_transpose_op_npu.cc  | 104 +++++++
 .../fluid/operators/instance_norm_op_npu.cc   |  93 +++++++
 .../npu/test_conv3d_transpose_op_npu.py       | 258 ++++++++++++++++++
 .../npu/test_instance_norm_op_npu.py          |  90 ++++++
 4 files changed, 545 insertions(+)
 create mode 100644 paddle/fluid/operators/instance_norm_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_conv3d_transpose_op_npu.py
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_instance_norm_op_npu.py

diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc
index 181ecf2ca5..94a6825ff6 100644
--- a/paddle/fluid/operators/conv_transpose_op_npu.cc
+++ b/paddle/fluid/operators/conv_transpose_op_npu.cc
@@ -199,6 +199,106 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class Conv3DTransposeNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const Tensor* filter = ctx.Input<Tensor>("Filter");
+    Tensor* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>(ctx.GetPlace());
+    std::vector<int> output_padding =
+        ctx.Attr<std::vector<int>>("output_padding");
+    const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    int groups = ctx.Attr<int>("groups");
+    const std::string padding_algorithm =
+        ctx.Attr<std::string>("padding_algorithm");
+
+    // check dimension
+    const bool channel_last = data_format == "NHWC";
+
+    if (data_format == "NHWC") {
+      data_format = "NDHWC";
+    } else {
+      data_format = "NCDHW";
+    }
+
+    // update padding and dilation
+    auto in_dims = input->dims();
+    auto filter_dims = filter->dims();
+    framework::DDim in_data_dims;
+    framework::DDim filter_data_dims;
+
+    if (channel_last) {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    }
+    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
+
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    phi::UpdatePaddingAndDilation(
+        &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
+
+    // construct NPU attr
+    std::vector<int> strides(5, 1);
+    std::vector<int> dilations(5, 1);
+
+    Tensor input_tensor, output_tensor, filter_tensor;
+    input_tensor.Resize(input->dims());
+    input_tensor.ShareDataWith(*input);
+    output_tensor.Resize(output->dims());
+    output_tensor.ShareDataWith(*output);
+    filter_tensor.Resize(filter->dims());
+    filter_tensor.ShareDataWith(*filter);
+
+    PADDLE_ENFORCE_EQ(
+        dilation[0],
+        1,
+        platform::errors::InvalidArgument(
+            "dilation[0] must be equal to 1, but received %d.", dilation[0]));
+
+    if (channel_last) {
+      input_tensor.set_layout(DataLayout::kNDHWC);
+      output_tensor.set_layout(DataLayout::kNDHWC);
+      strides[1] = stride[0];
+      strides[2] = stride[1];
+      strides[3] = stride[2];
+      dilations[2] = dilation[1];
+      dilations[3] = dilation[2];
+    } else {
+      input_tensor.set_layout(DataLayout::kNCDHW);
+      output_tensor.set_layout(DataLayout::kNCDHW);
+      strides[2] = stride[0];
+      strides[3] = stride[1];
+      strides[4] = stride[2];
+      dilations[3] = dilation[1];
+      dilations[4] = dilation[2];
+    }
+    filter_tensor.set_layout(DataLayout::kNCDHW);
+
+    auto output_dim_vec = phi::vectorize(output_tensor.dims());
+
+    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
+
+    NpuOpRunner runner;
+    runner.SetType("Conv3DBackpropInputD")
+        .AddInput(filter_tensor)
+        .AddInput(input_tensor)
+        .AddAttr("input_size", output_dim_vec)
+        .AddAttr("strides", strides)
+        .AddAttr("pads", padding)
+        .AddAttr("dilations", dilations)
+        .AddAttr("groups", groups)
+        .AddAttr("data_format", data_format)
+        .AddOutput(output_tensor);
+    runner.Run(dev_ctx.stream());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -212,3 +312,7 @@ REGISTER_OP_NPU_KERNEL(conv2d_transpose,
 REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad,
                        ops::Conv2DTransposeGradNPUKernel<float>,
                        ops::Conv2DTransposeGradNPUKernel<plat::float16>);
+
+REGISTER_OP_NPU_KERNEL(conv3d_transpose,
+                       ops::Conv3DTransposeNPUKernel<float>,
+                       ops::Conv3DTransposeNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc
new file mode 100644
index 0000000000..89c6a310d7
--- /dev/null
+++ b/paddle/fluid/operators/instance_norm_op_npu.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class InstanceNormNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto epsilon = ctx.Attr<float>("epsilon");
+    const auto* x = ctx.Input<Tensor>("X");
+    const auto* scale = ctx.Input<Tensor>("Scale");
+    const auto* bias = ctx.Input<Tensor>("Bias");
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("SavedMean");
+    auto* variance = ctx.Output<Tensor>("SavedVariance");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    dev_ctx.template Alloc<T>(y);
+    dev_ctx.template Alloc<T>(mean);
+    dev_ctx.template Alloc<T>(variance);
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+
+    PADDLE_ENFORCE(x_dims.size() <= 5 && x_dims.size() >= 3,
+                   platform::errors::InvalidArgument(
+                       "InstanceNorm only supports input with dimension "
+                       "greater than or equal to 3 and less than or equal "
+                       "to 5, but the dimension of input is %d.",
+                       x_dims.size()));
+
+    auto tmp_x_dims = phi::vectorize(x_dims);
+    auto tmp_y_dims = phi::vectorize(y_dims);
+    if (x_dims.size() < 5) {
+      for (size_t i = x_dims.size(); i < 5; ++i) {
+        tmp_x_dims.insert(tmp_x_dims.begin() + 2, 1);
+        tmp_y_dims.insert(tmp_y_dims.begin() + 2, 1);
+      }
+    }
+
+    Tensor tmp_x, tmp_y;
+    tmp_x.ShareDataWith(*x);
+
+    tmp_x.Resize(phi::make_ddim(tmp_x_dims));
+    tmp_x.set_layout(paddle::framework::DataLayout::NCDHW);
+    tmp_y.ShareDataWith(*y);
+    tmp_y.Resize(phi::make_ddim(tmp_y_dims));
+    tmp_y.set_layout(paddle::framework::DataLayout::NCDHW);
+
+    NpuOpRunner runner;
+
+    runner.SetType("InstanceNorm")
+        .AddInput(tmp_x)
+        .AddInput(*scale)
+        .AddInput(*bias)
+        .AddAttr("data_format", std::string("NCDHW"))
+        .AddAttr("epsilon", epsilon)
+        .AddOutput(tmp_y)
+        .AddOutput(*mean)
+        .AddOutput(*variance);
+    runner.Run(dev_ctx.stream());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(
+    instance_norm,
+    ops::InstanceNormNPUKernel<plat::NPUDeviceContext, float>,
+    ops::InstanceNormNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv3d_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv3d_transpose_op_npu.py
new file mode 100644
index 0000000000..0e5710e2a2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv3d_transpose_op_npu.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+def conv3dtranspose_forward_naive(input_, filter_, attrs):
+    padding_algorithm = attrs['padding_algorithm']
+    if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
+        raise ValueError("Unknown Attr(padding_algorithm): '%s'. "
+                         "It can only be 'SAME', 'VALID' or 'EXPLICIT'." %
+                         str(padding_algorithm))
+
+    if attrs['data_format'] == 'NHWC':
+        input_ = np.transpose(input_, [0, 4, 1, 2, 3])
+    in_n, in_c, in_d, in_h, in_w = input_.shape
+    f_c, f_out_c, f_d, f_h, f_w = filter_.shape
+    groups = attrs['groups']
+    assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c // groups
+
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+
+    def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
+        padding = []
+        for input_size, filter_size, stride_size in zip(input_shape,
+                                                        kernel_size,
+                                                        kernel_stride):
+            out_size = int((input_size + stride_size - 1) / stride_size)
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_0 = int(pad_sum / 2)
+            pad_1 = int(pad_sum - pad_0)
+            padding.append(pad_0)
+            padding.append(pad_1)
+        return padding
+
+    ksize = filter_.shape[2:5]
+    if padding_algorithm == "VALID":
+        pad = [0, 0, 0, 0, 0, 0]
+    elif padding_algorithm == "SAME":
+        dilations = [1, 1, 1]
+        input_data_shape = input_.shape[2:5]
+        pad = _get_padding_with_SAME(input_data_shape, ksize, stride)
+
+    pad_d_0, pad_d_1 = pad[0], pad[0]
+    pad_h_0, pad_h_1 = pad[1], pad[1]
+    pad_w_0, pad_w_1 = pad[2], pad[2]
+    if len(pad) == 6:
+        pad_d_0, pad_d_1 = pad[0], pad[1]
+        pad_h_0, pad_h_1 = pad[2], pad[3]
+        pad_w_0, pad_w_1 = pad[4], pad[5]
+
+    d_block_d = dilations[0] * (f_d - 1) + 1
+    d_block_h = dilations[1] * (f_h - 1) + 1
+    d_block_w = dilations[2] * (f_w - 1) + 1
+    out_d = (in_d - 1) * stride[0] + d_block_d
+    out_h = (in_h - 1) * stride[1] + d_block_h
+    out_w = (in_w - 1) * stride[2] + d_block_w
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    for n in range(in_n):
+        for d in range(in_d):
+            for i in range(in_h):
+                for j in range(in_w):
+                    for g in range(groups):
+                        input_masked = input_[n,
+                                              g * sub_in_c:(g + 1) * sub_in_c,
+                                              d, i, j]  # (c)
+                        input_masked = np.reshape(input_masked,
+                                                  (sub_in_c, 1, 1, 1))
+                        input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                        for k in range(f_out_c):
+                            tmp_out = np.sum(input_masked *
+                                             filter_[g * sub_in_c:(g + 1) *
+                                                     sub_in_c, k, :, :, :],
+                                             axis=0)
+                            d1, d2 = d * stride[0], d * stride[0] + d_block_d
+                            i1, i2 = i * stride[1], i * stride[1] + d_block_h
+                            j1, j2 = j * stride[2], j * stride[2] + d_block_w
+                            out[n, g * f_out_c + k, d1:d2:dilations[0],
+                                i1:i2:dilations[1],
+                                j1:j2:dilations[2]] += tmp_out
+
+    out = out[:, :, pad_d_0:out_d - pad_d_1, pad_h_0:out_h - pad_h_1,
+              pad_w_0:out_w - pad_w_1]
+    if attrs['data_format'] == 'NHWC':
+        out = np.transpose(out, [0, 2, 3, 4, 1])
+    return out
+
+
+class TestConv3DTransposeOp(OpTest):
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def setUp(self):
+        # init as conv transpose
+        self.check_no_input = False
+        self.check_no_filter = False
+        self.data_format = 'NCHW'
+        self.pad = [0, 0, 0]
+        self.padding_algorithm = "EXPLICIT"
+        self.init_op_type()
+        self.init_test_case()
+        self.set_npu()
+
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'padding_algorithm': self.padding_algorithm,
+            'dilations': self.dilations,
+            'groups': self.groups,
+            'data_format': self.data_format
+        }
+
+        output = conv3dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype("float32")
+
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=8e-3)
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 1, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 1, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose"
+
+
+class TestWithSymmetricPad(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.check_no_input = True
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithAsymmetricPad(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.pad = [1, 0, 1, 0, 1, 2]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithSAMEPad(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.stride = [1, 1, 2]
+        self.dilations = [1, 2, 1]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 6]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 4]
+        self.padding_algorithm = 'SAME'
+
+
+class TestWithVALIDPad(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.stride = [2, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 4, 3]
+        self.padding_algorithm = 'VALID'
+
+
+class TestWithStride(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.check_no_filter = True
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithDilation(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 2, 2]
+        self.groups = 1
+        self.input_size = [1, 2, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class Test_NHWC(TestConv3DTransposeOp):
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 1
+        self.input_size = [1, 5, 5, 5, 2]  # NDHWC
+        f_c = self.input_size[-1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+        self.data_format = 'NHWC'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_instance_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_instance_norm_op_npu.py
new file mode 100644
index 0000000000..c7363194e9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_instance_norm_op_npu.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+
+import paddle
+from paddle import fluid
+from paddle.static import Program, program_guard
+from paddle.fluid import core
+from paddle.fluid.op import Operator
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import _test_eager_guard
+
+paddle.enable_static()
+
+
+class TestInstanceNorm(unittest.TestCase):
+
+    def test_dygraph(self):
+        places = [fluid.NPUPlace(0)]
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute_v1(x):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.InstanceNorm(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            def compute_v2(x):
+                with fluid.dygraph.guard(p):
+                    bn = paddle.nn.InstanceNorm2D(shape[1])
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            np.testing.assert_allclose(y1, y2, rtol=1e-03)
+
+    def test_static(self):
+        places = [fluid.NPUPlace(0)]
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute_v1(x_np):
+                with program_guard(Program(), Program()):
+                    ins = fluid.dygraph.InstanceNorm(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            def compute_v2(x_np):
+                with program_guard(Program(), Program()):
+                    ins = paddle.nn.InstanceNorm2D(shape[1])
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = ins(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute_v1(x)
+            y2 = compute_v2(x)
+            np.testing.assert_allclose(y1, y2, rtol=1e-03)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
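
Usage note (not part of the patch): a minimal static-graph sketch of how the newly registered conv3d_transpose NPU kernel could be exercised end to end. It mirrors the NCDHW shapes used in TestConv3DTransposeOp above; paddle.NPUPlace(0) and an Ascend-enabled Paddle build are assumptions, and paddle.nn.functional.conv3d_transpose is used here only as a convenient front end for the conv3d_transpose op.

import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # NCDHW input and [in_c, out_c, kD, kH, kW] filter, as in the unit tests.
    x = paddle.static.data(name='x', shape=[1, 2, 5, 5, 5], dtype='float32')
    w = paddle.static.data(name='w', shape=[2, 6, 3, 3, 3], dtype='float32')
    y = paddle.nn.functional.conv3d_transpose(x, w, stride=1, padding=0)

exe = paddle.static.Executor(paddle.NPUPlace(0))  # assumes an NPU build of Paddle
exe.run(startup_prog)
out, = exe.run(main_prog,
               feed={
                   'x': np.random.rand(1, 2, 5, 5, 5).astype('float32'),
                   'w': np.random.rand(2, 6, 3, 3, 3).astype('float32')
               },
               fetch_list=[y])
print(out.shape)  # (1, 6, 7, 7, 7): (in - 1) * stride + kernel per spatial dim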
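
Similarly, a small dygraph sketch for the new instance_norm kernel: it cross-checks the NPU result against the CPU result with paddle.nn.InstanceNorm3D on a 5-D NCDHW input, the layout the kernel pads to internally. The 'npu:0' device string and the tolerances are assumptions.

import numpy as np
import paddle

x = np.random.randn(2, 3, 4, 8, 8).astype('float32')  # NCDHW, 5-D input

paddle.disable_static()
paddle.set_device('cpu')
ref = paddle.nn.InstanceNorm3D(3)(paddle.to_tensor(x)).numpy()

paddle.set_device('npu:0')  # assumes an Ascend NPU build exposing 'npu' devices
out = paddle.nn.InstanceNorm3D(3)(paddle.to_tensor(x)).numpy()

np.testing.assert_allclose(out, ref, rtol=1e-3, atol=1e-3)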