[NPU] Add iou_similarity op (#36412)

* [NPU] Add iou_similarity op * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op

[NPU] Add iou_similarity op (#36412)
* [NPU] Add iou_similarity op * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op
999242e3 · zhulei · GitHub · f2612462 · 999242e3 · 999242e3
3 changed files
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -64,6 +64,8 @@ endif()

 if(WITH_XPU)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc)
+elseif(WITH_ASCEND_CL)
+  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc)
 else()
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu)
 endif()

--- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/iou_similarity_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+/**
+ * Small helper that issues individual NPU operators for the iou_similarity
+ * kernel. Every method builds an NpuOpRunner for one named operator and runs
+ * it on the stream obtained from the execution context in the constructor.
+ *
+ * None of the methods allocate: the destination tensor (the last pointer
+ * argument) must already have been initialised by the caller.
+ */
+template <typename T>
+struct IouFunction {
+ public:
+  explicit IouFunction(const framework::ExecutionContext& ctx)
+      : place(ctx.GetPlace()),
+        stream(
+            ctx.template device_context<paddle::platform::NPUDeviceContext>()
+                .stream()),
+        ctx(ctx) {}
+
+  // Runs "TransposeD" on x into y, passing `axis` as the "perm" attribute.
+  void Transpose(const Tensor* x, Tensor* y, const std::vector<int>& axis) {
+    const NpuOpRunner runner("TransposeD", {*x}, {*y}, {{"perm", axis}});
+    runner.Run(stream);
+  }
+
+  // Runs "AddV2" on (x, y) into z.
+  void Add(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("AddV2", x, y, z);
+  }
+
+  // Runs "Sub" on (x, y) into z.
+  void Sub(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("Sub", x, y, z);
+  }
+
+  // Runs "Mul" on (x, y) into z.
+  void Mul(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("Mul", x, y, z);
+  }
+
+  // Runs "DivNoNan" on (x, y) into z.
+  void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("DivNoNan", x, y, z);
+  }
+
+  // Runs "Adds" on x into y, passing `scalar` as the "value" attribute.
+  void Adds(const Tensor* x, float scalar, Tensor* y) {
+    const NpuOpRunner runner("Adds", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+
+  // Runs "Maximum" on (x, y) into z.
+  void Maximum(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("Maximum", x, y, z);
+  }
+
+  // Runs "Minimum" on (x, y) into z.
+  void Minimum(const Tensor* x, const Tensor* y, Tensor* z) {
+    RunBinary("Minimum", x, y, z);
+  }
+
+ private:
+  // Shared body of the two-input, one-output, attribute-free operators.
+  void RunBinary(const char* op_type, const Tensor* lhs, const Tensor* rhs,
+                 Tensor* result) {
+    const NpuOpRunner runner(op_type, {*lhs, *rhs}, {*result}, {});
+    runner.Run(stream);
+  }
+
+  platform::Place place;
+  aclrtStream stream;
+  const framework::ExecutionContext& ctx;
+};
+
+/**
+ * NPU kernel for the iou_similarity operator.
+ *
+ * Takes boxes from input "X" (N rows) and input "Y" (M rows) and fills the
+ * N x M output "Out" with one score per X/Y pair: intersection area divided
+ * by (area of X box + area of Y box - intersection area). Box columns are
+ * consumed in the order xmin, ymin, xmax, ymax. When the "box_normalized"
+ * attribute is false, 1 is added to every width and height.
+ */
+template <typename T>
+class IouSimilarityNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* boxes_x = ctx.Input<framework::LoDTensor>("X");
+    const auto* boxes_y = ctx.Input<framework::Tensor>("Y");
+    const bool normalized = ctx.Attr<bool>("box_normalized");
+    auto* iou = ctx.Output<framework::LoDTensor>("Out");
+
+    const auto dtype = boxes_x->type();
+    const auto place = ctx.GetPlace();
+
+    IouFunction<T> fn(ctx);
+
+    const auto num_x = boxes_x->dims()[0];
+    const auto num_y = boxes_y->dims()[0];
+
+    // Creates a tensor of the input's dtype with storage for `dims`.
+    auto alloc = [&](const framework::DDim& dims) {
+      Tensor t(dtype);
+      t.mutable_data<T>(dims, place);
+      return t;
+    };
+    // Takes row `row` of `src` and relabels its shape as `shape`.
+    auto row_as = [](const Tensor& src, int64_t row,
+                     const framework::DDim& shape) {
+      Tensor view = src.Slice(row, row + 1);
+      view.Resize(shape);
+      return view;
+    };
+
+    iou->mutable_data<T>({num_x, num_y}, place);
+
+    // Transpose both inputs to 4 x count so that each coordinate becomes one
+    // row that can be sliced out on its own.
+    Tensor x_cols = alloc({4, num_x});
+    Tensor y_cols = alloc({4, num_y});
+    const std::vector<int> swap_axes = {1, 0};
+    fn.Transpose(boxes_x, &x_cols, swap_axes);
+    fn.Transpose(boxes_y, &y_cols, swap_axes);
+
+    // X coordinates are shaped N x 1 and Y coordinates 1 x M; the pairwise
+    // operators below write N x M results from these operands.
+    Tensor xmin1 = row_as(x_cols, 0, {num_x, 1});
+    Tensor ymin1 = row_as(x_cols, 1, {num_x, 1});
+    Tensor xmax1 = row_as(x_cols, 2, {num_x, 1});
+    Tensor ymax1 = row_as(x_cols, 3, {num_x, 1});
+    Tensor xmin2 = row_as(y_cols, 0, {1, num_y});
+    Tensor ymin2 = row_as(y_cols, 1, {1, num_y});
+    Tensor xmax2 = row_as(y_cols, 2, {1, num_y});
+    Tensor ymax2 = row_as(y_cols, 3, {1, num_y});
+
+    // Per-box width, height and area for each input.
+    Tensor w1 = alloc({num_x, 1});
+    Tensor h1 = alloc({num_x, 1});
+    Tensor w2 = alloc({1, num_y});
+    Tensor h2 = alloc({1, num_y});
+    Tensor area1 = alloc({num_x, 1});
+    Tensor area2 = alloc({1, num_y});
+    fn.Sub(&xmax1, &xmin1, &w1);
+    fn.Sub(&ymax1, &ymin1, &h1);
+    fn.Sub(&xmax2, &xmin2, &w2);
+    fn.Sub(&ymax2, &ymin2, &h2);
+    if (!normalized) {
+      fn.Adds(&w1, 1.0f, &w1);
+      fn.Adds(&h1, 1.0f, &h1);
+      fn.Adds(&w2, 1.0f, &w2);
+      fn.Adds(&h2, 1.0f, &h2);
+    }
+    fn.Mul(&w1, &h1, &area1);
+    fn.Mul(&w2, &h2, &area2);
+
+    // Corners of the intersection rectangle for every X/Y pair.
+    Tensor inter_xmax = alloc({num_x, num_y});
+    Tensor inter_ymax = alloc({num_x, num_y});
+    Tensor inter_xmin = alloc({num_x, num_y});
+    Tensor inter_ymin = alloc({num_x, num_y});
+    fn.Minimum(&xmax1, &xmax2, &inter_xmax);
+    fn.Minimum(&ymax1, &ymax2, &inter_ymax);
+    fn.Maximum(&xmin1, &xmin2, &inter_xmin);
+    fn.Maximum(&ymin1, &ymin2, &inter_ymin);
+
+    Tensor inter_w = alloc({num_x, num_y});
+    Tensor inter_h = alloc({num_x, num_y});
+    fn.Sub(&inter_xmax, &inter_xmin, &inter_w);
+    fn.Sub(&inter_ymax, &inter_ymin, &inter_h);
+    if (!normalized) {
+      fn.Adds(&inter_w, 1.0f, &inter_w);
+      fn.Adds(&inter_h, 1.0f, &inter_h);
+    }
+
+    // Clamp negative extents (non-overlapping boxes) to zero.
+    Tensor zero = alloc({1});
+    FillNpuTensorWithConstant<T>(&zero, static_cast<T>(0));
+    fn.Maximum(&inter_w, &zero, &inter_w);
+    fn.Maximum(&inter_h, &zero, &inter_h);
+
+    // "Out" first holds the intersection area, then is divided in place by
+    // the union area.
+    fn.Mul(&inter_w, &inter_h, iou);
+    Tensor union_area = alloc({num_x, num_y});
+    fn.Add(&area1, &area2, &union_area);
+    fn.Sub(&union_area, iou, &union_area);
+    fn.DivNoNan(iou, &union_area, iou);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // shorthand for the kernel class below
+namespace plat = paddle::platform;  // shorthand for plat::float16
+
+REGISTER_OP_NPU_KERNEL(iou_similarity, ops::IouSimilarityNPUKernel<float>,  // float instantiation
+                       ops::IouSimilarityNPUKernel<plat::float16>);  // float16 instantiation
--- a/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import numpy.random as random
+import sys
+sys.path.append("..")
+import math
+import paddle
+from op_test import OpTest
+
+paddle.enable_static()
+
+np.random.seed(2021)
+
+
+class TestNpuIouSimilarityOp(OpTest):
+    def setUp(self):
+        self.op_type = "iou_similarity"
+        self.set_npu()
+        self.init_dtype()
+        self.set_init_config()
+        self.set_attrs()
+        self.set_inputs()
+        self.set_outputs()
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def set_init_config(self):
+        self.N = 2
+        self.M = 3
+        self.box_normalized = False
+        self.use_lod = False
+
+    def set_inputs(self):
+        self.boxes1 = random.rand(self.N, 4).astype(self.dtype)
+        self.boxes2 = random.rand(self.M, 4).astype(self.dtype)
+        if self.use_lod:
+            self.boxes1_lod = [[1 for _ in range(self.N)]]
+            self.inputs = {
+                'X': (self.boxes1, self.boxes1_lod),
+                'Y': self.boxes2
+            }
+        else:
+            self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
+
+    def set_attrs(self):
+        self.attrs = {"box_normalized": self.box_normalized}
+
+    def set_outputs(self):
+        self.output = random.rand(self.N, self.M).astype(self.dtype)
+        self._compute_iou()
+        self.outputs = {'Out': self.output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def _compute_iou(self, ):
+        for row in range(self.boxes1.shape[0]):
+            for col in range(self.boxes2.shape[0]):
+                xmin1, ymin1, xmax1, ymax1 = self.boxes1[row]
+                xmin2, ymin2, xmax2, ymax2 = self.boxes2[col]
+                if not self.box_normalized:
+                    area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1)
+                    area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1)
+                else:
+                    area1 = (ymax1 - ymin1) * (xmax1 - xmin1)
+                    area2 = (ymax2 - ymin2) * (xmax2 - xmin2)
+
+                inter_xmax = min(xmax1, xmax2)
+                inter_ymax = min(ymax1, ymax2)
+                inter_xmin = max(xmin1, xmin2)
+                inter_ymin = max(ymin1, ymin2)
+                inter_height = inter_ymax - inter_ymin
+                inter_width = inter_xmax - inter_xmin
+                if not self.box_normalized:
+                    inter_height += 1
+                    inter_width += 1
+                inter_height = max(inter_height, 0)
+                inter_width = max(inter_width, 0)
+                inter_area = inter_width * inter_height
+                union_area = area1 + area2 - inter_area
+                sim_score = inter_area / union_area
+                self.output[row, col] = sim_score
+
+
+class TestNpuIouSimilarityOpWithLoD(TestNpuIouSimilarityOp):
+    def set_init_config(self):
+        super(TestNpuIouSimilarityOpWithLoD, self).set_init_config()
+        self.box_normalized = True
+        self.use_lod = True
+
+
+class TestNpuIouSimilarityOpWithBoxNormalized(TestNpuIouSimilarityOp):
+    def set_init_config(self):
+        super(TestNpuIouSimilarityOpWithBoxNormalized, self).set_init_config()
+        self.box_normalized = True
+        self.use_lod = True
+
+
+def TestNpuIouSimilarityOpFp16(TestNpuIouSimilarityOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+if __name__ == '__main__':
+    unittest.main()