Commit 907150a4 authored by zhupengyang, committed by GitHub

[NPU] enhance elementwise uts (#2784)

* [NPU] reshape x,y,out node in elementwise ops
Parent a11eaf6a
@@ -21,28 +21,42 @@ namespace lite {
namespace subgraph {
namespace npu {
std::vector<int64_t> CvtYShape(const DDim& x_dims,
const DDim& y_dims,
int axis) {
CHECK_EQ(x_dims.size(), 4UL) << "[NPU] Only support 4-dimension x";
CHECK_GE(x_dims.size(), y_dims.size());
void CvtYShape(std::vector<int64_t>* x_shape,
std::vector<int64_t>* y_shape,
int axis) {
CHECK_GE(x_shape->size(), y_shape->size());
if (axis < 0) {
axis += x_dims.size();
axis = x_shape->size() - y_shape->size();
}
std::vector<int64_t> y_new_shape(y_dims.Vectorize());
if (y_new_shape.size() == 4UL) {
return y_new_shape;
// only support:
// (n,c,h,w) * (n,c,h,w)
// (n,c,h,w) * (1,c,1,1)
// (n,c,h,w) * (1,c,h,1)
// (n,c,h,w) * (1,c,h,w)
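// Pad y to 4-D by inserting leading/trailing 1s according to its rank: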
int y_shape_size = y_shape->size();
if (y_shape_size == 1) {
y_shape->insert(y_shape->begin(), 1);
y_shape->insert(y_shape->end(), 2, 1);
} else if (y_shape_size == 2) {
y_shape->insert(y_shape->begin(), 1);
y_shape->insert(y_shape->end(), 1);
} else if (y_shape_size == 3) {
y_shape->insert(y_shape->begin(), 1);
}
for (int i = 0; i < axis; i++) {
y_new_shape.insert(y_new_shape.begin(), 1);
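// If y was originally lower-rank, collapse x's leading `axis` dims into one
// and pad the tail with 1s so that x is also 4-D and lines up with y.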
if (y_shape_size < 4) {
int n = 1;
for (int i = 0; i < axis; i++) {
n *= x_shape->at(i);
}
x_shape->erase(x_shape->begin(), x_shape->begin() + axis);
x_shape->insert(x_shape->begin(), n);
x_shape->insert(x_shape->end(), 4 - x_shape->size(), 1);
}
while (y_new_shape.size() < 4) {
y_new_shape.push_back(1);
}
CHECK_EQ(y_new_shape.size(), 4UL);
return y_new_shape;
CHECK_EQ(x_shape->size(), 4UL);
CHECK_EQ(y_shape->size(), 4UL);
}
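// Example traces of CvtYShape (a hand-worked sketch):
//   x = {1, 2, 3, 4}, y = {2},    axis = 1  ->  x = {1, 2, 3, 4}, y = {1, 2, 1, 1}
//   x = {2, 3, 4},    y = {3, 4}, axis = 1  ->  x = {2, 3, 4, 1}, y = {1, 3, 4, 1}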
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
@@ -61,32 +75,58 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
CHECK(y_type->precision() == PRECISION(kFloat));
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindMutableTensor(out_name);
auto out_dims = out->dims();
auto axis = op_info->GetAttr<int>("axis");
auto x_new_shape = x_dims.Vectorize();
auto y_new_shape = y_dims.Vectorize();
CvtYShape(&x_new_shape, &y_new_shape, axis);
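// Both shapes are now 4-D and broadcast-compatible; they are used when adding
// the X/Y graph nodes below, with an extra Reshape node inserted if a tensor
// already exists in the graph with its original shape.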
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
if (x_dims.Vectorize() != x_new_shape) {
auto reshaped_x_node = graph->Add<ge::op::Reshape>(x_name + "/reshape");
auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
reshaped_x_op->set_input_tensor(*x_node->data());
reshaped_x_op->set_attr_shape(
ge::AttrValue::LIST_INT(x_new_shape.begin(), x_new_shape.end()));
reshaped_x_op->set_attr_axis(0);
x_node = reshaped_x_node;
}
} else {
x_node = graph->Add(x_name, *x);
x_node = graph->Add(x_name, *x, x_new_shape);
}
// Y node
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
if (y_dims.Vectorize() != y_new_shape) {
auto reshaped_y_node = graph->Add<ge::op::Reshape>(y_name + "/reshape");
auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
reshaped_y_op->set_input_tensor(*y_node->data());
reshaped_y_op->set_attr_shape(
ge::AttrValue::LIST_INT(y_new_shape.begin(), y_new_shape.end()));
reshaped_y_op->set_attr_axis(0);
y_node = reshaped_y_node;
}
} else {
auto y_new_shape = CvtYShape(x_dims, y_dims, axis);
y_node = graph->Add(y_name, *y, y_new_shape);
}
@@ -98,17 +138,20 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto elt_op = elt_node->data<ge::op::Add>();
elt_op->set_input_x1(*x_node->data());
elt_op->set_input_x2(*y_node->data());
} else if (op_type == "elementwise_sub") {
} else if (op_type == "elementwise_sub" ||
op_type == "fusion_elementwise_sub_activation") {
elt_node = graph->Add<ge::op::Sub>(out_name);
auto elt_op = elt_node->data<ge::op::Sub>();
elt_op->set_input_x1(*x_node->data());
elt_op->set_input_x2(*y_node->data());
} else if (op_type == "elementwise_mul") {
} else if (op_type == "elementwise_mul" ||
op_type == "fusion_elementwise_mul_activation") {
elt_node = graph->Add<ge::op::Mul>(out_name);
auto elt_op = elt_node->data<ge::op::Mul>();
elt_op->set_input_x(*x_node->data());
elt_op->set_input_y(*y_node->data());
} else if (op_type == "elementwise_div") {
} else if (op_type == "elementwise_div" ||
op_type == "fusion_elementwise_div_activation") {
elt_node = graph->Add<ge::op::RealDiv>(out_name);
auto elt_op = elt_node->data<ge::op::RealDiv>();
elt_op->set_input_x1(*x_node->data());
@@ -118,8 +161,22 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
return FAILED;
}
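// If x/y were flattened above, reshape the elementwise result back to the
// original Out dims so downstream ops see the expected shape.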
if (out_dims.Vectorize() != x_new_shape) {
auto reshaped_elt_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_elt_op = reshaped_elt_node->data<ge::op::Reshape>();
reshaped_elt_op->set_input_tensor(*elt_node->data());
auto out_shape = out_dims.Vectorize();
reshaped_elt_op->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_elt_op->set_attr_axis(0);
elt_node = reshaped_elt_node;
}
// Act node
if (op_type == "fusion_elementwise_add_activation") {
if (op_type == "fusion_elementwise_add_activation" ||
op_type == "fusion_elementwise_sub_activation" ||
op_type == "fusion_elementwise_mul_activation" ||
op_type == "fusion_elementwise_div_activation") {
auto act_type = op_info->GetAttr<std::string>("act_type");
auto act_node = graph->Add<ge::op::Activation>(out_name);
auto act_op = act_node->data<ge::op::Activation>();
@@ -128,6 +185,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// clipped_relu etc.
act_op->set_attr_mode(CvtActMode(act_type));
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
@@ -139,9 +197,6 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
@@ -151,3 +206,15 @@ REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
REGISTER_SUBGRAPH_BRIDGE(elementwise_div,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_div_activation,
kNPU,
paddle::lite::subgraph::npu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/elementwise_ops.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {
template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindTensor("x");
auto y = scope->FindTensor("y");
auto out = scope->FindMutableTensor("out_ref");
out->Resize(x->dims());
auto x_data = x->data<dtype>();
auto y_data = y->data<dtype>();
auto out_data = out->mutable_data<dtype>();
auto x_dims = x->dims();
auto y_dims = y->dims();
int axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_dims.size();
}
int batch = 1;
int channels = y->numel();
int num = x->numel() / channels / batch;
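// This reference fixes batch to 1 and treats each y element as broadcasting
// over num = x->numel() / y->numel() consecutive x elements.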
// do elementwise add/sub/max...
std::string op_type = op_info->Type();
if (op_type == "elementwise_add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr + diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_sub") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr - diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_mul") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr * diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_div") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_max") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = std::max(*din_ptr, diny_data);
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << op_type;
}
}
void test_elementwise_add(const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
int axis,
std::string elt_type) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(x_shape);
y->Resize(y_shape);
// initialize input&output data
FillTensor<float>(x, 1, 3);
FillTensor<float>(y, 1, 3);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("elementwise_" + elt_type);
opdesc.SetInput("X", {x_var_name});
opdesc.SetInput("Y", {y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to NPU model, then run it on NPU
auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);
LauchOp(op, {x_var_name}, {out_var_name});
// execute reference implementation and save to output tensor
elementwise_add_ref<float>(op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(NPUBridges, elementwise_add) {
for (auto elt_type : {"add", "sub", "mul", "div"}) {
test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type);
test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type);
test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type);
}
}
} // namespace bridges
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(elementwise_add);
USE_NPU_BRIDGE(elementwise_add);
USE_LITE_OP(elementwise_sub);
USE_NPU_BRIDGE(elementwise_sub);
USE_LITE_OP(elementwise_mul);
USE_NPU_BRIDGE(elementwise_mul);
USE_LITE_OP(elementwise_div);
USE_NPU_BRIDGE(elementwise_div);
@@ -30,10 +30,13 @@ USE_SUBGRAPH_BRIDGE(conv2d_transpose, kNPU);
USE_SUBGRAPH_BRIDGE(dropout, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_div_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fc, kNPU);
USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU);