From 7cd585aa2fc2a11d3f928f6b736b1d65f93f1805 Mon Sep 17 00:00:00 2001
From: --get
Date: Wed, 27 May 2020 07:05:35 +0000
Subject: [PATCH] (feat): add norm mlu kernel and test

---
 lite/kernels/mlu/bridges/CMakeLists.txt       |   3 +
 lite/kernels/mlu/bridges/norm_op.cc           | 111 +++++++++++++
 lite/kernels/mlu/bridges/norm_op_test.cc      | 148 ++++++++++++++++++
 lite/kernels/mlu/bridges/paddle_use_bridges.h |   1 +
 4 files changed, 263 insertions(+)
 create mode 100644 lite/kernels/mlu/bridges/norm_op.cc
 create mode 100644 lite/kernels/mlu/bridges/norm_op_test.cc

diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
index 03b8386d88..8e90e1080f 100644
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -60,6 +60,8 @@ if (LITE_BUILD_EXTRA)
   list(APPEND mlu_subgraph_bridges subgraph_bridge_lrn_op_mlu)
   lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu})
   list(APPEND mlu_subgraph_bridges subgraph_bridge_gather_op_mlu)
+  lite_cc_library(subgraph_bridge_norm_op_mlu SRCS norm_op.cc DEPS ${subgraph_bridge_deps_mlu})
+  list(APPEND mlu_subgraph_bridges subgraph_bridge_norm_op_mlu)
 endif()
 
 lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
@@ -84,6 +86,7 @@ lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optim
 lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 if (LITE_BUILD_EXTRA)
+  lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
   lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
   lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 endif()
diff --git a/lite/kernels/mlu/bridges/norm_op.cc b/lite/kernels/mlu/bridges/norm_op.cc
new file mode 100644
index 0000000000..492c3932a8
--- /dev/null
+++ b/lite/kernels/mlu/bridges/norm_op.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  // Get input vars and op attributes
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
+  auto x_dims = x->dims().Vectorize();
+
+  auto out_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
+  auto output_dims = output->dims().Vectorize();
+  int axis = op_info->GetAttr<int>("axis");
+  float epsilon = op_info->GetAttr<float>("epsilon");
+  if (axis < 0) {
+    axis = axis + x_dims.size();
+  }
+  std::vector<int> nchw2nhwc = {0, 3, 1, 2};  // NCHW axis -> NHWC position
+  int nhwc_axis = nchw2nhwc[axis];
+
+  CHECK(graph->HasNode(x_var_name));
+  auto input_tensor = graph->GetNode(x_var_name);
+  auto output_tensor = graph->AddNode(
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
+
+  // ======== DEBUG ===============
+  VLOG(6) << "x name=" << x_var_name;
+  VLOG(6) << "out name=" << out_var_name;
+  VLOG(6) << "x dims=" << x->dims();
+  VLOG(6) << "out dims=" << output->dims();
+  VLOG(6) << "axis =" << axis;
+  VLOG(6) << "nhwc axis=" << nhwc_axis;
+  VLOG(6) << "epsilon =" << epsilon;
+  // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR);
+  // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR);
+  // ======== DEBUG END ============
+  cnmlBaseOp_t norm_op{nullptr};
+
+  cnmlNormalizeOpParam_t param;
+  int mode = -1;
+  switch (axis) {
+    case 0:
+      mode = 3;  // N
+      break;
+    case 1:
+      mode = 0;  // C
+      break;
+    case 2:
+      mode = 4;  // H
+      break;
+    case 3:
+      mode = 5;  // W
+      break;
+    default:
+      CHECK(0);
+      break;
+  }
+  cnmlCreateNormalizeOpParamV2(&param,
+                               0,  // p
+                               0,  // use_scale
+                               mode,
+                               1,  // weight
+                               epsilon);
+
+  CNML_CALL(cnmlCreateNormalizeOp(&norm_op,
+                                  param,
+                                  input_tensor->mlu_tensor(),
+                                  output_tensor->mlu_tensor(),
+                                  nullptr,
+                                  false /*is_fix8_mode*/));
+  graph->FuseOp(norm_op);
+  CNML_CALL(cnmlDestroyBaseOp(&norm_op));
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(norm,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::NormConverter);
diff --git a/lite/kernels/mlu/bridges/norm_op_test.cc b/lite/kernels/mlu/bridges/norm_op_test.cc
new file mode 100644
index 0000000000..35b5eabbb9
--- /dev/null
+++ b/lite/kernels/mlu/bridges/norm_op_test.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/norm_op.h"
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+// void ToFile(std::string file_name, Tensor* tensor) {
+//   int count = tensor->dims().production();
+//   auto data = tensor->mutable_data<float>();
+//   std::ostringstream outs;
+//   for (size_t i = 0; i < count; i++) {
+//     outs << data[i] << std::endl;
+//   }
+//   std::ofstream of;
+//   of.open(file_name, std::ios::out);
+//   of << outs.str();
+//   of.close();
+// }
+
+void norm_ref(const std::shared_ptr<operators::NormOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
+  auto out =
+      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
+  int axis = op_info->GetAttr<int>("axis");
+  float epsilon = op_info->GetAttr<float>("epsilon");
+  auto x_dims = x->dims();
+  if (axis < 0) {
+    axis += x_dims.size();
+  }
+  out->Resize(x_dims.Vectorize());
+  auto* out_data = out->mutable_data<float>();
+
+  const auto* x_data = x->data<float>();
+  int pre_n = x_dims.count(0, axis);
+  int n = x_dims[axis];
+  int post_n = x_dims.count(axis + 1, x_dims.size());
+  for (int i = 0; i < pre_n; i++) {
+    for (int k = 0; k < post_n; k++) {
+      float sum = epsilon;
+      const float* in_tmp = x_data + i * n * post_n + k;
+      for (int j = 0; j < n; j++) {
+        sum += in_tmp[j * post_n] * in_tmp[j * post_n];
+      }
+      sum = std::sqrt(sum);
+      float* out_tmp = out_data + i * n * post_n + k;
+      for (int j = 0; j < n; j++) {
+        out_tmp[j * post_n] = in_tmp[j * post_n] / sum;
+      }
+    }
+  }
+}
+
+void test_norm(const std::vector<int64_t>& input_shape, int axis) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name = "x";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+  // initialize input&output data
+  FillTensor<float>(x, -9, 9);
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  float epsilon = 1e-9f;
+  opdesc.SetType("norm");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("axis", static_cast<int>(axis));
+  opdesc.SetAttr("epsilon", static_cast<float>(epsilon));
+
+  // create and convert op to MLU model, then run it on MLU
+  auto op = CreateOp<operators::NormOp>(opdesc, &scope);
+  norm_ref(op);
+  out_ref->CopyDataFrom(*out);
+  Tensor input_x;
+  input_x.Resize(DDim(input_shape));
+  // change input layout from NCHW to NHWC
+  transpose(x->mutable_data<float>(),
+            input_x.mutable_data<float>(),
+            {static_cast<int>(input_shape[0]),
+             static_cast<int>(input_shape[1]),
+             static_cast<int>(input_shape[2]),
+             static_cast<int>(input_shape[3])},
+            {0, 2, 3, 1});
+  x->CopyDataFrom(input_x);
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  std::vector<int64_t> out_shape = input_shape;
+  Tensor output_trans;
+  output_trans.Resize(out_shape);
+  // Change output layout from NHWC to NCHW
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {static_cast<int>(out_shape[0]),
+             static_cast<int>(out_shape[2]),
+             static_cast<int>(out_shape[3]),
+             static_cast<int>(out_shape[1])},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(MLUBridges, norm) {
+  test_norm({1, 2, 3, 4}, 1);
+  test_norm({1, 2, 3, 4}, 2);
+  test_norm({1, 2, 3, 4}, 3);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(norm, kMLU);
diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h
index 703687df87..9bd2f3357f 100644
--- a/lite/kernels/mlu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h
@@ -42,4 +42,5 @@ USE_SUBGRAPH_BRIDGE(squeeze2, kMLU);
 #ifdef LITE_BUILD_EXTRA
 USE_SUBGRAPH_BRIDGE(gather, kMLU);
 USE_SUBGRAPH_BRIDGE(lrn, kMLU)
+USE_SUBGRAPH_BRIDGE(norm, kMLU)
 #endif
-- 
GitLab