From c4a5c960d1de68b992228ea448e070d70bb9c30d Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Fri, 25 Sep 2020 09:40:19 +0800
Subject: [PATCH] [X86] add new kernel of relu6 and reduce_mean, test=develop
 (#4431)

---
 lite/kernels/x86/activation_compute.cc        | 11 +++
 lite/kernels/x86/activation_compute.h         | 36 ++++++++++
 lite/kernels/x86/reduce_compute.cc            | 10 +++
 lite/kernels/x86/reduce_compute.h             | 70 +++++++++++++++----
 lite/operators/activation_ops.cc              |  3 +
 lite/operators/op_params.h                    |  2 +
 lite/tests/kernels/activation_compute_test.cc |  9 ++-
 .../tests/kernels/reduce_mean_compute_test.cc |  7 +-
 8 files changed, 132 insertions(+), 16 deletions(-)

diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc
index 9b4c2fadd9..aee6bd6bd3 100644
--- a/lite/kernels/x86/activation_compute.cc
+++ b/lite/kernels/x86/activation_compute.cc
@@ -88,3 +88,14 @@ REGISTER_LITE_KERNEL(sigmoid,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
     .Finalize();
+
+// float
+REGISTER_LITE_KERNEL(relu6,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::Relu6Compute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h
index 520adaf44f..b76e94398e 100644
--- a/lite/kernels/x86/activation_compute.h
+++ b/lite/kernels/x86/activation_compute.h
@@ -248,6 +248,42 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
   virtual ~SoftsignCompute() = default;
 };
 
+// relu6(x) = min(max(0, x), 6)
+template <typename T>
+struct Relu6Functor {
+  float threshold;
+  explicit Relu6Functor(float threshold_) : threshold(threshold_) {}
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
+  }
+};
+
+template <typename T>
+class Relu6Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<operators::ActivationParam>();
+
+    param.Out->template mutable_data<T>();
+    auto X = param.X;
+    auto Out = param.Out;
+    auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
+    CHECK(X);
+    CHECK(Out);
+    auto x = lite::fluid::EigenVector<T>::Flatten(*X);
+    auto out = lite::fluid::EigenVector<T>::Flatten(*Out);
+    Relu6Functor<T> functor(param.threshold);
+    functor(place, x, out);
+  }
+
+  virtual ~Relu6Compute() = default;
+};
+
 }  // namespace x86
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/x86/reduce_compute.cc b/lite/kernels/x86/reduce_compute.cc
index f95f4cfb88..edeac0a84e 100644
--- a/lite/kernels/x86/reduce_compute.cc
+++ b/lite/kernels/x86/reduce_compute.cc
@@ -23,3 +23,13 @@ REGISTER_LITE_KERNEL(reduce_sum,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(reduce_mean,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::ReduceMeanCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
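[Editor's note, not part of the patch] The new Relu6Compute kernel registered above clamps every element of X into [0, threshold], with threshold defaulting to 6 (see the functor comment "relu6(x) = min(max(0, x), 6)"). A minimal standalone sketch of the same elementwise math, using only the standard library; Relu6Ref is an illustrative name, not a PaddleLite symbol:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Reference relu6: clamp each element to the range [0, threshold].
    std::vector<float> Relu6Ref(const std::vector<float>& x, float threshold = 6.f) {
      std::vector<float> y(x.size());
      for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = std::min(std::max(x[i], 0.f), threshold);
      }
      return y;
    }

    // Example: Relu6Ref({-1.f, 0.5f, 7.f}) yields {0.f, 0.5f, 6.f}.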
diff --git a/lite/kernels/x86/reduce_compute.h b/lite/kernels/x86/reduce_compute.h
index 1b7c99eeef..fb02348759 100644
--- a/lite/kernels/x86/reduce_compute.h
+++ b/lite/kernels/x86/reduce_compute.h
@@ -31,11 +31,18 @@ struct SumFunctor {
   }
 };
 
-#define HANDLE_DIM(NDIM, RDIM)                                            \
-  if (ndim == NDIM && rdim == RDIM) {                                     \
-    paddle::lite::kernels::x86::                                          \
-        ReduceFunctor<lite::TargetType::kX86, T, NDIM, RDIM, SumFunctor>( \
-            *input, output, dims, keep_dim);                              \
+struct MeanFunctor {
+  template <typename X, typename Y, typename Dim>
+  void operator()(X* x, Y* y, const Dim& dim) {
+    y->device(lite::fluid::EigenDeviceType<TARGET(kX86)>()) = x->mean(dim);
+  }
+};
+
+#define HANDLE_DIM(NDIM, RDIM, FUNCTOR)                                \
+  if (ndim == NDIM && rdim == RDIM) {                                  \
+    paddle::lite::kernels::x86::                                       \
+        ReduceFunctor<lite::TargetType::kX86, T, NDIM, RDIM, FUNCTOR>( \
+            *input, output, dims, keep_dim);                           \
   }
 
 template <typename T>
@@ -64,19 +71,58 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     } else {
       int ndim = input->dims().size();
       int rdim = dims.size();
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
+      HANDLE_DIM(4, 3, SumFunctor);
+      HANDLE_DIM(4, 2, SumFunctor);
+      HANDLE_DIM(4, 1, SumFunctor);
+      HANDLE_DIM(3, 2, SumFunctor);
+      HANDLE_DIM(3, 1, SumFunctor);
+      HANDLE_DIM(2, 1, SumFunctor);
+      HANDLE_DIM(1, 1, SumFunctor);
     }
   }
 
   virtual ~ReduceSumCompute() = default;
 };
 
+template <typename T>
+class ReduceMeanCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ReduceParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<operators::ReduceParam>();
+    // auto& context = ctx_->As<X86Context>();
+    auto* input = param.x;
+    auto* output = param.output;
+    param.output->template mutable_data<T>();
+
+    const auto& dims = param.dim;
+    bool keep_dim = param.keep_dim;
+
+    if (dims.size() == 0) {
+      // Flatten and reduce 1-D tensor
+      auto x = lite::fluid::EigenVector<T>::Flatten(*input);
+      auto out = lite::fluid::EigenScalar<T>::From(output);
+      // auto& place = *platform::CPUDeviceContext().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      MeanFunctor functor;
+      functor(&x, &out, reduce_dim);
+    } else {
+      int ndim = input->dims().size();
+      int rdim = dims.size();
+      HANDLE_DIM(4, 3, MeanFunctor);
+      HANDLE_DIM(4, 2, MeanFunctor);
+      HANDLE_DIM(4, 1, MeanFunctor);
+      HANDLE_DIM(3, 2, MeanFunctor);
+      HANDLE_DIM(3, 1, MeanFunctor);
+      HANDLE_DIM(2, 1, MeanFunctor);
+      HANDLE_DIM(1, 1, MeanFunctor);
+    }
+  }
+
+  virtual ~ReduceMeanCompute() = default;
+};
+
 }  // namespace x86
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc
index 9b20f4348b..a25297f012 100644
--- a/lite/operators/activation_ops.cc
+++ b/lite/operators/activation_ops.cc
@@ -89,6 +89,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
   } else if (opdesc.Type() == "elu") {
     param_.active_type = lite_api::ActivationType::kElu;
     param_.Elu_alpha = opdesc.GetAttr<float>("alpha");
+  } else if (opdesc.Type() == "relu6") {
+    param_.active_type = lite_api::ActivationType::kRelu6;
+    param_.threshold = opdesc.GetAttr<float>("threshold");
   }
 
   VLOG(4) << "opdesc.Type():" << opdesc.Type();
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 33da913d2e..85d7854970 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -403,6 +403,8 @@ struct ActivationParam : ParamBase {
   float relu_threshold{1.0f};
   // elu
   float Elu_alpha{1.0f};
+  // relu6
+  float threshold{6.0f};
 
   ///////////////////////////////////////////////////////////////////////////////////
   //         get a vector of input tensors
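[Editor's note, not part of the patch] ReduceMeanCompute above follows the existing reduce_sum path: with an empty dim list it flattens the input and averages every element; otherwise HANDLE_DIM dispatches on the input rank and the number of reduced dims, and MeanFunctor divides the summed values by the product of the reduced extents. A minimal standalone sketch of the axis-mean numerics (plain C++, illustrative only; keep_dim changes only the reported shape, {2} vs {2, 1}, not the values):

    #include <cstdio>

    int main() {
      // 2x3 input, reduced over dim 1 (the three values in each row).
      const float x[2][3] = {{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}};
      float mean[2] = {0.f, 0.f};
      for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 3; ++j) mean[i] += x[i][j];
        mean[i] /= 3.f;  // divide by the extent of the reduced dim
      }
      std::printf("%.1f %.1f\n", mean[0], mean[1]);  // prints "2.0 5.0"
      return 0;
    }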
diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc
index fb88f6b553..6799da30da 100644
--- a/lite/tests/kernels/activation_compute_test.cc
+++ b/lite/tests/kernels/activation_compute_test.cc
@@ -58,6 +58,7 @@ class ActivationComputeTester : public arena::TestCase {
   float hard_swish_offset = 3.0;
   float relu_threshold_ = 1.0;
   float elu_alpha_ = 1.0;
+  float threshold_ = 6.0;
   DDim dims_{{1}};
   std::string type_ = "";
   activation_type_test act_type_ = RELU;
@@ -170,7 +171,8 @@ class ActivationComputeTester : public arena::TestCase {
       case RELU6: {
         for (int i = 0; i < dims_.production(); i++) {
           output_data[i] = x_data[i] > 0.f ? x_data[i] : 0.f;
-          output_data[i] = output_data[i] < 6.0 ? output_data[i] : 6.0;
+          output_data[i] =
+              output_data[i] < threshold_ ? output_data[i] : threshold_;
         }
         break;
       }
@@ -273,6 +275,9 @@
     if (act_type_ == ELU) {
       op_desc->SetAttr("alpha", elu_alpha_);
     }
+    if (act_type_ == RELU6) {
+      op_desc->SetAttr("threshold", threshold_);
+    }
   }
 
   void PrepareData() override {
@@ -510,6 +515,8 @@ TEST(Activation_relu6, precision) {
 #elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
   place = TARGET(kHuaweiAscendNPU);
   abs_error = 1e-2;  // precision_mode default is force_fp16
+#elif defined(LITE_WITH_X86)
+  place = TARGET(kX86);
 #else
   return;
 #endif
diff --git a/lite/tests/kernels/reduce_mean_compute_test.cc b/lite/tests/kernels/reduce_mean_compute_test.cc
index 0d41d25179..d679d027a6 100644
--- a/lite/tests/kernels/reduce_mean_compute_test.cc
+++ b/lite/tests/kernels/reduce_mean_compute_test.cc
@@ -333,9 +333,10 @@ void test_reduce_mean(Place place) {
 }
 
 TEST(ReduceMean, precision) {
-// #ifdef LITE_WITH_X86
-//   Place place(TARGET(kX86));
-// #endif
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+  test_reduce_mean(place);
+#endif
 #ifdef LITE_WITH_ARM
   Place place(TARGET(kARM));
   test_reduce_mean(place);
-- 
GitLab
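[Editor's note, not part of the patch] One behavior worth keeping in mind when reading the reduce kernels: the HANDLE_DIM(NDIM, RDIM, FUNCTOR) chain only covers the pairs (4, 3), (4, 2), (4, 1), (3, 2), (3, 1), (2, 1), and (1, 1), so an input rank above 4, or an unlisted rank/reduced-dim combination, appears to fall through without computing the output. As a worked example of the dispatch, a {2, 3, 4} input reduced over dims {1, 2} gives ndim == 3 and rdim == 2, so HANDLE_DIM(3, 2, MeanFunctor) fires and instantiates ReduceFunctor<lite::TargetType::kX86, T, 3, 2, MeanFunctor>; the exact template arguments here are reconstructed from the macro and may differ slightly in the tree.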