Unverified commit a6f9e0c7 authored by: M mapingshuo, committed by: GitHub

add square op for arm kernel (#3169)

* add backend

* add kernel

* add grad kernel
Parent 80fb550c
......@@ -700,6 +700,35 @@ void act_rsqrt<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_square<float>(const float* din, float* dout, int size, int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = ptr_in[0] * ptr_in[0];
ptr_in++;
ptr_out++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
const float* dout_grad,
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0];
ptr_out_grad++;
ptr_in_grad++;
din++;
}
}
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
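For reference, the backward loop above applies the chain rule for y = x * x: din_grad[i] = dout_grad[i] * 2 * din[i]. The following is a minimal standalone sketch (an illustration only, not part of this commit and independent of the Lite headers) that mirrors the two element-wise loops and compares the analytic gradient with a forward finite difference:

#include <cstdio>
#include <vector>

// y[i] = x[i] * x[i], mirroring act_square<float> above.
void square(const float* din, float* dout, int size) {
  for (int i = 0; i < size; ++i) dout[i] = din[i] * din[i];
}

// dL/dx[i] = dL/dy[i] * 2 * x[i], mirroring act_square_grad above.
void square_grad(const float* din, const float* dout_grad, float* din_grad, int size) {
  for (int i = 0; i < size; ++i) din_grad[i] = dout_grad[i] * 2.0f * din[i];
}

int main() {
  const int size = 16;
  const float delta = 1e-3f;
  std::vector<float> x(size), y(size), y_delta(size), dy(size, 1.0f), dx(size);
  // Input pattern similar to the one used by the unit test later in this commit.
  for (int i = 0; i < size; ++i) x[i] = 0.3f * i - 1.1f;

  square(x.data(), y.data(), size);
  square_grad(x.data(), dy.data(), dx.data(), size);

  // Perturb the input and recompute the forward pass for a finite-difference estimate.
  std::vector<float> x_shift(x);
  for (int i = 0; i < size; ++i) x_shift[i] += delta;
  square(x_shift.data(), y_delta.data(), size);

  for (int i = 0; i < size; ++i) {
    float fd = (y_delta[i] - y[i]) / delta;  // forward finite difference
    std::printf("x=%6.2f analytic=%8.4f finite-diff=%8.4f\n", x[i], dx[i], fd);
  }
  return 0;
}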
......@@ -69,6 +69,15 @@ void act_hard_sigmoid(const T* din,
template <typename T>
void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
const T* din, const T* dout_grad, T* din_grad, int size, int threads);
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -108,6 +108,7 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math
add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif()
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
......
......@@ -169,6 +169,16 @@ void RsqrtCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads());
}
void SquareCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_square<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm
} // namespace kernels
} // namespace lite
......@@ -260,3 +270,8 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
square, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SquareCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -139,6 +139,15 @@ class RsqrtCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~RsqrtCompute() = default;
};
class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~SquareCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/activation_grad_compute.h"
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void SquareGradCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto out_grad_dims = param.Out_grad->dims();
auto out_grad_data = param.Out_grad->data<float>();
auto x_data = param.X->data<float>();
auto x_grad_data = param.X_grad->mutable_data<float>();
lite::arm::math::act_square_grad<float>(x_data,
out_grad_data,
x_grad_data,
out_grad_dims.production(),
ctx.threads());
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(square_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::SquareGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
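The bindings above follow the framework's gradient-variable naming convention: "Out@GRAD" carries the incoming gradient and "X@GRAD" receives the computed input gradient. These are the same names the operator's AttachImpl resolves in activation_grad_ops.cc below.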
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class SquareGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationGradParam;
void Run() override;
virtual ~SquareGradCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -143,6 +143,7 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS})
add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS})
if (LITE_WITH_TRAIN)
add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS})
endif()
if (NOT LITE_WITH_X86)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/activation_grad_ops.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool ActivationGradOp::CheckShape() const {
CHECK_OR_FALSE(param_.X_grad);
CHECK_OR_FALSE(param_.Out_grad);
return true;
}
bool ActivationGradOp::InferShape() const {
param_.X_grad->Resize(param_.Out_grad->dims());
return true;
}
bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
lite::Scope* scope) {
auto Out_grad_name = opdesc.Input("Out@GRAD").front();
auto X_grad_name = opdesc.Output("X@GRAD").front();
param_.Out_grad = GetVar<lite::Tensor>(scope, Out_grad_name);
param_.X_grad = GetMutableVar<Tensor>(scope, X_grad_name);
if (opdesc.HasInput("X")) {
auto X_name = opdesc.Input("X").front();
param_.X = GetVar<lite::Tensor>(scope, X_name);
} else {
param_.X = param_.X_grad;
}
if (opdesc.HasInput("Out")) {
auto Out_name = opdesc.Input("Out").front();
param_.Out = GetVar<lite::Tensor>(scope, Out_name);
} else {
param_.Out = param_.Out_grad;
}
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp);
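Note that AttachImpl falls back to aliasing "X" to "X@GRAD" and "Out" to "Out@GRAD" when the op desc omits them, so ActivationGradParam is always fully populated; for square_grad only "X" and "Out@GRAD" are actually consumed by the ARM kernel.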
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class ActivationGradOp : public OpLite {
public:
explicit ActivationGradOp(const std::string& type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "activation_grad_op"; }
private:
mutable operators::ActivationGradParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -78,46 +78,6 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
return true;
}
// #ifdef LITE_WITH_TRAIN
// bool ActivationGradOp::CheckShape() const {
// CHECK_OR_FALSE(param_.X_grad);
// CHECK_OR_FALSE(param_.Out_grad);
// return true;
// }
// bool ActivationGradOp::InferShape() const {
// param_.X_grad->Resize(param_.Out_grad->dims());
// return true;
// }
// bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
// lite::Scope* scope) {
// auto Out_grad_name = opdesc.Input(framework::GradVarName("Out")).front();
// auto X_grad_name = opdesc.Output(framework::GradVarName("X")).front();
// param_.Out_grad = GetVar<lite::Tensor>(scope, Out_grad_name);
// param_.X_grad = GetMutableVar<Tensor>(scope, X_grad_name);
// if (opdesc.HasInput("X")) {
// auto X_name = opdesc.Input("X").front();
// param_.X = GetVar<lite::Tensor>(scope, X_name);
// } else {
// param_.X = param_.X_grad;
// }
// if (opdesc.HasInput("Out")) {
// auto Out_name = opdesc.Input("Out").front();
// param_.Out = GetVar<lite::Tensor>(scope, Out_name);
// } else {
// param_.Out = param_.Out_grad;
// }
// return true;
// }
// #endif
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -138,7 +98,3 @@ REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
// #ifdef LITE_WITH_TRAIN
// REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp);
// #endif
......@@ -38,27 +38,6 @@ class ActivationOp : public OpLite {
mutable operators::ActivationParam param_;
};
// #ifdef LITE_WITH_TRAIN
// class ActivationGradOp : public OpLite {
// public:
// explicit ActivationGradOp(const std::string& type) : OpLite(type) {}
// bool CheckShape() const override;
// bool InferShape() const override;
// bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
// void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_);
// }
// std::string DebugString() const override { return "activation_grad_op"; }
// private:
// mutable operators::ActivationGradParam param_;
// };
// #endif
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -60,8 +60,12 @@ if(LITE_BUILD_EXTRA)
lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
# for training kernel
lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if (LITE_WITH_TRAIN)
lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
endif()
lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -35,7 +35,8 @@ enum activation_type_test {
EXP,
FLOOR,
RSQRT,
GELU
GELU,
SQUARE
};
class ActivationComputeTester : public arena::TestCase {
......@@ -192,6 +193,12 @@ class ActivationComputeTester : public arena::TestCase {
}
break;
}
case SQUARE: {
for (int i = 0; i < dims_.production(); i++) {
output_data[i] = x_data[i] * x_data[i];
}
break;
}
default:
LOG(INFO) << "the type of activation is unknow.";
}
......@@ -632,6 +639,33 @@ TEST(Activation_rsqrt, precision) {
#endif
}
TEST(Activation_square, precision) {
LOG(INFO) << "test square op";
#ifdef LITE_WITH_ARM
Place place(TARGET(kARM));
for (auto n : {2}) {
for (auto c : {2}) {
for (auto h : {2}) {
for (auto w : {2}) {
std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
place,
"def",
0.01,
6.,
"all",
0.,
DDim(std::vector<int64_t>({n, c, h, w})),
"square",
SQUARE));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
}
}
}
}
#endif
}
TEST(Activation_gelu, precision) {
LOG(INFO) << "test gelu op";
Place place;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/activation_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/activation_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::ActivationParam;
using grad_param_t = operators::ActivationGradParam;
using kernel_t = SquareCompute;
using grad_kernel_t = SquareGradCompute;
class ActivationGradTester {
public:
explicit ActivationGradTester(DDim dims) : dims_(dims) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx2(new KernelContext);
ctx2->As<ARMContext>();
delta_kernel_.SetContext(std::move(ctx2));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_t* kernel,
const std::vector<float>& in_vec,
float* out_vec) {
Tensor x;
Tensor output;
x.Resize(dims_);
output.Resize(dims_);
auto* x_data = x.mutable_data<float>();
for (int i = 0; i < dims_.production(); i++) {
x_data[i] = in_vec[i];
}
param->X = &x;
param->Out = &output;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_t* kernel,
const std::vector<float>& in_vec,
const std::vector<float>& out_grad_vec,
float* in_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor out_grad;
x.Resize(dims_);
x_grad.Resize(dims_);
out_grad.Resize(dims_);
auto* x_data = x.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < dims_.production(); i++) {
x_data[i] = in_vec[i];
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->X_grad = &x_grad;
param->Out_grad = &out_grad;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
for (int i = 0; i < dims_.production(); i++) {
in_grad_vec[i] = x_grad_data[i];
}
}
void check_grad(float delta, float max_grad_delta) {
std::vector<float> x(dims_.production());
std::vector<float> out(dims_.production());
for (int i = 0; i < dims_.production(); i++) {
x[i] = 1.0 * static_cast<float>(i % 128) * 0.3f - 1.1;
}
this->run_forward(&param_, &kernel_, x, out.data());
std::vector<float> x_delta(dims_.production());
std::vector<float> out_delta(dims_.production());
for (int i = 0; i < dims_.production(); i++) {
x_delta[i] = x[i] + delta;
}
this->run_forward(&delta_param_, &delta_kernel_, x_delta, out_delta.data());
std::vector<float> out_grad(dims_.production());
std::vector<float> x_grad(dims_.production());
for (int i = 0; i < dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data());
for (int i = 0; i < dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta);
}
}
private:
DDim dims_;
kernel_t kernel_;
kernel_t delta_kernel_;
grad_kernel_t grad_kernel_;
param_t param_;
param_t delta_param_;
grad_param_t grad_param_;
};
void TestNormalCase(DDim dims) {
std::unique_ptr<ActivationGradTester> tester(new ActivationGradTester(dims));
tester->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester->check_grad(delta, max_grad_delta);
}
TEST(activation_grad_arm, compute) {
LOG(INFO) << "Test Square grad";
DeviceInfo::Init();
for (auto n : {2}) {
for (auto c : {2}) {
for (auto h : {2}) {
for (auto w : {2}) {
TestNormalCase(DDim(std::vector<int64_t>({n, c, h, w})));
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def);
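As a rough sanity check on the finite-difference comparison in check_grad: for the element x = 1.9 (i = 10 in the pattern 0.3 * i - 1.1) with delta = 0.001, the forward difference is (1.901^2 - 1.9^2) / 0.001 = 3.801, while the analytic gradient 2 * x is 3.8; the gap of about 0.001 is well within max_grad_delta = 0.005.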