Unverified commit 845e80d9 authored by mapingshuo, committed by GitHub

add Act grad (#3923)

add act grad ops
Parent f4c04186
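
For reference, the backward formulas implemented by the new Host kernels in this diff are: square_grad dX = 2 * X * dOut, relu_grad dX = dOut where X > 0 (else 0), and tanh_grad dX = dOut * (1 - Out^2). A minimal standalone sketch of the same element-wise loops follows; the free-function names and raw-pointer interface are illustrative only, not the Lite kernel API used in the diff below.

#include <cstddef>

// Sketch of the element-wise gradient loops added in this commit (illustrative only).
void square_grad(const float* x, const float* dout, float* dx, size_t n) {
  for (size_t i = 0; i < n; ++i) dx[i] = dout[i] * 2.0f * x[i];              // d(x^2)/dx = 2x
}
void relu_grad(const float* x, const float* dout, float* dx, size_t n) {
  for (size_t i = 0; i < n; ++i) dx[i] = x[i] > 0.0f ? dout[i] : 0.0f;       // pass-through where x > 0
}
void tanh_grad(const float* out, const float* dout, float* dx, size_t n) {
  for (size_t i = 0; i < n; ++i) dx[i] = dout[i] * (1.0f - out[i] * out[i]); // uses Out, not X
}
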
......@@ -763,24 +763,6 @@ void act_thresholded_relu<float>(
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
const float* dout_grad,
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0];
ptr_out_grad++;
ptr_in_grad++;
din++;
}
}
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -90,12 +90,6 @@ template <typename T>
void act_thresholded_relu(
const T* din, T* dout, int size, float threshold, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
const T* din, const T* dout_grad, T* din_grad, int size, int threads);
#endif
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -103,7 +103,6 @@ add_kernel(deformable_conv_compute_arm ARM extra SRCS deformable_conv_compute.cc
add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -18,6 +18,7 @@ add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.
add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps})
add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps})
add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps})
add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps})
if(LITE_BUILD_EXTRA)
lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host)
......
......@@ -12,41 +12,87 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/activation_grad_compute.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/kernels/host/activation_grad_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void SquareGradCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
CHECK(param.X);
auto out_grad_dims = param.Out_grad->dims();
auto out_grad_data = param.Out_grad->data<float>();
auto x_data = param.X->data<float>();
auto x_grad_data = param.X_grad->mutable_data<float>();
lite::arm::math::act_square_grad<float>(x_data,
out_grad_data,
x_grad_data,
out_grad_dims.production(),
ctx.threads());
for (int i = 0; i < out_grad_dims.production(); i++) {
x_grad_data[i] = out_grad_data[i] * 2.0 * x_data[i];
}
}
} // namespace arm
void ReluGradCompute::Run() {
auto& param = this->Param<param_t>();
CHECK(param.X);
auto out_grad_dims = param.Out_grad->dims();
auto out_grad_data = param.Out_grad->data<float>();
auto x_data = param.X->data<float>();
auto x_grad_data = param.X_grad->mutable_data<float>();
for (int i = 0; i < out_grad_dims.production(); i++) {
x_grad_data[i] = x_data[i] > 0 ? out_grad_data[i] : 0.0;
}
}
void TanhGradCompute::Run() {
auto& param = this->Param<param_t>();
CHECK(param.Out);
auto out_grad_dims = param.Out_grad->dims();
auto out_grad_data = param.Out_grad->data<float>();
auto out_data = param.Out->data<float>();
auto x_grad_data = param.X_grad->mutable_data<float>();
for (int i = 0; i < out_grad_dims.production(); i++) {
x_grad_data[i] = out_grad_data[i] *
(static_cast<float>(1.0) - out_data[i] * out_data[i]);
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(square_grad,
kARM,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::SquareGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(relu_grad,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::ReluGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(tanh_grad,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::arm::SquareGradCompute,
paddle::lite::kernels::host::TanhGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
......@@ -20,9 +20,9 @@
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
class SquareGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
class SquareGradCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationGradParam;
......@@ -31,7 +31,25 @@ class SquareGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~SquareGradCompute() = default;
};
} // namespace arm
class ReluGradCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationGradParam;
void Run() override;
virtual ~ReluGradCompute() = default;
};
class TanhGradCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationGradParam;
void Run() override;
virtual ~TanhGradCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
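
Each gradient kernel in the header above follows the same pattern: a KernelLite<TARGET(kHost), PRECISION(kFloat)> subclass that consumes operators::ActivationGradParam and overrides Run(). As a hedged sketch, a further activation gradient written in that pattern might look like the following; ExpGradCompute is a hypothetical name and is not part of this commit.

class ExpGradCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
 public:
  using param_t = operators::ActivationGradParam;

  // Would compute x_grad = out_grad * out, since d(exp(x))/dx = exp(x) = Out.
  void Run() override;

  virtual ~ExpGradCompute() = default;
};
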
......@@ -41,15 +41,11 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
if (opdesc.HasInput("X")) {
auto X_name = opdesc.Input("X").front();
param_.X = GetVar<lite::Tensor>(scope, X_name);
} else {
param_.X = param_.X_grad;
}
if (opdesc.HasInput("Out")) {
auto Out_name = opdesc.Input("Out").front();
param_.Out = GetVar<lite::Tensor>(scope, Out_name);
} else {
param_.Out = param_.Out_grad;
}
return true;
......@@ -60,3 +56,5 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc,
} // namespace paddle
REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp);
REGISTER_LITE_OP(relu_grad, paddle::lite::operators::ActivationGradOp);
REGISTER_LITE_OP(tanh_grad, paddle::lite::operators::ActivationGradOp);
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/activation_grad_compute.h"
#include "lite/kernels/host/activation_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/activation_compute.h"
......@@ -20,13 +20,11 @@
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::ActivationParam;
using grad_param_t = operators::ActivationGradParam;
using kernel_t = SquareCompute;
using grad_kernel_t = SquareGradCompute;
template <class kernel_t, class grad_kernel_t>
class ActivationGradTester {
public:
explicit ActivationGradTester(DDim dims) : dims_(dims) {}
......@@ -71,22 +69,28 @@ class ActivationGradTester {
void run_backward(grad_param_t* param,
grad_kernel_t* kernel,
const std::vector<float>& in_vec,
const std::vector<float>& out_vec,
const std::vector<float>& out_grad_vec,
float* in_grad_vec) {
Tensor x;
Tensor out;
Tensor x_grad;
Tensor out_grad;
x.Resize(dims_);
out.Resize(dims_);
x_grad.Resize(dims_);
out_grad.Resize(dims_);
auto* x_data = x.mutable_data<float>();
auto* out_data = out.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < dims_.production(); i++) {
x_data[i] = in_vec[i];
out_data[i] = out_vec[i];
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->Out = &out;
param->X_grad = &x_grad;
param->Out_grad = &out_grad;
kernel->SetParam(*param);
......@@ -102,7 +106,9 @@ class ActivationGradTester {
std::vector<float> x(dims_.production());
std::vector<float> out(dims_.production());
for (int i = 0; i < dims_.production(); i++) {
x[i] = 1.0 * static_cast<float>(i % 128) * 0.3f - 1.1;
x[i] = static_cast<float>(i % 3 - 2.0) / 2.0 * 0.333 +
static_cast<float>(i % 19 - 10.0) / 10.0 * 0.333 +
static_cast<float>(i % 39 - 20.0) / 20.0 * 0.333 + 0.001213;
}
this->run_forward(&param_, &kernel_, x, out.data());
......@@ -120,7 +126,8 @@ class ActivationGradTester {
for (int i = 0; i < dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data());
this->run_backward(
&grad_param_, &grad_kernel_, x, out, out_grad, x_grad.data());
for (int i = 0; i < dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta);
......@@ -137,31 +144,58 @@ class ActivationGradTester {
grad_param_t grad_param_;
};
void TestNormalCase(DDim dims) {
std::unique_ptr<ActivationGradTester> tester(new ActivationGradTester(dims));
void TestSquareGrad(DDim dims) {
LOG(INFO) << "Test Square grad";
std::unique_ptr<
ActivationGradTester<arm::SquareCompute, host::SquareGradCompute>>
tester(
new ActivationGradTester<arm::SquareCompute, host::SquareGradCompute>(
dims));
tester->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester->check_grad(delta, max_grad_delta);
}
TEST(activation_grad_arm, compute) {
LOG(INFO) << "Test Square grad";
void TestReluGrad(DDim dims) {
LOG(INFO) << "Test Relu grad";
std::unique_ptr<ActivationGradTester<arm::ReluCompute, host::ReluGradCompute>>
tester(new ActivationGradTester<arm::ReluCompute, host::ReluGradCompute>(
dims));
tester->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester->check_grad(delta, max_grad_delta);
}
void TestTanhGrad(DDim dims) {
LOG(INFO) << "Test Tanh grad";
std::unique_ptr<ActivationGradTester<arm::TanhCompute, host::TanhGradCompute>>
tester(new ActivationGradTester<arm::TanhCompute, host::TanhGradCompute>(
dims));
tester->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester->check_grad(delta, max_grad_delta);
}
TEST(activation_grad_host, compute) {
DeviceInfo::Init();
for (auto n : {2}) {
for (auto c : {2}) {
for (auto h : {2}) {
for (auto w : {2}) {
TestNormalCase(DDim(std::vector<int64_t>({n, c, h, w})));
for (auto n : {2, 1}) {
for (auto c : {2, 9}) {
for (auto h : {2, 1}) {
for (auto w : {2, 10}) {
TestSquareGrad(DDim(std::vector<int64_t>({n, c, h, w})));
TestReluGrad(DDim(std::vector<int64_t>({n, c, h, w})));
TestTanhGrad(DDim(std::vector<int64_t>({n, c, h, w})));
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(square_grad, kHost, kFloat, kNCHW, def);
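
The tester above validates each analytic kernel against a forward finite difference: x_grad[i] is compared to (out_delta[i] - out[i]) / delta with tolerance max_grad_delta. A minimal sketch of that check outside the Lite test harness is shown below; the std::function-based interface is an assumption for illustration, not the ActivationGradTester API.

#include <cmath>
#include <functional>
#include <vector>

// Forward-difference gradient check: numeric ~= (f(x + delta) - f(x)) / delta,
// compared element-wise against the analytic gradient (out_grad is taken as 1,
// as in the test above).
bool check_grad(const std::function<float(float)>& f,
                const std::function<float(float)>& analytic_grad,
                const std::vector<float>& xs,
                float delta = 0.001f,
                float max_grad_delta = 0.005f) {
  for (float x : xs) {
    const float numeric = (f(x + delta) - f(x)) / delta;
    if (std::fabs(analytic_grad(x) - numeric) > max_grad_delta) return false;
  }
  return true;
}

For square, for example, f would be v * v and analytic_grad would be 2 * v, with the same delta and max_grad_delta values used in TestSquareGrad.
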
......@@ -215,18 +215,6 @@ class ElementwiseAddGradTester {
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
......@@ -242,14 +230,6 @@ class ElementwiseAddGradTester {
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
......@@ -443,18 +423,6 @@ class ElementwiseSubGradTester {
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
......@@ -470,14 +438,6 @@ class ElementwiseSubGradTester {
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
......