From d1e89ead550a5c847abb1614b60ce686a3a532c6 Mon Sep 17 00:00:00 2001
From: wuhuanzhou <mr.avin0323@gmail.com>
Date: Wed, 2 Jun 2021 10:48:29 +0800
Subject: [PATCH] optimize OP's compilation time implemented by Eigen,
 test=develop (#33218)

---
 paddle/fluid/operators/eigen/eigen_function.h | 20 +++++++++++
 paddle/fluid/operators/eigen/loss.cc          | 33 +++++++++++++++++++
 paddle/fluid/operators/eigen/loss.cu          | 33 +++++++++++++++++++
 paddle/fluid/operators/log_loss_op.cc         |  5 +++
 paddle/fluid/operators/log_loss_op.cu         | 21 ------------
 paddle/fluid/operators/log_loss_op.h          | 11 +++----
 paddle/fluid/operators/top_k_function_cuda.h  | 13 +++++---
 7 files changed, 105 insertions(+), 31 deletions(-)
 delete mode 100644 paddle/fluid/operators/log_loss_op.cu
diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h
index 8cbc7cd6acd..9a3be7ca439 100644
--- a/paddle/fluid/operators/eigen/eigen_function.h
+++ b/paddle/fluid/operators/eigen/eigen_function.h
@@ -196,6 +196,26 @@ struct EigenRankLossGrad {
                         const InType& left, const InType& right);
 };
 
+template <typename EigenDevice, typename T>
+struct EigenLogLoss {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType out, const InType& pred,
+                   const InType& label, const T& epsilon);
+};
+
+template <typename EigenDevice, typename T>
+struct EigenLogLossGrad {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,
+                   const InType& pred, const InType& label, const T& epsilon);
+};
+
 template <typename EigenDevice, typename T>
 struct EigenHingeLoss {
   using InType = Eigen::TensorMap<
diff --git a/paddle/fluid/operators/eigen/loss.cc b/paddle/fluid/operators/eigen/loss.cc
index 22a3647bc31..469456537d9 100644
--- a/paddle/fluid/operators/eigen/loss.cc
+++ b/paddle/fluid/operators/eigen/loss.cc
@@ -53,6 +53,39 @@ struct EigenRankLossGrad<Eigen::DefaultDevice, T> {
 template struct EigenRankLoss<Eigen::DefaultDevice, float>;
 template struct EigenRankLossGrad<Eigen::DefaultDevice, float>;
 
+template <typename T>
+struct EigenLogLoss<Eigen::DefaultDevice, T> {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev, OutType out,
+                   const InType& pred, const InType& label, const T& epsilon) {
+    out.device(dev) = (-(label * (pred + epsilon).log()) -
+                       ((static_cast<T>(1) - label) *
+                        (static_cast<T>(1) - pred + epsilon).log()));
+  }
+};
+
+template <typename T>
+struct EigenLogLossGrad<Eigen::DefaultDevice, T> {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev, OutType dpred,
+                   const InType& dloss, const InType& pred, const InType& label,
+                   const T& epsilon) {
+    dpred.device(dev) =
+        dloss *
+        (-(label / (pred + epsilon)) +
+         ((static_cast<T>(1) - label) / (static_cast<T>(1) - pred + epsilon)));
+  }
+};
+
+template struct EigenLogLoss<Eigen::DefaultDevice, float>;
+template struct EigenLogLossGrad<Eigen::DefaultDevice, float>;
+
 template <typename T>
 struct EigenHingeLoss<Eigen::DefaultDevice, T> {
   using InType = Eigen::TensorMap<
diff --git a/paddle/fluid/operators/eigen/loss.cu b/paddle/fluid/operators/eigen/loss.cu
index fac7e3370bc..02341202a2b 100644
--- a/paddle/fluid/operators/eigen/loss.cu
+++ b/paddle/fluid/operators/eigen/loss.cu
@@ -53,6 +53,39 @@ struct EigenRankLossGrad<Eigen::GpuDevice, T> {
 template struct EigenRankLoss<Eigen::GpuDevice, float>;
 template struct EigenRankLossGrad<Eigen::GpuDevice, float>;
 
+template <typename T>
+struct EigenLogLoss<Eigen::GpuDevice, T> {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred,
+                   const InType& label, const T& epsilon) {
+    out.device(dev) = (-(label * (pred + epsilon).log()) -
+                       ((static_cast<T>(1) - label) *
+                        (static_cast<T>(1) - pred + epsilon).log()));
+  }
+};
+
+template <typename T>
+struct EigenLogLossGrad<Eigen::GpuDevice, T> {
+  using InType = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::GpuDevice& dev, OutType dpred,
+                   const InType& dloss, const InType& pred, const InType& label,
+                   const T& epsilon) {
+    dpred.device(dev) =
+        dloss *
+        (-(label / (pred + epsilon)) +
+         ((static_cast<T>(1) - label) / (static_cast<T>(1) - pred + epsilon)));
+  }
+};
+
+template struct EigenLogLoss<Eigen::GpuDevice, float>;
+template struct EigenLogLossGrad<Eigen::GpuDevice, float>;
+
 template <typename T>
 struct EigenHingeLoss<Eigen::GpuDevice, T> {
   using InType = Eigen::TensorMap<
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index 1569512dc74..c41805d41ce 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -154,3 +154,8 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     log_loss_grad,
     ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu
deleted file mode 100644
index 280913c43a2..00000000000
--- a/paddle/fluid/operators/log_loss_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/log_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h
index e62de17a986..e7985ab810b 100644
--- a/paddle/fluid/operators/log_loss_op.h
+++ b/paddle/fluid/operators/log_loss_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace paddle {
 namespace operators {
@@ -40,9 +41,8 @@ class LogLossKernel : public framework::OpKernel<T> {
     auto loss = EigenVector<T>::Flatten(*loss_out);
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
-    loss.device(place) = (-(label * (prediction + epsilon).log()) -
-                          ((static_cast<T>(1) - label) *
-                           (static_cast<T>(1) - prediction + epsilon).log()));
+    EigenLogLoss<std::decay_t<decltype(place)>, T>::Eval(
+        place, loss, prediction, label, epsilon);
   }
 };
 
@@ -64,9 +64,8 @@ class LogLossGradKernel : public framework::OpKernel<T> {
     if (dpred) {
       dpred->mutable_data<T>(ctx.GetPlace());
       auto dx = framework::EigenVector<T>::Flatten(*dpred);
-      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
-                               ((static_cast<T>(1) - label) /
-                                (static_cast<T>(1) - prediction + epsilon)));
+      EigenLogLossGrad<std::decay_t<decltype(place)>, T>::Eval(
+          place, dx, dl, prediction, label, epsilon);
     }
   }
 };
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index a7d7ea260ec..07749f90eba 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
 #endif
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
@@ -563,15 +564,19 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
     const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
     auto e_indices =
         framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
+    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
+        static_cast<const Tensor>(temp_indices));
 
     std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
     auto dim = framework::make_ddim(odims);
     auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
+    auto e_tmp_values =
+        framework::EigenMatrix<T>::From(static_cast<const Tensor>(temp_values));
 
-    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
-    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
+    EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
+        dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
+    EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
+        dev, e_values, e_tmp_values, slice_indices, slice_sizes);
   }
   return true;
 }
-- 
GitLab