Optimize the compilation time of OPs implemented with Eigen, test=develop (#33218)

d1e89ead · wuhuanzhou · GitHub · e7541209 · d1e89ead · d1e89ead
7 changed files
--- a/paddle/fluid/operators/eigen/eigen_function.h
+++ b/paddle/fluid/operators/eigen/eigen_function.h
@@ -196,6 +196,26 @@ struct EigenRankLossGrad {
                        const InType& left, const InType& right);
 };

+template <typename EigenDevice, typename T>  // Primary template: Eval is only declared here.
+struct EigenLogLoss {  // Forward log-loss functor, parameterized on Eigen device and element type.
+  using InType = Eigen::TensorMap<  // Read-only (const T) rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType out, const InType& pred,  // Defined by the per-device specializations in loss.cc / loss.cu.
+                   const InType& label, const T& epsilon);
+};
+
+template <typename EigenDevice, typename T>  // Primary template: Eval is only declared here.
+struct EigenLogLossGrad {  // Backward log-loss functor, parameterized on Eigen device and element type.
+  using InType = Eigen::TensorMap<  // Read-only (const T) rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss,  // Defined by the per-device specializations in loss.cc / loss.cu.
+                   const InType& pred, const InType& label, const T& epsilon);
+};
+
 template <typename EigenDevice, typename T>
 struct EigenHingeLoss {
  using InType = Eigen::TensorMap<

--- a/paddle/fluid/operators/eigen/loss.cc
+++ b/paddle/fluid/operators/eigen/loss.cc
@@ -53,6 +53,39 @@ struct EigenRankLossGrad<Eigen::DefaultDevice, T> {
 template struct EigenRankLoss<Eigen::DefaultDevice, float>;
 template struct EigenRankLossGrad<Eigen::DefaultDevice, float>;

+template <typename T>  // Partial specialization of EigenLogLoss for Eigen::DefaultDevice.
+struct EigenLogLoss<Eigen::DefaultDevice, T> {
+  using InType = Eigen::TensorMap<  // Read-only rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev, OutType out,  // Writes the loss expression into `out`, evaluated on `dev`.
+                   const InType& pred, const InType& label, const T& epsilon) {
+    out.device(dev) = (-(label * (pred + epsilon).log()) -  // -label * log(pred + epsilon)
+                       ((static_cast<T>(1) - label) *  // - (1 - label)
+                        (static_cast<T>(1) - pred + epsilon).log()));  //   * log(1 - pred + epsilon)
+  }
+};
+
+template <typename T>  // Partial specialization of EigenLogLossGrad for Eigen::DefaultDevice.
+struct EigenLogLossGrad<Eigen::DefaultDevice, T> {
+  using InType = Eigen::TensorMap<  // Read-only rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::DefaultDevice& dev, OutType dpred,  // Writes the gradient expression into `dpred`, evaluated on `dev`.
+                   const InType& dloss, const InType& pred, const InType& label,
+                   const T& epsilon) {
+    dpred.device(dev) =
+        dloss *  // Incoming gradient scales the bracketed term below.
+        (-(label / (pred + epsilon)) +  // -label / (pred + epsilon)
+         ((static_cast<T>(1) - label) / (static_cast<T>(1) - pred + epsilon)));  // + (1 - label) / (1 - pred + epsilon)
+  }
+};
+
+template struct EigenLogLoss<Eigen::DefaultDevice, float>;  // Explicit instantiation: only T = float is instantiated here for DefaultDevice.
+template struct EigenLogLossGrad<Eigen::DefaultDevice, float>;  // Same for the gradient functor.
+
 template <typename T>
 struct EigenHingeLoss<Eigen::DefaultDevice, T> {
  using InType = Eigen::TensorMap<

--- a/paddle/fluid/operators/eigen/loss.cu
+++ b/paddle/fluid/operators/eigen/loss.cu
@@ -53,6 +53,39 @@ struct EigenRankLossGrad<Eigen::GpuDevice, T> {
 template struct EigenRankLoss<Eigen::GpuDevice, float>;
 template struct EigenRankLossGrad<Eigen::GpuDevice, float>;

+template <typename T>  // Partial specialization of EigenLogLoss for Eigen::GpuDevice.
+struct EigenLogLoss<Eigen::GpuDevice, T> {
+  using InType = Eigen::TensorMap<  // Read-only rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred,  // Writes the loss expression into `out`, evaluated on `dev`.
+                   const InType& label, const T& epsilon) {
+    out.device(dev) = (-(label * (pred + epsilon).log()) -  // -label * log(pred + epsilon)
+                       ((static_cast<T>(1) - label) *  // - (1 - label)
+                        (static_cast<T>(1) - pred + epsilon).log()));  //   * log(1 - pred + epsilon)
+  }
+};
+
+template <typename T>  // Partial specialization of EigenLogLossGrad for Eigen::GpuDevice.
+struct EigenLogLossGrad<Eigen::GpuDevice, T> {
+  using InType = Eigen::TensorMap<  // Read-only rank-1, row-major view.
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  using OutType =  // Writable rank-1, row-major view.
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
+  static void Eval(const Eigen::GpuDevice& dev, OutType dpred,  // Writes the gradient expression into `dpred`, evaluated on `dev`.
+                   const InType& dloss, const InType& pred, const InType& label,
+                   const T& epsilon) {
+    dpred.device(dev) =
+        dloss *  // Incoming gradient scales the bracketed term below.
+        (-(label / (pred + epsilon)) +  // -label / (pred + epsilon)
+         ((static_cast<T>(1) - label) / (static_cast<T>(1) - pred + epsilon)));  // + (1 - label) / (1 - pred + epsilon)
+  }
+};
+
+template struct EigenLogLoss<Eigen::GpuDevice, float>;  // Explicit instantiation: only T = float is instantiated here for GpuDevice.
+template struct EigenLogLossGrad<Eigen::GpuDevice, float>;  // Same for the gradient functor.
+
 template <typename T>
 struct EigenHingeLoss<Eigen::GpuDevice, T> {
  using InType = Eigen::TensorMap<

--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -154,3 +154,8 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
    log_loss_grad,
    ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(  // CUDA registration for log_loss (float only), now placed in this .cc file.
+    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);  // NOTE(review): confirm this still compiles in CPU-only builds.
+REGISTER_OP_CUDA_KERNEL(  // CUDA registration for log_loss_grad (float only).
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/log_loss_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    log_loss_grad,
-    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/log_loss_op.h
+++ b/paddle/fluid/operators/log_loss_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"

 namespace paddle {
 namespace operators {
@@ -40,9 +41,8 @@ class LogLossKernel : public framework::OpKernel<T> {
    auto loss = EigenVector<T>::Flatten(*loss_out);
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

-    loss.device(place) = (-(label * (prediction + epsilon).log()) -
-                          ((static_cast<T>(1) - label) *
-                           (static_cast<T>(1) - prediction + epsilon).log()));
+    EigenLogLoss<std::decay_t<decltype(place)>, T>::Eval(
+        place, loss, prediction, label, epsilon);
  }
 };

@@ -64,9 +64,8 @@ class LogLossGradKernel : public framework::OpKernel<T> {
    if (dpred) {
      dpred->mutable_data<T>(ctx.GetPlace());
      auto dx = framework::EigenVector<T>::Flatten(*dpred);
-      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
-                               ((static_cast<T>(1) - label) /
-                                (static_cast<T>(1) - prediction + epsilon)));
+      EigenLogLossGrad<std::decay_t<decltype(place)>, T>::Eval(
+          place, dx, dl, prediction, label, epsilon);
    }
  }
 };

--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
 #endif
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
@@ -563,15 +564,19 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
    auto e_indices =
        framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
-    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(temp_indices);
+    auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
+        static_cast<const Tensor>(temp_indices));

    std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
    auto dim = framework::make_ddim(odims);
    auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
-    auto e_tmp_values = framework::EigenMatrix<T>::From(temp_values);
+    auto e_tmp_values =
+        framework::EigenMatrix<T>::From(static_cast<const Tensor>(temp_values));

-    e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
-    e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
+    EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
+        dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
+    EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
+        dev, e_values, e_tmp_values, slice_indices, slice_sizes);
  }
  return true;
 }