Commit 7cc5ae99 authored by Cao Ying, committed by GitHub

Merge pull request #4492 from QiJune/refine_some_functors

Pass DeviceContext to functors not ExecutionContext.
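In brief: the math functors (CrossEntropyFunctor, SoftmaxFunctor, and the new SetConstant) previously took a framework::ExecutionContext, coupling reusable math code to the operator framework. This change narrows their first parameter to platform::DeviceContext, with operator kernels forwarding ctx.device_context(), as the hunks below show. A standalone sketch of the idea, using hypothetical stub types rather than Paddle's real classes:

#include <iostream>

// Stub device abstraction: in Paddle this carries the Eigen device / CUDA stream.
struct DeviceContext {};

// Stub operator-level context: wraps a DeviceContext plus scope, attributes, etc.
struct ExecutionContext {
  const DeviceContext& device_context() const { return dev_; }
  DeviceContext dev_;
};

// Before: the functor took the wide interface and could only run inside an op.
void FunctorBefore(const ExecutionContext& ctx) {
  const DeviceContext& dev = ctx.device_context();  // the only part actually used
  (void)dev;
}

// After: the functor declares exactly the dependency it needs.
void FunctorAfter(const DeviceContext& dev) { (void)dev; }

int main() {
  ExecutionContext ctx;
  FunctorBefore(ctx);                  // old call site
  FunctorAfter(ctx.device_context());  // new call site forwards the device context
  std::cout << "ok" << std::endl;
  return 0;
}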
......
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
     platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.GetEigenDevice<platform::GPUPlace>();
 }
 #endif
......
@@ -296,21 +296,6 @@ template <>
 std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
     const std::string& name) const;
-template <typename T>
-struct EigenDeviceConverter;
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
 class ExecutionContext : public InferShapeContext {
  public:
   ExecutionContext(const OperatorBase& op, const Scope& scope,
......
@@ -318,8 +303,8 @@ class ExecutionContext : public InferShapeContext {
       : InferShapeContext(op, scope), device_context_(device_context) {}
   template <typename PlaceType,
-            typename DeviceType =
-                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+            typename DeviceType = typename platform::EigenDeviceConverter<
+                PlaceType>::EigenDeviceType>
   DeviceType& GetEigenDevice() const;
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
......
@@ -18,14 +18,6 @@ namespace paddle {
 namespace operators {
 namespace {
-// TODO(qingqing): make zero setting a common function.
-template <typename T>
-__global__ void Zero(T* X, const int N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    X[i] = 0.0;
-  }
-}
 template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
......
@@ -64,7 +56,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(ctx.GetPlace());
     math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        ctx, y, x, label, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, label, ctx.Attr<bool>("softLabel"));
   }
 };
......
@@ -99,11 +91,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
                            .stream()>>>(dx_data, dy_data, x_data, label_data,
                                         batch_size, class_num);
     } else {
-      Zero<T><<<grid, block, 0,
-                reinterpret_cast<const platform::CUDADeviceContext&>(
-                    ctx.device_context())
-                    .stream()>>>(dx_data, batch_size * class_num);
+      math::SetConstant<platform::GPUPlace, T>(ctx.device_context(), dx, 0);
       auto* label_data = label->data<int>();
       grid = (batch_size + block - 1) / block;
       CrossEntropyGradientKernel<T><<<
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
......
@@ -37,7 +38,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(ctx.GetPlace());
     math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        ctx, y, x, labels, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, labels, ctx.Attr<bool>("softLabel"));
   }
 };
......
@@ -69,8 +70,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     const int* label_data = label->data<int>();
-    // TODO(qingqing): make zero setting a common function.
-    memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+    math::SetConstant<platform::CPUPlace, T>(ctx.device_context(), dx, 0);
     for (int i = 0; i < batch_size; ++i) {
       PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
......
 if(WITH_GPU)
     nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
                im2col.cu DEPS cblas device_context operator)
+    nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax_function SRCS softmax.cc softmax.cu
                DEPS operator)
     nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
......
@@ -8,9 +9,9 @@ if(WITH_GPU)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc
                DEPS cblas device_context operator)
+    cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax_function SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
 endif()
-nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
......
@@ -26,8 +26,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 class CrossEntropyFunctor<platform::CPUPlace, T> {
  public:
-  void operator()(const framework::ExecutionContext& ctx,
-                  framework::Tensor* out, const framework::Tensor* prob,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
+                  const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel) {
     const int batch_size = prob->dims()[0];
     if (softLabel) {
......
@@ -35,7 +35,7 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
       auto lbl = EigenMatrix<T>::From(*labels);
       auto loss = EigenMatrix<T>::From(*out);
-      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      loss.device(*ctx.GetEigenDevice<platform::CPUPlace>()) =
           -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
                 .sum(Eigen::DSizes<int, 1>(1))
                 .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
......
@@ -74,8 +74,8 @@ using Tensor = framework::Tensor;
 template <typename T>
 class CrossEntropyFunctor<platform::GPUPlace, T> {
  public:
-  void operator()(const framework::ExecutionContext& ctx,
-                  framework::Tensor* out, const framework::Tensor* prob,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
+                  const framework::Tensor* prob,
                   const framework::Tensor* labels, bool softLabel) {
     const T* prob_data = prob->data<T>();
     T* loss_data = out->mutable_data<T>(ctx.GetPlace());
......
@@ -87,20 +87,18 @@ class CrossEntropyFunctor<platform::GPUPlace, T> {
       const T* label_data = labels->data<T>();
       int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
-      SoftCrossEntropyKernel<
-          T><<<batch_size, block, block * sizeof(T),
-               reinterpret_cast<const platform::CUDADeviceContext&>(
-                   ctx.device_context())
-                   .stream()>>>(loss_data, prob_data, label_data, class_num);
+      SoftCrossEntropyKernel<T><<<
+          batch_size, block, block * sizeof(T),
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+          loss_data, prob_data, label_data, class_num);
     } else {
       const int* label_data = labels->data<int>();
       int block = 512;
       int grid = (batch_size + block - 1) / block;
       CrossEntropyKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(loss_data, prob_data, label_data,
-                                           batch_size, class_num);
+          grid, block, 0,
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+          loss_data, prob_data, label_data, batch_size, class_num);
     }
   }
 };
......
@@ -37,9 +37,7 @@ struct TolerableValue {
 template <typename Place, typename T>
 class CrossEntropyFunctor {
  public:
-  // (TODO caoying) it is much better to use DeviceContext as the first
-  // parameter.
-  void operator()(const framework::ExecutionContext& context,
+  void operator()(const platform::DeviceContext& context,
                   framework::Tensor* out, const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel);
 };
......
@@ -52,6 +52,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 #include <cmath>
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
......
@@ -84,6 +85,13 @@ void matmul(const platform::DeviceContext& context,
             const framework::Tensor& matrix_b, bool trans_b, T alpha,
             framework::Tensor* matrix_out, T beta);
+template <typename Place, typename T>
+void SetConstant(const platform::DeviceContext& context,
+                 framework::Tensor* tensor, T num) {
+  auto t = framework::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(num));
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
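The SetConstant helper added above is the standard Eigen "assign a constant expression on a device" pattern behind a Place-keyed device lookup. A standalone sketch of that underlying pattern in plain Eigen, outside Paddle's wrappers (assumes Eigen's unsupported Tensor module; set_constant is an illustrative name, not part of the codebase):

#include <unsupported/Eigen/CXX11/Tensor>

// Map a raw buffer as a rank-1 Eigen tensor and evaluate a constant
// expression into it on a chosen device -- the same expression SetConstant
// builds via EigenVector<T>::Flatten and GetEigenDevice<Place>().
void set_constant(float* data, int n, float value) {
  Eigen::TensorMap<Eigen::Tensor<float, 1>> t(data, n);
  Eigen::DefaultDevice cpu;           // what GetEigenDevice<CPUPlace>() yields on CPU
  t.device(cpu) = t.constant(value);  // element-wise broadcast assignment
}

The test added below exercises the real helper on the CPU path.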
......
@@ -243,3 +243,24 @@ TEST(math_function, gemm_trans_clbas) {
   EXPECT_EQ(input3_ptr[6], 86);
   EXPECT_EQ(input3_ptr[7], 99);
 }
+TEST(math_function, zero) {
+  paddle::framework::Tensor tensor;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>(
+      context, &tensor, 0);
+  EXPECT_EQ(t[0], 0);
+  EXPECT_EQ(t[1], 0);
+  EXPECT_EQ(t[2], 0);
+  EXPECT_EQ(t[3], 0);
+  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>(
+      context, &tensor, 1);
+  EXPECT_EQ(t[0], 1);
+  EXPECT_EQ(t[1], 1);
+  EXPECT_EQ(t[2], 1);
+  EXPECT_EQ(t[3], 1);
+}
......
@@ -36,7 +36,7 @@ struct ValueClip {
 template <typename Place, typename T>
 class SoftmaxFunctor {
  public:
-  void operator()(const framework::ExecutionContext& context,
+  void operator()(const platform::DeviceContext& context,
                   const framework::Tensor* X, framework::Tensor* Y) {
     auto logits = EigenMatrix<T>::From(*X);
     auto softmax = EigenMatrix<T>::From(*Y);
......
@@ -58,8 +58,8 @@ class SoftmaxFunctor {
                        .broadcast(one_by_class))
            .unaryExpr(ValueClip<T>());
-    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
-    softmax.device(context.GetEigenDevice<Place>()) =
+    softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
+    softmax.device(*context.GetEigenDevice<Place>()) =
        (softmax *
         softmax.sum(along_class)
            .inverse()
@@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Y->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<Place, T>()(context, X, Y);
+    math::SoftmaxFunctor<Place, T>()(context.device_context(), X, Y);
   }
 };
......
@@ -66,9 +66,11 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<platform::GPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::GPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
     math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("softLabel"));
   }
 };
......
@@ -40,9 +40,11 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
-    math::SoftmaxFunctor<platform::CPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::CPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
     math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("softLabel"));
   }
 };
......
@@ -16,8 +16,8 @@ namespace paddle {
 namespace platform {
 template <>
-Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
-    const {
+Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
   return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }
......
@@ -37,6 +37,12 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 #ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice*
+DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+}
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:
   EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
......
@@ -90,11 +96,6 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   mutable unsigned int* semaphore_;
 };
-template <>
-Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
-}
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
......
@@ -27,13 +27,23 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
+template <typename T>
+struct EigenDeviceConverter;
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
   virtual Place GetPlace() const = 0;
-  template <typename DeviceType>
-  DeviceType* get_eigen_device() const;
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
   virtual void Wait() const {}
 };
......
@@ -52,6 +62,11 @@ class CPUDeviceContext : public DeviceContext {
 };
 #ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
 class EigenCudaStreamDevice;
 class CUDADeviceContext : public DeviceContext {
......
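Moving EigenDeviceConverter into platform lets DeviceContext::GetEigenDevice<PlaceType>() deduce its return type from the place alone. A standalone sketch of the trait technique at work, with stub types standing in for Paddle's and Eigen's real classes (all names here are illustrative):

#include <type_traits>

struct CPUPlace {};       // stand-ins for paddle::platform places
struct GPUPlace {};
struct DefaultDevice {};  // stand-ins for Eigen::DefaultDevice / Eigen::GpuDevice
struct GpuDevice {};

// Primary template left undefined: using an unmapped place is a compile error.
template <typename Place>
struct EigenDeviceConverter;

template <>
struct EigenDeviceConverter<CPUPlace> {
  using EigenDeviceType = DefaultDevice;
};

template <>
struct EigenDeviceConverter<GPUPlace> {
  using EigenDeviceType = GpuDevice;
};

// The defaulted second parameter means call sites name only the place,
// e.g. GetEigenDevice<CPUPlace>(), and get the matching device type back.
template <typename PlaceType,
          typename DeviceType =
              typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType* GetEigenDevice() {
  static DeviceType device;  // placeholder; the real method returns the
  return &device;            // context's eigen_device()
}

static_assert(
    std::is_same<decltype(GetEigenDevice<GPUPlace>()), GpuDevice*>::value,
    "GPUPlace resolves to the GPU Eigen device");

int main() { return GetEigenDevice<CPUPlace>() != nullptr ? 0 : 1; }

Leaving the primary template undefined is the design choice doing the work: a request for an unmapped place fails at compile time rather than at run time.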
......
@@ -24,7 +24,7 @@ TEST(Device, Init) {
   for (int i = 0; i < count; i++) {
     DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
     Eigen::GpuDevice* gpu_device =
-        device_context->template get_eigen_device<Eigen::GpuDevice>();
+        device_context->template GetEigenDevice<GPUPlace>();
     ASSERT_NE(nullptr, gpu_device);
     delete device_context;
   }
......