From b67ded04f26941d102b11eb8e8645fd5e6ee2b2f Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 3 Jun 2020 10:25:27 +0800
Subject: [PATCH] Support gradient accumulation of fp16 in imperative mode
 (#24823)

* support gradient accumulation of fp16 in imperative mode, test=develop

* enhance coverage test, test=develop

* follow comments, test=develop
---
 paddle/fluid/imperative/CMakeLists.txt        |  2 +-
 .../fluid/imperative/gradient_accumulator.cc  | 43 ++++++++-
 .../tests/test_gradient_accmulator.cc         | 87 +++++++++----------
 paddle/fluid/operators/math/math_function.cc  | 13 +++
 paddle/fluid/operators/math/math_function.cu  | 13 +++
 paddle/fluid/operators/math/math_function.h   |  7 ++
 6 files changed, 115 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 0403cf25c7..2a9e559d0c 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
 add_subdirectory(jit)
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index e8ea7dc926..f5fc594470 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #else
   void operator()(const platform::CUDAPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
   }
 #endif
 
   // there is NO blas in CUDAPinnedPlace
   void operator()(const platform::CUDAPinnedPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
   }
 
  private:
@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
   T* y_;
 };
 
+template <typename DeviceContext, typename T>
+void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
+                   const platform::Place& place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  paddle::platform::DeviceContext* ctx = pool.Get(place);
+  auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
+  operators::math::ElementwiseAddTo<DeviceContext, T> func;
+  func(dev_ctx, src, dst);
+}
+
 void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
   auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
   auto& src_tensor = src.Get<framework::LoDTensor>();
@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
 
 #undef PADDLE_TENSOR_ADD
 
-  PADDLE_THROW("Not supported data type %s for AddTo",
-               framework::DataTypeToString(data_type));
+  if (data_type == framework::proto::VarType::FP16) {
+    if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+      return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
+          src_tensor, dst_tensor, place);
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Gradient accumulation of data type (%s) on place (%s) is not "
+          "supported in imperative mode",
+          framework::DataTypeToString(data_type), place));
+#endif
+    } else if (platform::is_cpu_place(place)) {
+      return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
+          src_tensor, dst_tensor, place);
+    }
+  }
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Gradient accumulation of data type (%s) on place (%s) is not "
+      "supported in imperative mode",
+      framework::DataTypeToString(data_type), place));
 }
 
 void SelectedRowsAddToTensor(const framework::Variable& src,
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 0d567f5d05..49bc24edba 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+#include <type_traits>
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/variable.h"
@@ -27,9 +28,8 @@ namespace imperative {
 
 void TensorAdd(const framework::Variable& src, framework::Variable* dst);
 
-#if defined(PADDLE_WITH_CUDA)
-template <typename T>
-int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
+template <typename Place, typename T>
+int TensorddTest(Place place, T t1, T t2) {
   framework::Variable var1;
   framework::Variable var2;
   std::vector<T> src_data(10, t1);
@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
   for (unsigned int i = 0; i < 10; i++) {
     result.emplace_back(src_data[i] + dst_data[i]);
   }
+
   std::vector<int64_t> dims = {2, 5};
   auto* src = var1.GetMutable<framework::LoDTensor>();
   auto* dst = var2.GetMutable<framework::LoDTensor>();
@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
   dst->Resize(framework::make_ddim(dims));
   auto* src_mutable = src->mutable_data<T>(place);
   auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size(), 0);
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size(), 0);
-  imperative::TensorAdd(var1, &var2);
-  framework::LoDTensor rlt;
-  platform::CPUPlace rlt_place;
-  framework::TensorCopySync(*dst, rlt_place, &rlt);
-
-  for (unsigned int i = 0; i < rlt.numel(); i++) {
-    if (rlt.data<T>()[i] != result[i]) return 1;
-  }
-  return 0;
-}
+  if (!std::is_same<Place, platform::CUDAPlace>::value) {
+    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+                         sizeof(T) * src_data.size());
+    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
+                         sizeof(T) * dst_data.size());
+#if defined(PADDLE_WITH_CUDA)
+  } else {
+    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+                         sizeof(T) * src_data.size(), 0);
+    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
+                         sizeof(T) * dst_data.size(), 0);
 #endif
-
-template <typename T>
-int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
-  framework::Variable var1;
-  framework::Variable var2;
-  std::vector<T> src_data(10, t1);
-  std::vector<T> dst_data(10, t2);
-  std::vector<T> result;
-  platform::CPUPlace src_place;
-  for (unsigned int i = 0; i < 10; i++) {
-    result.emplace_back(src_data[i] + dst_data[i]);
-  }
-  std::vector<int64_t> dims = {2, 5};
-  auto* src = var1.GetMutable<framework::LoDTensor>();
-  auto* dst = var2.GetMutable<framework::LoDTensor>();
-  src->Resize(framework::make_ddim(dims));
-  dst->Resize(framework::make_ddim(dims));
-  auto* src_mutable = src->mutable_data<T>(place);
-  auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size());
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size());
+  }
   imperative::TensorAdd(var1, &var2);
   framework::LoDTensor rlt;
   platform::CPUPlace rlt_place;
@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
   for (unsigned int i = 0; i < rlt.numel(); i++) {
     if (rlt.data<T>()[i] != result[i]) return 1;
   }
+
   return 0;
 }
 
@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) {
   platform::CPUPlace cpu_place;
 
   int cpu_res = 1;
-  cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
+  cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
+  EXPECT_EQ(cpu_res, 0);
+  cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
+                         static_cast<double>(2.0));
   EXPECT_EQ(cpu_res, 0);
-  cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
+  cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
+                         static_cast<platform::float16>(2.0));
   EXPECT_EQ(cpu_res, 0);
 #if defined(PADDLE_WITH_CUDA)
   int gpu_res = 1;
-  gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
+  gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
   EXPECT_EQ(gpu_res, 0);
-  gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
+  gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
+                         static_cast<double>(2.0));
   EXPECT_EQ(gpu_res, 0);
+  gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
+                         static_cast<platform::float16>(2.0));
+  EXPECT_EQ(gpu_res, 0);
+#endif
+}
+
+TEST(test_add_functor, execption) {
+  platform::CUDAPinnedPlace cuda_pinned_place;
+  platform::CUDAPlace cuda_place(0);
+  platform::CPUPlace cpu_place;
+
+  ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
+#if defined(PADDLE_WITH_CUDA)
+  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
+  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
+                                static_cast<platform::float16>(1.0),
+                                static_cast<platform::float16>(2.0)));
 #endif
 }
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index e1491a8156..44b0410441 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>;
 template struct RowwiseMean<platform::CPUDeviceContext, float>;
 template struct RowwiseMean<platform::CPUDeviceContext, double>;
 
+template <typename T>
+struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
+  void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
+                  framework::Tensor* dst) {
+    auto in = framework::EigenVector<T>::Flatten(src);
+    auto out = framework::EigenVector<T>::Flatten(*dst);
+    auto& place = *(ctx->eigen_device());
+    out.device(place) = out + in;
+  }
+};
+
+template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index fdbd77a5c8..235bbb57ed 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
 template struct RowwiseMean<platform::CUDADeviceContext, float>;
 template struct RowwiseMean<platform::CUDADeviceContext, double>;
 
+template <typename T>
+struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
+  void operator()(platform::CUDADeviceContext* ctx,
+                  const framework::Tensor& src, framework::Tensor* dst) {
+    auto in = framework::EigenVector<T>::Flatten(src);
+    auto out = framework::EigenVector<T>::Flatten(*dst);
+    auto& place = *(ctx->eigen_device());
+    out.device(place) = out + in;
+  }
+};
+
+template struct ElementwiseAddTo<platform::CUDADeviceContext, platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index b4f19417b6..333552a0c1 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -51,6 +51,13 @@ struct RowwiseAdd {
                   const framework::Tensor& vec, framework::Tensor* output);
 };
 
+template <typename DeviceContext, typename T>
+struct ElementwiseAddTo {
+  // dst = dst + src
+  void operator()(DeviceContext* ctx, const framework::Tensor& src,
+                  framework::Tensor* dst);
+};
+
 template <typename DeviceContext, typename T>
 struct ColwiseSum {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
--
GitLab
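
Usage sketch (illustrative, not part of the patch): the snippet below exercises the code path this change enables, accumulating one float16 LoDTensor gradient into another on CPU through imperative::TensorAdd, in the same way the TensorddTest helper in the test above does. It assumes a Paddle source build; TensorAdd is forward-declared, as the test does, because it is not exposed through a public header, and the variable names are invented for the example.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace imperative {
// Defined in paddle/fluid/imperative/gradient_accumulator.cc.
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
}  // namespace imperative
}  // namespace paddle

int main() {
  using paddle::platform::float16;
  paddle::platform::CPUPlace place;

  // Two 2x5 float16 tensors standing in for an incoming gradient and the
  // accumulated gradient buffer.
  paddle::framework::Variable grad_src, grad_sum;
  auto* src = grad_src.GetMutable<paddle::framework::LoDTensor>();
  auto* dst = grad_sum.GetMutable<paddle::framework::LoDTensor>();
  src->Resize(paddle::framework::make_ddim({2, 5}));
  dst->Resize(paddle::framework::make_ddim({2, 5}));
  auto* src_data = src->mutable_data<float16>(place);
  auto* dst_data = dst->mutable_data<float16>(place);
  for (int i = 0; i < 10; ++i) {
    src_data[i] = static_cast<float16>(1.0);  // incoming gradient
    dst_data[i] = static_cast<float16>(2.0);  // previously accumulated value
  }

  // Before this patch an FP16 tensor fell through to the "not supported"
  // throw in TensorAdd; it now dispatches to
  // ElementwiseAddTo<CPUDeviceContext, float16>, so every element of dst
  // becomes 3.0.
  paddle::imperative::TensorAdd(grad_src, &grad_sum);
  return 0;
}

The new functor is Eigen-based presumably because the existing TensorAddFunctor path relies on BLAS (see the blas dependency in CMakeLists.txt and the "there is NO blas in CUDAPinnedPlace" comment), and that path has no float16 routine to fall back on.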