Unverified commit b67ded04, authored by Leo Chen, committed by GitHub

Support gradient accumulation of fp16 in imperative mode (#24823)

* support gradient accumulation of fp16 in imperative mode, test=develop

* enhance coverage test, test=develop

* follow comments, test=develop
Parent 1e190a9e
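For context before the diff: the operation this patch extends to fp16 is a plain in-place, elementwise accumulation, dst = dst + src, applied to a parameter's gradient on every backward pass. A minimal standalone sketch of that semantics follows (plain C++, with float standing in for platform::float16; an illustration, not Paddle code):

#include <cassert>
#include <cstddef>
#include <vector>

// In-place elementwise accumulation, dst = dst + src, mirroring the
// ElementwiseAddTo semantics added by this patch. float stands in for
// platform::float16 here.
void AccumulateGrad(const std::vector<float>& src, std::vector<float>* dst) {
  assert(src.size() == dst->size());
  for (std::size_t i = 0; i < src.size(); ++i) {
    (*dst)[i] += src[i];
  }
}

int main() {
  std::vector<float> accumulated(10, 2.0f);  // gradient accumulated so far
  std::vector<float> new_grad(10, 1.0f);     // gradient from the current backward pass
  AccumulateGrad(new_grad, &accumulated);    // every element is now 3.0f
  return 0;
}

In the patch itself the same dst = dst + src is carried out by the new ElementwiseAddTo functor shown further down.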
@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
add_subdirectory(jit)
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
......
@@ -24,6 +24,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
T* y_;
};
template <typename DeviceContext, typename T>
void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, src, dst);
}
void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>();
@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
#undef PADDLE_TENSOR_ADD
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
#endif
} else if (platform::is_cpu_place(place)) {
return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
}
}
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
}
void SelectedRowsAddToTensor(const framework::Variable& src,
......
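To summarize the hunk above: float and double gradients keep using the existing PADDLE_TENSOR_ADD macro (the BLAS-backed TensorAddFunctor), while the new FP16 branch dispatches to the Eigen-based ElementwiseAddTo via TensorAddImpl, selecting CUDADeviceContext or CPUDeviceContext from the place. A rough standalone sketch of that dispatch shape, using stand-in enums rather than Paddle's VarType and place types:

#include <stdexcept>

// Stand-ins for framework::proto::VarType and Paddle's place checks; the real
// code switches on data_type and platform::is_gpu_place / is_cpu_place.
enum class DType { FP32, FP64, FP16 };
enum class Place { CPU, GPU, CUDAPinned };

void BlasAxpyAdd() { /* existing PADDLE_TENSOR_ADD path for FP32/FP64 */ }
void EigenAddGPU() { /* TensorAddImpl<CUDADeviceContext, float16> */ }
void EigenAddCPU() { /* TensorAddImpl<CPUDeviceContext, float16> */ }

void TensorAddDispatchSketch(DType dtype, Place place) {
  if (dtype == DType::FP32 || dtype == DType::FP64) {
    BlasAxpyAdd();
    return;
  }
  if (dtype == DType::FP16) {  // branch added by this patch
    if (place == Place::GPU) {
      EigenAddGPU();           // only available when built with PADDLE_WITH_CUDA
      return;
    }
    if (place == Place::CPU) {
      EigenAddCPU();
      return;
    }
  }
  // anything else (e.g. CUDAPinned, or an unsupported dtype) throws
  throw std::runtime_error("gradient accumulation not supported for this dtype/place");
}

int main() {
  TensorAddDispatchSketch(DType::FP16, Place::CPU);  // takes the new FP16 CPU branch
  return 0;
}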
@@ -13,6 +13,7 @@
// limitations under the License.
#include <memory>
#include <type_traits>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h"
@@ -27,9 +28,8 @@ namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA)
template <typename T>
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
template <typename Place, typename T>
int TensorddTest(Place place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
if (!std::is_same<Place, platform::CUDAPlace>::value) {
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
#if defined(PADDLE_WITH_CUDA)
} else {
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
#endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) {
platform::CPUPlace cpu_place;
int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0));
EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA)
int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
TEST(test_add_functor, execption) {
platform::CUDAPinnedPlace cuda_pinned_place;
platform::CUDAPlace cuda_place(0);
platform::CPUPlace cpu_place;
ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
#if defined(PADDLE_WITH_CUDA)
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0)));
#endif
}
......
@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
template struct RowwiseMean<platform::CUDADeviceContext, float>;
template struct RowwiseMean<platform::CUDADeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
void operator()(platform::CUDADeviceContext* ctx,
const framework::Tensor& src, framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
@@ -51,6 +51,13 @@ struct RowwiseAdd {
const framework::Tensor& vec, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
struct ElementwiseAddTo {
// dst = dst + src
void operator()(DeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst);
};
template <typename DeviceContext, typename T>
struct ColwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
......