From b67ded04f26941d102b11eb8e8645fd5e6ee2b2f Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 3 Jun 2020 10:25:27 +0800
Subject: [PATCH] Support gradient accumulation of fp16 in imperative mode
 (#24823)

* support gradient accumulation of fp16 in imperative mode, test=develop

* enhance coverage test, test=develop

* follow comments, test=develop
---
 paddle/fluid/imperative/CMakeLists.txt        |  2 +-
 .../fluid/imperative/gradient_accumulator.cc  | 43 ++++++++-
 .../tests/test_gradient_accmulator.cc         | 87 +++++++++----------
 paddle/fluid/operators/math/math_function.cc  | 13 +++
 paddle/fluid/operators/math/math_function.cu  | 13 +++
 paddle/fluid/operators/math/math_function.h   |  7 ++
 6 files changed, 115 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 0403cf25c7..2a9e559d0c 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
 add_subdirectory(jit)
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index e8ea7dc926..f5fc594470 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #else
   void operator()(const platform::CUDAPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
   }
 #endif
 
   // there is NO blas in CUDAPinnedPlace
   void operator()(const platform::CUDAPinnedPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
   }
 
  private:
@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
   T* y_;
 };
 
+template <typename DeviceContext, typename T>
+void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
+                   const platform::Place& place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  paddle::platform::DeviceContext* ctx = pool.Get(place);
+  auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
+  operators::math::ElementwiseAddTo<DeviceContext, T> func;
+  func(dev_ctx, src, dst);
+}
+
 void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
   auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
   auto& src_tensor = src.Get<framework::LoDTensor>();
@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
 
 #undef PADDLE_TENSOR_ADD
 
-  PADDLE_THROW("Not supported data type %s for AddTo",
-               framework::DataTypeToString(data_type));
+  if (data_type == framework::proto::VarType::FP16) {
+    if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+      return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
+          src_tensor, dst_tensor, place);
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Gradient accumulation of data type (%s) on place (%s) is not "
+          "supported in imperative mode",
+          framework::DataTypeToString(data_type), place));
+#endif
+    } else if (platform::is_cpu_place(place)) {
+      return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
+          src_tensor, dst_tensor, place);
+    }
+  }
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Gradient accumulation of data type (%s) on place (%s) is not "
+      "supported in imperative mode",
+      framework::DataTypeToString(data_type), place));
 }
 
 void SelectedRowsAddToTensor(const framework::Variable& src,
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 0d567f5d05..49bc24edba 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+#include <type_traits>
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/variable.h"
@@ -27,9 +28,8 @@ namespace imperative {
 
 void TensorAdd(const framework::Variable& src, framework::Variable* dst);
 
-#if defined(PADDLE_WITH_CUDA)
-template <typename T>
-int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
+template <typename Place, typename T>
+int TensorddTest(Place place, T t1, T t2) {
   framework::Variable var1;
   framework::Variable var2;
   std::vector<T> src_data(10, t1);
@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
   for (unsigned int i = 0; i < 10; i++) {
     result.emplace_back(src_data[i] + dst_data[i]);
   }
+
   std::vector<int64_t> dims = {2, 5};
   auto* src = var1.GetMutable<framework::LoDTensor>();
   auto* dst = var2.GetMutable<framework::LoDTensor>();
@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
   dst->Resize(framework::make_ddim(dims));
   auto* src_mutable = src->mutable_data<T>(place);
   auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size(), 0);
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size(), 0);
-  imperative::TensorAdd(var1, &var2);
-  framework::LoDTensor rlt;
-  platform::CPUPlace rlt_place;
-  framework::TensorCopySync(*dst, rlt_place, &rlt);
-
-  for (unsigned int i = 0; i < rlt.numel(); i++) {
-    if (rlt.data<T>()[i] != result[i]) return 1;
-  }
-  return 0;
-}
+  if (!std::is_same<Place, platform::CUDAPlace>::value) {
+    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+                         sizeof(T) * src_data.size());
+    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
+                         sizeof(T) * dst_data.size());
+#if defined(PADDLE_WITH_CUDA)
+  } else {
+    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+                         sizeof(T) * src_data.size(), 0);
+    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
+                         sizeof(T) * dst_data.size(), 0);
 #endif
-
-template <typename T>
-int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
-  framework::Variable var1;
-  framework::Variable var2;
-  std::vector<T> src_data(10, t1);
-  std::vector<T> dst_data(10, t2);
-  std::vector<T> result;
-  platform::CPUPlace src_place;
-  for (unsigned int i = 0; i < 10; i++) {
-    result.emplace_back(src_data[i] + dst_data[i]);
-  }
-  std::vector<int64_t> dims = {2, 5};
-  auto* src = var1.GetMutable<framework::LoDTensor>();
-  auto* dst = var2.GetMutable<framework::LoDTensor>();
-  src->Resize(framework::make_ddim(dims));
-  dst->Resize(framework::make_ddim(dims));
-  auto* src_mutable = src->mutable_data<T>(place);
-  auto* dst_mutable = dst->mutable_data<T>(place);
-  paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
-                       sizeof(T) * src_data.size());
-  paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                       sizeof(T) * dst_data.size());
+  }
   imperative::TensorAdd(var1, &var2);
   framework::LoDTensor rlt;
   platform::CPUPlace rlt_place;
@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
   for (unsigned int i = 0; i < rlt.numel(); i++) {
     if (rlt.data<T>()[i] != result[i]) return 1;
   }
+
   return 0;
 }
 
@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) {
   platform::CPUPlace cpu_place;
 
   int cpu_res = 1;
-  cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
+  cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
+  EXPECT_EQ(cpu_res, 0);
+  cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
+                         static_cast<double>(2.0));
   EXPECT_EQ(cpu_res, 0);
-  cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
+  cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
+                         static_cast<platform::float16>(2.0));
   EXPECT_EQ(cpu_res, 0);
 #if defined(PADDLE_WITH_CUDA)
   int gpu_res = 1;
-  gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
+  gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
   EXPECT_EQ(gpu_res, 0);
-  gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
-                             static_cast<double>(2.0));
+  gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
+                         static_cast<double>(2.0));
   EXPECT_EQ(gpu_res, 0);
+  gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
+                         static_cast<platform::float16>(2.0));
+  EXPECT_EQ(gpu_res, 0);
+#endif
+}
+
+TEST(test_add_functor, execption) {
+  platform::CUDAPinnedPlace cuda_pinned_place;
+  platform::CUDAPlace cuda_place(0);
+  platform::CPUPlace cpu_place;
+
+  ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
+#if defined(PADDLE_WITH_CUDA)
+  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
+  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
+                                static_cast<platform::float16>(1.0),
+                                static_cast<platform::float16>(2.0)));
 #endif
 }
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index e1491a8156..44b0410441 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>;
 template struct RowwiseMean<platform::CPUDeviceContext, float>;
 template struct RowwiseMean<platform::CPUDeviceContext, double>;
 
+template <typename T>
+struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
+  void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
+                  framework::Tensor* dst) {
+    auto in = framework::EigenVector<T>::Flatten(src);
+    auto out = framework::EigenVector<T>::Flatten(*dst);
+    auto& place = *(ctx->eigen_device());
+    out.device(place) = out + in;
+  }
+};
+
+template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index fdbd77a5c8..235bbb57ed 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
 template struct RowwiseMean<platform::CUDADeviceContext, float>;
 template struct RowwiseMean<platform::CUDADeviceContext, double>;
 
+template <typename T>
+struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
+  void operator()(platform::CUDADeviceContext* ctx,
+                  const framework::Tensor& src, framework::Tensor* dst) {
+    auto in = framework::EigenVector<T>::Flatten(src);
+    auto out = framework::EigenVector<T>::Flatten(*dst);
+    auto& place = *(ctx->eigen_device());
+    out.device(place) = out + in;
+  }
+};
+
+template struct ElementwiseAddTo<platform::CUDADeviceContext, platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index b4f19417b6..333552a0c1 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -51,6 +51,13 @@ struct RowwiseAdd {
                   const framework::Tensor& vec, framework::Tensor* output);
 };
 
+template <typename DeviceContext, typename T>
+struct ElementwiseAddTo {
+  // dst = dst + src
+  void operator()(DeviceContext* ctx, const framework::Tensor& src,
+                  framework::Tensor* dst);
+};
+
 template <typename DeviceContext, typename T>
 struct ColwiseSum {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
--
GitLab
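
Usage sketch (illustrative, not part of the patch): the snippet below exercises the code path this change enables, accumulating one float16 LoDTensor gradient into another on CPU through imperative::TensorAdd, in the same way the TensorddTest helper in the test above does. It assumes a Paddle source build; TensorAdd is forward-declared, as the test does, because it is not exposed through a public header, and the variable names are invented for the example.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace imperative {
// Defined in paddle/fluid/imperative/gradient_accumulator.cc.
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
}  // namespace imperative
}  // namespace paddle

int main() {
  using paddle::platform::float16;
  paddle::platform::CPUPlace place;

  // Two 2x5 float16 tensors standing in for an incoming gradient and the
  // accumulated gradient buffer.
  paddle::framework::Variable grad_src, grad_sum;
  auto* src = grad_src.GetMutable<paddle::framework::LoDTensor>();
  auto* dst = grad_sum.GetMutable<paddle::framework::LoDTensor>();
  src->Resize(paddle::framework::make_ddim({2, 5}));
  dst->Resize(paddle::framework::make_ddim({2, 5}));
  auto* src_data = src->mutable_data<float16>(place);
  auto* dst_data = dst->mutable_data<float16>(place);
  for (int i = 0; i < 10; ++i) {
    src_data[i] = static_cast<float16>(1.0);  // incoming gradient
    dst_data[i] = static_cast<float16>(2.0);  // previously accumulated value
  }

  // Before this patch an FP16 tensor fell through to the "not supported"
  // throw in TensorAdd; it now dispatches to
  // ElementwiseAddTo<CPUDeviceContext, float16>, so every element of dst
  // becomes 3.0.
  paddle::imperative::TensorAdd(grad_src, &grad_sum);
  return 0;
}

The new functor is Eigen-based presumably because the existing TensorAddFunctor path relies on BLAS (see the blas dependency in CMakeLists.txt and the "there is NO blas in CUDAPinnedPlace" comment), and that path has no float16 routine to fall back on.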