未验证 提交 b67ded04 编写于 作者: L Leo Chen 提交者: GitHub

Support gradient accumulation of fp16 in imperative mode (#24823)

* support gradient accumulation of fp16 in imperative mode, test=develop

* enhance coverage test, test=develop

* follow comments, test=develop
上级 1e190a9e
...@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) ...@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer) cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
add_subdirectory(jit) add_subdirectory(jit)
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> {
} }
#else #else
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place); PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
} }
#endif #endif
// there is NO blas in CUDAPinnedPlace // there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) { void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place); PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
} }
private: private:
...@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
T* y_; T* y_;
}; };
// Accumulates `src` into `dst` (dst = dst + src) by invoking
// operators::math::ElementwiseAddTo<DeviceContext, T> with the device context
// that the global DeviceContextPool returns for `place`.
//
// NOTE(review): the result of the dynamic_cast is forwarded without a null
// check; this presumably relies on the caller picking a DeviceContext type
// that matches `place` -- confirm against the call sites.
template <typename DeviceContext, typename T>
void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
                   const platform::Place& place) {
  auto& ctx_pool = platform::DeviceContextPool::Instance();
  // Narrow the pool's generic context to the concrete type the functor takes.
  auto* typed_ctx = dynamic_cast<DeviceContext*>(ctx_pool.Get(place));
  operators::math::ElementwiseAddTo<DeviceContext, T> add_to;
  add_to(typed_ctx, src, dst);
}
void TensorAdd(const framework::Variable& src, framework::Variable* dst) { void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>(); auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>(); auto& src_tensor = src.Get<framework::LoDTensor>();
...@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
#undef PADDLE_TENSOR_ADD #undef PADDLE_TENSOR_ADD
PADDLE_THROW("Not supported data type %s for AddTo", if (data_type == framework::proto::VarType::FP16) {
framework::DataTypeToString(data_type)); if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
#endif
} else if (platform::is_cpu_place(place)) {
return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
}
}
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
} }
void SelectedRowsAddToTensor(const framework::Variable& src, void SelectedRowsAddToTensor(const framework::Variable& src,
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <memory> #include <memory>
#include <type_traits>
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
...@@ -27,9 +28,8 @@ namespace imperative { ...@@ -27,9 +28,8 @@ namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst); void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA) template <typename Place, typename T>
template <typename T> int TensorddTest(Place place, T t1, T t2) {
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
framework::Variable var1; framework::Variable var1;
framework::Variable var2; framework::Variable var2;
std::vector<T> src_data(10, t1); std::vector<T> src_data(10, t1);
...@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) { ...@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
for (unsigned int i = 0; i < 10; i++) { for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]); result.emplace_back(src_data[i] + dst_data[i]);
} }
std::vector<int64_t> dims = {2, 5}; std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>(); auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>(); auto* dst = var2.GetMutable<framework::LoDTensor>();
...@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) { ...@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
dst->Resize(framework::make_ddim(dims)); dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place); auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place); auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), if (!std::is_same<Place, platform::CUDAPlace>::value) {
sizeof(T) * src_data.size(), 0); paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), sizeof(T) * src_data.size());
sizeof(T) * dst_data.size(), 0); paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
imperative::TensorAdd(var1, &var2); sizeof(T) * dst_data.size());
framework::LoDTensor rlt; #if defined(PADDLE_WITH_CUDA)
platform::CPUPlace rlt_place; } else {
framework::TensorCopySync(*dst, rlt_place, &rlt); paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
for (unsigned int i = 0; i < rlt.numel(); i++) { paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
if (rlt.data<T>()[i] != result[i]) return 1; sizeof(T) * dst_data.size(), 0);
}
return 0;
}
#endif #endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
} }
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2); imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt; framework::LoDTensor rlt;
platform::CPUPlace rlt_place; platform::CPUPlace rlt_place;
...@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) { ...@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
for (unsigned int i = 0; i < rlt.numel(); i++) { for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1; if (rlt.data<T>()[i] != result[i]) return 1;
} }
return 0; return 0;
} }
...@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) { ...@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) {
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
int cpu_res = 1; int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0); cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0); EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0), cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
static_cast<double>(2.0)); static_cast<platform::float16>(2.0));
EXPECT_EQ(cpu_res, 0); EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
int gpu_res = 1; int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0); gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0); EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0), gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0)); static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0); EXPECT_EQ(gpu_res, 0);
gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
// Checks that the TensorddTest helper throws for inputs that are expected to
// be rejected:
//   * int data on CPUPlace, and
//   * (CUDA builds only) double and float16 data on CUDAPinnedPlace.
TEST(test_add_functor, execption) {
  platform::CPUPlace cpu_place;
  ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));

#if defined(PADDLE_WITH_CUDA)
  // Declared inside the CUDA-only branch because it is only used here; this
  // keeps non-CUDA builds free of an unused local.
  platform::CUDAPinnedPlace cuda_pinned_place;
  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
                                static_cast<platform::float16>(1.0),
                                static_cast<platform::float16>(2.0)));
#endif
}
......
...@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>; ...@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>; template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>; template struct RowwiseMean<platform::CPUDeviceContext, double>;
// CPU specialization of ElementwiseAddTo: views both tensors through
// framework::EigenVector<T>::Flatten and evaluates dst = dst + src on the
// Eigen device owned by the given context.
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
  void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
                  framework::Tensor* dst) {
    auto addend = framework::EigenVector<T>::Flatten(src);
    auto accum = framework::EigenVector<T>::Flatten(*dst);
    auto& eigen_dev = *(ctx->eigen_device());
    // The result is written back into dst's own storage.
    accum.device(eigen_dev) = accum + addend;
  }
};

// Explicit instantiation for float16.
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
template struct RowwiseMean<platform::CUDADeviceContext, float>; template struct RowwiseMean<platform::CUDADeviceContext, float>;
template struct RowwiseMean<platform::CUDADeviceContext, double>; template struct RowwiseMean<platform::CUDADeviceContext, double>;
// CUDA specialization of ElementwiseAddTo: views both tensors through
// framework::EigenVector<T>::Flatten and evaluates dst = dst + src on the
// Eigen device owned by the given CUDA context.
template <typename T>
struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
  void operator()(platform::CUDADeviceContext* ctx,
                  const framework::Tensor& src, framework::Tensor* dst) {
    auto addend = framework::EigenVector<T>::Flatten(src);
    auto accum = framework::EigenVector<T>::Flatten(*dst);
    auto& eigen_dev = *(ctx->eigen_device());
    // The result is written back into dst's own storage.
    accum.device(eigen_dev) = accum + addend;
  }
};

// Explicit instantiation for float16.
template struct ElementwiseAddTo<platform::CUDADeviceContext,
                                 platform::float16>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -51,6 +51,13 @@ struct RowwiseAdd { ...@@ -51,6 +51,13 @@ struct RowwiseAdd {
const framework::Tensor& vec, framework::Tensor* output); const framework::Tensor& vec, framework::Tensor* output);
}; };
// Functor that accumulates one tensor into another in place.
// Only the call operator is declared here; its definitions are supplied
// separately for specific DeviceContext types.
template <typename DeviceContext, typename T>
struct ElementwiseAddTo {
// dst = dst + src
// `ctx` is the device context to run on, `src` is the tensor being added,
// and `dst` is both the other operand and the destination of the result.
void operator()(DeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct ColwiseSum { struct ColwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册