Unverified commit b67ded04, authored by Leo Chen, committed by GitHub

Support gradient accumulation of fp16 in imperative mode (#24823)

* support gradient accumulation of fp16 in imperative mode, test=develop

* enhance coverage test, test=develop

* follow comments, test=develop
Parent 1e190a9e
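For context before the diff: the operation this patch extends to fp16 is a plain in-place, elementwise accumulation, dst = dst + src, applied to a parameter's gradient on every backward pass. A minimal standalone sketch of that semantics follows (plain C++, with float standing in for platform::float16; an illustration, not Paddle code):

#include <cassert>
#include <cstddef>
#include <vector>

// In-place elementwise accumulation, dst = dst + src, mirroring the
// ElementwiseAddTo semantics added by this patch. float stands in for
// platform::float16 here.
void AccumulateGrad(const std::vector<float>& src, std::vector<float>* dst) {
  assert(src.size() == dst->size());
  for (std::size_t i = 0; i < src.size(); ++i) {
    (*dst)[i] += src[i];
  }
}

int main() {
  std::vector<float> accumulated(10, 2.0f);  // gradient accumulated so far
  std::vector<float> new_grad(10, 1.0f);     // gradient from the current backward pass
  AccumulateGrad(new_grad, &accumulated);    // every element is now 3.0f
  return 0;
}

In the patch itself the same dst = dst + src is carried out by the new ElementwiseAddTo functor shown further down.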
@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
add_subdirectory(jit)
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
......
@@ -24,6 +24,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
@@ -85,13 +86,19 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
@@ -100,6 +107,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
T* y_;
};
template <typename DeviceContext, typename T>
void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, src, dst);
}
void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>();
@@ -133,8 +150,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
#undef PADDLE_TENSOR_ADD
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
#endif
} else if (platform::is_cpu_place(place)) {
return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
src_tensor, dst_tensor, place);
}
}
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type), place));
}
void SelectedRowsAddToTensor(const framework::Variable& src,
......
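To summarize the hunk above: float and double gradients keep using the existing PADDLE_TENSOR_ADD macro (the BLAS-backed TensorAddFunctor), while the new FP16 branch dispatches to the Eigen-based ElementwiseAddTo via TensorAddImpl, selecting CUDADeviceContext or CPUDeviceContext from the place. A rough standalone sketch of that dispatch shape, using stand-in enums rather than Paddle's VarType and place types:

#include <stdexcept>

// Stand-ins for framework::proto::VarType and Paddle's place checks; the real
// code switches on data_type and platform::is_gpu_place / is_cpu_place.
enum class DType { FP32, FP64, FP16 };
enum class Place { CPU, GPU, CUDAPinned };

void BlasAxpyAdd() { /* existing PADDLE_TENSOR_ADD path for FP32/FP64 */ }
void EigenAddGPU() { /* TensorAddImpl<CUDADeviceContext, float16> */ }
void EigenAddCPU() { /* TensorAddImpl<CPUDeviceContext, float16> */ }

void TensorAddDispatchSketch(DType dtype, Place place) {
  if (dtype == DType::FP32 || dtype == DType::FP64) {
    BlasAxpyAdd();
    return;
  }
  if (dtype == DType::FP16) {  // branch added by this patch
    if (place == Place::GPU) {
      EigenAddGPU();           // only available when built with PADDLE_WITH_CUDA
      return;
    }
    if (place == Place::CPU) {
      EigenAddCPU();
      return;
    }
  }
  // anything else (e.g. CUDAPinned, or an unsupported dtype) throws
  throw std::runtime_error("gradient accumulation not supported for this dtype/place");
}

int main() {
  TensorAddDispatchSketch(DType::FP16, Place::CPU);  // takes the new FP16 CPU branch
  return 0;
}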
@@ -13,6 +13,7 @@
// limitations under the License.
#include <memory>
#include <type_traits>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h"
@@ -27,9 +28,8 @@ namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA)
template <typename T>
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
template <typename Place, typename T>
int TensorddTest(Place place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
@@ -39,6 +39,7 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
@@ -46,44 +47,19 @@ int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
if (!std::is_same<Place, platform::CUDAPlace>::value) {
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
#if defined(PADDLE_WITH_CUDA)
} else {
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
#endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
@@ -92,6 +68,7 @@ int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
@@ -102,18 +79,38 @@ TEST(test_add_functor, add_functor) {
platform::CPUPlace cpu_place;
int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0));
EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA)
int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
TEST(test_add_functor, execption) {
platform::CUDAPinnedPlace cuda_pinned_place;
platform::CUDAPlace cuda_place(0);
platform::CPUPlace cpu_place;
ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
#if defined(PADDLE_WITH_CUDA)
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0)));
#endif
}
......
@@ -146,6 +146,19 @@ template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
@@ -148,6 +148,19 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
template struct RowwiseMean<platform::CUDADeviceContext, float>;
template struct RowwiseMean<platform::CUDADeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
void operator()(platform::CUDADeviceContext* ctx,
const framework::Tensor& src, framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
@@ -51,6 +51,13 @@ struct RowwiseAdd {
const framework::Tensor& vec, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
struct ElementwiseAddTo {
// dst = dst + src
void operator()(DeviceContext* ctx, const framework::Tensor& src,
framework::Tensor* dst);
};
template <typename DeviceContext, typename T>
struct ColwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
......