Unverified · Commit 194f3dcf authored by qingqing01, committed by GitHub

Support fp16 in GPU impl of fused_elemwise_activation_op. (#20636) (#20655)

* Support fp16 in fused_elemwise_activation_op.
* Fix unit tests in CPU-only mode.
Parent ddcb81d1
@@ -20,11 +20,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                        float>,
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
+                                       double>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       paddle::platform::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     fused_elemwise_activation_grad,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                            float>,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
+                                           double>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           paddle::platform::float16>);
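Context for the two hunks above: REGISTER_OP_CUDA_KERNEL registers one kernel instantiation per supported dtype, so enabling float16 means appending one more template instantiation to both the forward and the gradient registration. As a rough illustration of that per-dtype registration idea (a toy registry with made-up names, not Paddle's actual macro expansion):

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Toy registry: maps a dtype name to a kernel entry point.
using Kernel = std::function<void()>;

std::map<std::string, Kernel>& Registry() {
  static std::map<std::string, Kernel> registry;
  return registry;
}

// Stand-in for a templated kernel; the dtype parameter decides which
// instantiation runs.
template <typename T>
void RunFusedElemwiseActivation() {
  std::cout << "kernel instantiated for a " << sizeof(T) << "-byte type\n";
}

int main() {
  // One registration per supported dtype -- adding float16 support means
  // adding one more entry, which is what the diff above does.
  Registry()["float32"] = RunFusedElemwiseActivation<float>;
  Registry()["float64"] = RunFusedElemwiseActivation<double>;
  Registry()["float16"] = RunFusedElemwiseActivation<short>;  // stand-in for a real half type
  Registry()["float16"]();  // dispatch by dtype at run time
  return 0;
}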
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/math.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,8 +42,8 @@ struct AddFunctor {
 
 template <typename T>
 struct AddGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
-  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
+  inline HOSTDEVICE T Dx(T x, T y) { return static_cast<T>(1.); }
+  inline HOSTDEVICE T Dy(T x, T y) { return static_cast<T>(1.); }
 };
 
 template <typename T>
@@ -68,14 +70,22 @@ struct ScaleGradFunctor {
 
 template <typename T>
 struct ReluFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+  inline HOSTDEVICE T operator()(T x) {
+    return x * (x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0));
+  }
 };
 
 template <typename T>
 struct ReluGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseX(T x) {
+    return x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseOut(T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
 };
 
 template <typename T>
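The static_cast<T> rewrites above are what let these functors compile once T can be paddle::platform::float16: a class-based half type generally has no implicit conversion from plain int literals, so expressions such as `return 1;` or `x > 0` no longer type-check. A minimal sketch of the issue, using a hypothetical Half wrapper rather than Paddle's real float16:

#include <iostream>

// "Half" is a hypothetical stand-in for a float16-style class type: its
// constructor and operators only accept Half, never plain int literals.
struct Half {
  float v;
  explicit Half(float f) : v(f) {}
  Half operator*(Half o) const { return Half(v * o.v); }
  bool operator>(Half o) const { return v > o.v; }
};

template <typename T>
struct AddGradFunctor {
  // "return 1;" would need an implicit int -> T conversion, which a
  // half-precision class type does not provide; the cast keeps the
  // functor compilable for every T.
  T Dx(T x, T y) const { return static_cast<T>(1.); }
  T Dy(T x, T y) const { return static_cast<T>(1.); }
};

template <typename T>
struct ReluGradFunctor {
  // Same idea for comparisons: "x > 0" would not compile for T = Half.
  T UseX(T x) const {
    return x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
  }
};

int main() {
  AddGradFunctor<float> add_f32;
  AddGradFunctor<Half> add_f16;
  ReluGradFunctor<Half> relu_f16;
  std::cout << add_f32.Dx(2.0f, 3.0f) << " "
            << add_f16.Dx(Half(2.0f), Half(3.0f)).v << " "
            << relu_f16.UseX(Half(-1.0f)).v << "\n";  // prints "1 1 0"
  return 0;
}

Writing every literal through static_cast<T> keeps the same functor source valid for float, double, and half alike.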
@@ -84,9 +94,9 @@ struct TanhFunctor {
   const T kMax = static_cast<T>(13);
   inline HOSTDEVICE T operator()(T x) {
     // y = 2 / (1 + e^-2x) - 1
-    T t0 = 2 * x;
+    T t0 = static_cast<T>(2) * x;
     T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) / (static_cast<T>(1) + std::exp(-t1)) -
+    return static_cast<T>(2) / (static_cast<T>(1) + real_exp(-t1)) -
            static_cast<T>(1);
   }
 };
@@ -107,7 +117,7 @@ struct SigmoidFunctor {
   inline HOSTDEVICE T operator()(T x) {
     // y = 1 / (1 + e^-x)
     T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+    return static_cast<T>(1) / (static_cast<T>(1) + real_exp(-tmp));
   }
 };
 
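The other half of the change swaps std::exp for real_exp and includes paddle/fluid/operators/math.h for it. That header is not reproduced here; the sketch below only shows the overload-dispatch idea it presumably relies on (each T, including half precision, gets an exp it can actually use), again with a stand-in Half type:

#include <cmath>
#include <iostream>

// Hypothetical half type standing in for paddle::platform::float16.
struct Half {
  float v;
  explicit Half(float f) : v(f) {}
  Half operator-() const { return Half(-v); }  // the functors call real_exp(-t1)
};

// Overload set in the spirit of real_exp: pick an exp implementation that
// exists for each T, routing half precision through a float round trip.
inline float real_exp(float x) { return std::expf(x); }
inline double real_exp(double x) { return std::exp(x); }
inline Half real_exp(Half x) { return Half(std::expf(x.v)); }

int main() {
  std::cout << real_exp(1.0f) << " " << real_exp(1.0) << " "
            << real_exp(-Half(1.0f)).v << "\n";
  return 0;
}

In the functors above, TanhFunctor and SigmoidFunctor call this on the clamped argument, so the same source line works for all three registered kernel dtypes.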
@@ -33,17 +33,24 @@ from op_test import OpTest
 # TestFusedElementwiseActivationOp_channelwise_add
 
 
-def create_test_class(test_case, callback, attrs):
+def create_test_class(test_case,
+                      callback,
+                      attrs,
+                      dtype=np.float32,
+                      grad_chek=True):
     class TestFusedElementwiseActivationOp_base(OpTest):
         def setUp(self):
             self.op_type = "fused_elemwise_activation"
-            self.dtype = np.float32
+            self.dtype = dtype
             self.axis = -1
 
             self.init_input()
             self.init_output()
             self.init_attr()
 
+            self.out = self.out.astype(self.dtype)
+            self.intermediate_out = self.intermediate_out.astype(self.dtype)
+
             self.inputs = {
                 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
                 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
@@ -71,16 +78,25 @@ def create_test_class(test_case, callback, attrs):
                 self.attrs[key] = attrs[key]
 
         def test_check_output(self):
-            self.check_output()
+            if self.dtype == np.float16 and core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=1e-3)
+            else:
+                self.check_output()
 
         # FIXME(zcd): the intermediate_out_grad is not checked.
         def test_check_grad_normal(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
             else:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
 
         def test_check_grad_ingore_x(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['Y'], ['Out'],
@@ -93,6 +109,8 @@ def create_test_class(test_case, callback, attrs):
                     no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['X'], ['Out'],
@@ -307,11 +325,29 @@ for mode in {0, 1}:
             'functor_list': ["scale", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'scale_add_fp16' + suffix,
+            scale_add_func, {
+                'scale': scale,
+                'functor_list': ["scale", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_scale' + suffix, add_scale_func, {
             'scale': scale,
             'functor_list': ["elementwise_add", "scale"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_scale_fp16' + suffix,
+            add_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_add", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_relu' + suffix, add_relu_func, {
             'functor_list': ["elementwise_add", "relu"],
             'save_intermediate_out': save_intermediate_out,
@@ -320,11 +356,36 @@ for mode in {0, 1}:
             'functor_list': ["relu", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_relu_fp16' + suffix,
+            add_relu_func, {
+                'functor_list': ["elementwise_add", "relu"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
+        create_test_class(
+            'relu_add_fp16' + suffix,
+            relu_add_func, {
+                'functor_list': ["relu", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('mul_scale' + suffix, mul_scale_func, {
             'scale': scale,
             'functor_list': ["elementwise_mul", "scale"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'mul_scale' + suffix,
+            mul_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_mul", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
 
 if __name__ == '__main__':
     unittest.main()