diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 1074ed6acc22a81f46c466d917ef973945a12898..e4436549f6185ba04a5f270893596a6dcb11e89b 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
   }
 };
 
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
+            ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index d6f9c04359d733cb4f3f0586e9239ee67deb7078..f6c85a2a537b37feb20e6d62729dc5075af2a5d9 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -18,17 +18,18 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                 T* mask_data, T* dst) {
   thrust::minstd_rand rng;
   rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
@@ -44,14 +45,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
     auto* y = context.Output<Tensor>("Out");
     y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
 
     auto X = EigenMatrix<T>::Reshape(*x, 1);
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +71,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 
       int threads = 512;
       int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-                                     context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
           size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
 };
@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 209e4dec1756dc65fbf147c4dbbf0913d3c6ef7e..b5ee86ae2d11dfc835e1a3a6826ce016baf38a29 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 52fb8c2531357ad7a2b2f8613e5c7fbcef52c6bb..d3312a47f479160439d720dd993ee25a56d732fe 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -483,8 +483,123 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 
 #endif  // PADDLE_CUDA_FP16
 
-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16)
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hadd(half(a), half(b)));
+#else
+  return float16(float(a) + float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hsub(half(a), half(b)));
+#else
+  return float16(float(a) - float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hmul(half(a), half(b)));
+#else
+  return float16(float(a) * float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  // TODO(kexinzhao): check which cuda version starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+#else
+  return float16(float(a) / float(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hneg(half(a)));
+#else
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+#endif
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(half(a), half(b));
+#else
+  return float(a) == float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(half(a), half(b));
+#else
+  return float(a) != float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(half(a), half(b));
+#else
+  return float(a) < float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(half(a), half(b));
+#else
+  return float(a) <= float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(half(a), half(b));
+#else
+  return float(a) > float(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(half(a), half(b));
+#else
+  return float(a) >= float(b);
+#endif
+}
+
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -668,71 +783,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU
 #else
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+HOST inline float16 operator+(const float16& a, const float16& b) {
   return float16(float(a) + float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+HOST inline float16 operator-(const float16& a, const float16& b) {
   return float16(float(a) - float(b));
 }
 
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+HOST inline float16 operator*(const float16& a, const float16& b) {
   return float16(float(a) * float(b));
 }
 
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+HOST inline float16 operator/(const float16& a, const float16& b) {
   return float16(float(a) / float(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a) {
+HOST inline float16 operator-(const float16& a) {
   float16 res;
   res.x = a.x ^ 0x8000;
   return res;
 }
 
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+HOST inline float16& operator+=(float16& a, const float16& b) {
   a = float16(float(a) + float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+HOST inline float16& operator-=(float16& a, const float16& b) {
   a = float16(float(a) - float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+HOST inline float16& operator*=(float16& a, const float16& b) {
   a = float16(float(a) * float(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+HOST inline float16& operator/=(float16& a, const float16& b) {
   a = float16(float(a) / float(b));
   return a;
 }
 
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+HOST inline bool operator==(const float16& a, const float16& b) {
   return float(a) == float(b);
 }
 
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+HOST inline bool operator!=(const float16& a, const float16& b) {
   return float(a) != float(b);
 }
 
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+HOST inline bool operator<(const float16& a, const float16& b) {
   return float(a) < float(b);
 }
 
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+HOST inline bool operator<=(const float16& a, const float16& b) {
   return float(a) <= float(b);
 }
 
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+HOST inline bool operator>(const float16& a, const float16& b) {
   return float(a) > float(b);
 }
 
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+HOST inline bool operator>=(const float16& a, const float16& b) {
   return float(a) >= float(b);
 }
 #endif
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 60930a612c128cbf18e89711b9246d148e41ec58..eaa3435a86462236a99489749abe877648677053 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest):
         self.check_output()
 
 
+class TestFP16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.init_test_case()
+
+        x = np.random.random(self.input_size).astype("float16")
+        out = x * (1.0 - self.prob)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {
+            'dropout_prob': self.prob,
+            'fix_seed': self.fix_seed,
+            'is_test': True
+        }
+        self.outputs = {'Out': out}
+
+    def init_test_case(self):
+        self.input_size = [32, 64]
+        self.prob = 0.35
+        self.fix_seed = True
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
+            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
+
+
+class TestFP16DropoutOp2(TestFP16DropoutOp):
+    def init_test_case(self):
+        self.input_size = [32, 64, 3]
+        self.prob = 0.75
+        self.fix_seed = False
+
+
 if __name__ == '__main__':
     unittest.main()
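
For reference, the check performed by the new FP16 tests can be reproduced outside Paddle. The snippet below is a standalone NumPy-only sketch (not part of the patch): with is_test set, dropout reduces to scaling the input by 1 - dropout_prob, and float16 rounding keeps the result within the atol=1e-3 tolerance that TestFP16DropoutOp passes to check_output_with_place. The shape and probability values are taken from the test; everything else is illustrative.

# Standalone sketch (plain NumPy, not Paddle code): mirrors the reference
# computation behind TestFP16DropoutOp and shows why atol=1e-3 is a
# reasonable tolerance for the float16 kernel output.
import numpy as np

dropout_prob = 0.35                                # same value as the test
x = np.random.random([32, 64]).astype("float16")   # same shape as the test

# With is_test=True, dropout only rescales the input by (1 - dropout_prob).
expected = x.astype("float32") * (1.0 - dropout_prob)

# The same expression evaluated in float16, then widened for comparison.
fp16_out = (x * np.float16(1.0 - dropout_prob)).astype("float32")

# float16 carries roughly 3 decimal digits, so the error stays below 1e-3.
assert np.allclose(fp16_out, expected, atol=1e-3)
print("max abs diff:", np.abs(fp16_out - expected).max())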