diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 2033081af224a4e938a7b4f0f619729feea57506..29498da0f026f5926d29ef924e442121a06a43d8 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -10,8 +10,276 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/math_cuda_utils.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using float16 = paddle::platform::float16;
+
+template <typename T>
+struct CudaVecType {
+  using type = T;
+  static constexpr int vecsize = 1;
+};
+
+template <>
+struct CudaVecType<float16> {
+  using type = __half2;
+  static constexpr int vecsize = 2;
+};
+
+template <>
+struct CudaVecType<float> {
+  using type = float4;
+  static constexpr int vecsize = 4;
+};
+
+template <typename T>
+class BaseGPUFunctor {
+ public:
+  using ELEMENT_TYPE = T;
+};
+
+/* ========================================================================== */
+
+/* ===========================   relu forward   ============================ */
+template <typename T>
+class ReluGPUFuctor : public BaseGPUFunctor<T> {
+ private:
+  T zero_;
+
+ public:
+  ReluGPUFuctor() { zero_ = static_cast<T>(0.0f); }
+
+  // vectorized relu forward; specialized below for double, float and float16
+  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
+      const typename CudaVecType<T>::type* x);
+
+  // used for the tail elements when num % vecsize != 0
+  __device__ __forceinline__ T ComputeRemainder(const T x) {
+    return x > zero_ ? x : zero_;
+  }
+};
+
+template <>
+__device__ __forceinline__ CudaVecType<double>::type
+ReluGPUFuctor<double>::Compute(const CudaVecType<double>::type* x) {
+// relu forward : out = max(x, 0)
+#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  return __ldg(x) > zero_ ? __ldg(x) : zero_;
+#else
+  return (*x) > zero_ ? (*x) : zero_;
+#endif
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float>::type
+ReluGPUFuctor<float>::Compute(const CudaVecType<float>::type* xx) {
+  // relu forward : out = max(xx, 0)
+  return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y),
+                     (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w));
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float16>::type
+ReluGPUFuctor<float16>::Compute(const CudaVecType<float16>::type* in) {
+// relu forward : out = max(in, 0)
+#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  const half2 kzero = __float2half2_rn(0.0f);
+  return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in));
+#else
+  const float2 xx = __half22float2(*in);
+  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(xx.x),
+                           (xx.y > 0.0f) * static_cast<float>(xx.y));
+#endif
+}
+/* ========================================================================== */
+
+/* ===========================   relu backward   ============================
+ */
+
+template <typename T>
+class ReluGradGPUFunctor : public BaseGPUFunctor<T> {
+ private:
+  T zero_;
+
+ public:
+  ReluGradGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+
+  // vectorized relu backward; specialized below for double, float and float16
+  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
+      const typename CudaVecType<T>::type* out,
+      const typename CudaVecType<T>::type* dout);
+
+  // used for the tail elements when num % vecsize != 0
+  __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) {
+    // relu backward : dx = out > 0 ? dout : 0;
+    return out > zero_ ? dout : zero_;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+
+template <>
+__device__ __forceinline__ CudaVecType<double>::type
+ReluGradGPUFunctor<double>::Compute(const CudaVecType<double>::type* out,
+                                    const CudaVecType<double>::type* dout) {
+// relu backward : dx = out > 0 ? dout : 0;
+#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  return __ldg(out) > zero_ ? __ldg(dout) : zero_;
+#else
+  return (*out) > zero_ ? (*dout) : zero_;
+#endif
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float>::type
+ReluGradGPUFunctor<float>::Compute(const CudaVecType<float>::type* out,
+                                   const CudaVecType<float>::type* dout) {
+  // relu backward : dx = out > 0 ? dout : 0;
+  return make_float4((out->x > zero_) * (dout->x),
+                     (out->y > zero_) * (dout->y),
+                     (out->z > zero_) * (dout->z),
+                     (out->w > zero_) * (dout->w));
+}
+
+template <>
+__device__ __forceinline__ CudaVecType<float16>::type
+ReluGradGPUFunctor<float16>::Compute(const CudaVecType<float16>::type* out,
+                                     const CudaVecType<float16>::type* dout) {
+// relu backward : dx = out > 0 ? dout : 0;
+#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
+  const half2 kzero = __float2half2_rn(0.0f);
+  return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout));
+#else
+  const float2 xx = __half22float2(*out);
+  const float2 yy = __half22float2(*dout);
+  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(yy.x),
+                           (xx.y > 0.0f) * static_cast<float>(yy.y));
+#endif
+}
+
+/* ========================================================================== */
+
+template <typename T, typename Functor>
+__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout,
+                                        T* dx, int num, Functor functor) {
+  using VecType = typename CudaVecType<T>::type;
+  constexpr int vecsize = CudaVecType<T>::vecsize;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  int loop = num / vecsize;
+  int tail = num % vecsize;
+  const VecType* in_forward = reinterpret_cast<const VecType*>(forward_data);
+  const VecType* in_dout = reinterpret_cast<const VecType*>(dout);
+  VecType* out = reinterpret_cast<VecType*>(dx);
+
+  for (int i = idx; i < loop; i += stride) {
+    out[i] = functor.Compute((in_forward + i), (in_dout + i));
+  }
+
+  while (idx == loop && tail) {
+    dx[num - tail] =
+        functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]);
+    --tail;
+  }
+}
+
+template <typename T, typename Functor>
+__global__ void ActivationkernelVec(const T* src, T* dst, int num,
+                                    Functor functor) {
+  constexpr int vecsize = CudaVecType<T>::vecsize;
+  using VecType = typename CudaVecType<T>::type;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  int loop = num / vecsize;
+  int tail = num % vecsize;
+  const VecType* in = reinterpret_cast<const VecType*>(src);
+  VecType* out = reinterpret_cast<VecType*>(dst);
+
+  for (int i = idx; i < loop; i += stride) {
+    out[i] = functor.Compute((in + i));
+  }
+
+  while (idx == loop && tail) {
+    dst[num - tail] = functor.ComputeRemainder(src[num - tail]);
+    --tail;
+  }
+}
+
+template <typename DeviceContext, typename Functor>
+class ActivationGPUKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = nullptr;
+    framework::Tensor* out = nullptr;
+    ExtractActivationTensor(context, &in_x, &out);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    int num = in_x->numel();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>(
+        dev_ctx.GetPlace(), static_cast<size_t>(num * sizeof(T)));
+
+    int block = 512;
+#ifdef __HIPCC__
+    block = 256;
+#endif
+    Functor functor;
+    constexpr int vecsize = CudaVecType<T>::vecsize;
+    int grid = max((num / vecsize + block - 1) / block, 1);
+    ActivationkernelVec<T, Functor><<<grid, block, 0, dev_ctx.stream()>>>(
+        input_data, output_data, num, functor);
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class ActivationGradGPUKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor *x, *out, *d_out;
+    framework::Tensor* d_x = nullptr;
+    x = out = d_out = nullptr;
+    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &x, &out, &d_out,
+                                                    &d_x);
+    int numel = d_out->numel();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto* dx_data = d_x->mutable_data<T>(
+        dev_ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
+    auto* dout_data = d_out->data<T>();
+
+    auto* forward_data = dout_data;
+    if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
+      // Only need forward output Out
+      forward_data = out->data<T>();
+    } else if (static_cast<int>(Functor::FwdDeps()) ==
+               static_cast<int>(kDepX)) {
+      // Only need forward input X
+      forward_data = x->data<T>();
+    }
+
+    int block = 512;
+#ifdef __HIPCC__
+    block = 256;
+#endif
+    Functor functor;
+    constexpr int vecsize = CudaVecType<T>::vecsize;
+    int grid = max((numel / vecsize + block - 1) / block, 1);
+    ActivationGradKernelVec<T, Functor><<<grid, block, 0, dev_ctx.stream()>>>(
+        forward_data, dout_data, dx_data, numel, functor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
@@ -60,7 +328,21 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================   relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor);
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationGPUKernel<plat::CUDADeviceContext,
+                                   ops::ReluGPUFuctor<float>>,
+    ops::ActivationGPUKernel<plat::CUDADeviceContext,
+                             ops::ReluGPUFuctor<double>>,
+    ops::ActivationGPUKernel<plat::CUDADeviceContext,
+                             ops::ReluGPUFuctor<plat::float16>>);
+
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradGPUKernel<plat::CUDADeviceContext,
+                                            ops::ReluGradGPUFunctor<float>>,
+    ops::ActivationGradGPUKernel<plat::CUDADeviceContext,
+                                 ops::ReluGradGPUFunctor<double>>,
+    ops::ActivationGradGPUKernel<plat::CUDADeviceContext,
+                                 ops::ReluGradGPUFunctor<plat::float16>>);
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,