diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 11190309814e7c75777a6cddd7e4d24bfc7ba9e6..bf2cf58f970addf1dac9f4871ba4abe09c3c7b38 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -32,8 +32,9 @@ USE_OP(conv2d_transpose);
 USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, MKLDNN);
+PD_DECLARE_ARG_MAPPING_FN(gelu);

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
index ef2e83ced26e07f199a122ee3157eb428b63aec9..7df957b2c0eca64bacd1b48065f37ddffec1770a 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
@@ -18,6 +18,7 @@
 #include 
 #include 
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -27,10 +28,11 @@ USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP_ITSELF(leaky_relu);
 USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_ITSELF(relu);
 USE_OP_ITSELF(tanh);
 USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
+PD_DECLARE_ARG_MAPPING_FN(gelu);

 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 761b401ca9a2e535e1badfee834ef9ee98a07aae..d1a1aa3008c8b33690ecd9ea85501ad0178f592a 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -198,10 +198,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
                                          platform::EventRole::kUniqueOp);
       reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory);
-    }
-
-    // elementwise_mul & elementwise_div
-    else {
+    } else {  // elementwise_mul & elementwise_div
       platform::BinaryMKLDNNHandler<T> binary_handler(
           BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f,
           1.0f, 1.0f);
@@ -253,10 +250,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
       } else {
         broadcast_src_memory = reorder_src_memory_p;
       }
-    }
-
-    // elementwise_mul & elementwise_div
-    else {
+    } else {  // elementwise_mul & elementwise_div
       std::unordered_map<int, dnnl::memory> args;
       std::shared_ptr<dnnl::binary> binary_prim;
       std::shared_ptr<dnnl::memory> post_op_memory;
diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc
index 3d338f00d4fcbf4be35b2392a10c275526dc5d4b..3be2606bfc93984f918adf595b522fe6bfca72be 100644
--- a/paddle/fluid/operators/gelu_op.cc
+++ b/paddle/fluid/operators/gelu_op.cc
@@ -14,10 +14,11 @@ limitations under the License. */

 #include 
 #include 
-#include 
-
-#include "paddle/fluid/operators/gelu_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"

 namespace paddle {
 namespace operators {
@@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel {
                  const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}

-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(%s) of GeluOp should not be null.", "X"));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(%s) of GeluOp should not be null.", "Out"));
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker<T> {

 namespace ops = paddle::operators;

+DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker,
                   ops::GeluGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GeluGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GeluGradOpMaker<paddle::imperative::OpBase>,
+                  GeluInferShapeFunctor);
 REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp);
-REGISTER_OP_CPU_KERNEL(
-    gelu, ops::GeluKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GeluKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    gelu_grad, ops::GeluGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GeluGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
deleted file mode 100644
index ef836ab72f001a540e081d7e9975ca5ee28758be..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/gelu_op.cu
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - 
return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + 
x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868ff9e66f64c90ab9352e824ab673217..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -#define GELU_CONSTANT 0.044715 - -template -struct GeluFunctor { - template - void operator()(Device d, X x, Out out, bool approximate) const { - if (approximate) { - // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = - (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) - .tanh(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (x + static_cast(GELU_CONSTANT) * x.cube())) - .tanh(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto out_data = out.data(); - int n = std::min(x.size(), out.size()); - - std::memset(out_data, 0, n * sizeof(T)); - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, - out_data, 1); - phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); - for (int i = 0; i < n; i++) { - out_data[i] += static_cast(1); - } - phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); - for (int i = 0; i < n; i++) { - out_data[i] *= static_cast(0.5); - } -#else - // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } -#endif - } - } -}; - -template -struct GeluGradFunctor { - template - void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { - if (approximate) { - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - - const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const float kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * - ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) - .tanh(); - dx.device(d) = (static_cast(0.5) * casted_dout * - (static_cast(1) + y + - (casted_x - casted_x * y.square()) * - (kAlpha + kBeta * casted_x.square()))) - .template cast(); - } else { - const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const T kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); - dx.device(d) = static_cast(0.5) * dout * - (static_cast(1) + y + - (x - x * y.square()) * (kAlpha + kBeta * x.square())); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && 
!defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto dx_data = dx.data(); - auto dout_data = dout.data(); - int n = std::min(x.size(), dx.size()); - - auto first = static_cast(std::malloc(n * sizeof(T))); - std::memset(first, 0, n * sizeof(T)); - auto second = static_cast(std::malloc(n * sizeof(T))); - std::memset(second, 0, n * sizeof(T)); - - // first = (0.5 * (1 + erf(x / sqrt(2)))) - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, first, - 1); - phi::funcs::CBlas::VMERF(n, first, first, VML_LA); - for (int i = 0; i < n; i++) { - first[i] += static_cast(1); - } - phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); - - // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) - phi::funcs::CBlas::VSQUARE(n, x_data, second); - phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); - phi::funcs::CBlas::VEXP(n, second, second); - phi::funcs::CBlas::VMUL(n, x_data, second, second); - phi::funcs::CBlas::SCAL( - n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); - - // dx = dout * (first + second); - phi::funcs::CBlas::VADD(n, first, second, first); - phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); - - std::free(first); - std::free(second); -#else - // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * - // exp(- x^2 / 2) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - auto first = static_cast(0.5) * - (static_cast(1) + - ((casted_x * static_cast(M_SQRT1_2)).erf())); - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * - casted_x * - (-static_cast(0.5) * casted_x.square()).exp(); - dx.device(d) = (casted_dout * (first + second)).template cast(); - } else { - auto first = - static_cast(0.5) * - (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); - - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - (-static_cast(0.5) * x.square()).exp(); - dx.device(d) = dout * (first + second); - } -#endif - } - } -}; - -template -class GeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - - GeluFunctor functor; - functor(place, eigen_in, eigen_out, approximate); - } -}; - -template -class GeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - - GeluGradFunctor functor; - functor(place, eigen_x, eigen_dout, eigen_dx, approximate); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 
18bbc7f4929c6493db9161d0415c0728eb8689c0..c5297dd9cd404b7637c2eec79dafcc027509ddcb 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include 
 #include 
-#include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index f3ac53138328dbfad12c6d530a6517f40c658677..b132b3170756d95adfde51e6d6ce7a5f0f25ca26 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, NPU);

 template <typename T>
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index b8c2e9becf2950d12f87ec5d61c05f3bf0010b12..559d2448ad94525d623e24fc8fb6c5e3881b58e3 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 #include 
 #include 
-
-#include "paddle/fluid/operators/gelu_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
index 0e69b397e04c7eda7f515350caf870be5d7b57a5..da38f906b9bd34ba6c3251059ee12902e62eadaf 100644
--- a/paddle/fluid/operators/log_softmax_op.cc
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/log_softmax_op.h"
 #include 
 #include 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"

 namespace paddle {
 namespace operators {
@@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    return UnaryOpUnchangedInferShapeCheckAxis(ctx);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-
+DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMetaCheckAxis));
 REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker,
                   ops::LogSoftmaxOpInferVarType,
                   ops::LogSoftmaxGradOpMaker<paddle::framework::OpDesc>,
-                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>);
+                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>,
+                  LogSoftmaxInferShapeFunctor);
 REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    log_softmax,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    log_softmax_grad,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
deleted file mode 100644
index 26b6ce43303d181c41b60cf36c229d00acb0e626..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/log_softmax_op.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class LogSoftmaxKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, - input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#endif diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a75662d711a63cbbe4beeaecf265367c6a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? 
kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. - if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. 
- Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac785b82662cebb84e8224cec78ecf6..6ce21aec9215a007ac6ca49ee1bffc1a40d40c81 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. 
dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 4d2a2e23b3f70dc48029be2e0a79c9695881b519..de35f67405810180554bfd556f91b7501f9c4ba2 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + framework::OpKernelType innerGetKernelTypeForVar( const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { #ifdef PADDLE_WITH_MKLDNN @@ -44,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. " - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW" || ctx->IsRunMKLDNNKernel()) { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. 
But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -268,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d7037606179cc06480e4f80f942d8a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c2a955c383ddeebafe5f0e64d3c961..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b7a7a4ec231ddfdbfd4da75e71aebaa49f99443f..f09e87894784d16a96c0c507acb434c60f6aa054 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -918,6 +918,103 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + if (mode == "all") { + PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + 1, + phi::errors::InvalidArgument( + "For mode 'all', size of weight Alpha must be one. " + "But recevied alpha's size: %d.", + product(alpha.dims()))); + } else if (mode == "channel") { + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 2, + phi::errors::InvalidArgument( + "For mode 'channel', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "For mode 'channel', data_format must be one of " + "NCHW and NHWC. But recevied data_format: %s", + data_format)); + if (data_format == "NCHW" || config.is_run_mkldnn_kernel) { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[1]: %d", + product(alpha.dims()), + x_dim[1])); + } else { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[%d]: %d", + product(alpha.dims()), + x_rank - 1, + x_dim[x_rank - 1])); + } + } else if (mode == "element") { + auto alpha_dim = alpha.dims(); + auto alpha_rank = alpha_dim.size(); + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 1, + phi::errors::InvalidArgument( + "For mode 'element', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ( + alpha_rank, + x_rank, + phi::errors::InvalidArgument( + "For mode 'element', rank of weight Alpha must be ", + "equal to the rank of input(x). 
But recevied alpha's rank: %d, " + "x's rank: %d.", + alpha_rank, + x_rank)); + size_t x_product = 1; + size_t alpha_product = 1; + for (int64_t i = x_rank - 1; i > 0; i--) { + x_product *= x_dim[i]; + alpha_product *= alpha_dim[i]; + } + PADDLE_ENFORCE_EQ( + alpha_product, + x_product, + phi::errors::InvalidArgument( + "For mode 'element', the size of weight Alpha must be " + "equal to the size of input(x). But recevied alpha's size: %d, " + "x's size: %d.", + alpha_product, + x_product)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " + "But recevied " + "mode: '%s'.", + mode)); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cb680415e7d2c42de7b2339b27b22be500dfdf9b..cb7a83f39a45494644a7a23d3446c8dc5d37947f 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -146,6 +146,13 @@ void MatmulInferMeta(const MetaTensor& x, void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index baa5b39670f3795c738acc8df0adb6a1a00ee67f..03029550c2afa46b676369ba2f7ac41a98dc118a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1650,7 +1650,7 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, PADDLE_ENFORCE_GE( axis, -rank, - errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). But received axis: %d, R: %d.", axis, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..254c4ea5716d19c65da6a46748a43db8dbddd52b --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gelu_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gelu_kernel.h" + +namespace phi { + +template +struct GeluGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { + if (approximate) { + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + + const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const float kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) + .tanh(); + dx.device(d) = (static_cast(0.5) * casted_dout * + (static_cast(1) + y + + (casted_x - casted_x * y.square()) * + (kAlpha + kBeta * casted_x.square()))) + .template cast(); + } else { + const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); + dx.device(d) = static_cast(0.5) * dout * + (static_cast(1) + y + + (x - x * y.square()) * (kAlpha + kBeta * x.square())); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto dx_data = dx.data(); + auto dout_data = dout.data(); + int n = std::min(x.size(), dx.size()); + + auto first = static_cast(std::malloc(n * sizeof(T))); + std::memset(first, 0, n * sizeof(T)); + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + // first = (0.5 * (1 + erf(x / sqrt(2)))) + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, first, 1); + phi::funcs::CBlas::VMERF(n, first, first, VML_LA); + for (int i = 0; i < n; i++) { + first[i] += static_cast(1); + } + phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); + + // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) + phi::funcs::CBlas::VSQUARE(n, x_data, second); + phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); + phi::funcs::CBlas::VEXP(n, second, second); + phi::funcs::CBlas::VMUL(n, x_data, second, second); + phi::funcs::CBlas::SCAL( + n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); + + // dx = dout * (first + second); + phi::funcs::CBlas::VADD(n, first, second, first); + phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); + + std::free(first); + std::free(second); +#else + // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * + // exp(- x^2 / 2) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + auto first = static_cast(0.5) * + (static_cast(1) + + ((casted_x * static_cast(M_SQRT1_2)).erf())); + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * + casted_x * + (-static_cast(0.5) * casted_x.square()).exp(); + dx.device(d) = (casted_dout * (first + second)).template cast(); + } else { + auto first = + static_cast(0.5) * + (static_cast(1) + ((x * 
static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } +#endif + } + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + auto eigen_x_grad = EigenVector::Flatten(*x_grad); + auto& dev = *dev_ctx.eigen_device(); + + GeluGradFunctor functor; + functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7af220574565ea96706c2a87aec6751c9203af4 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +struct GeluFunctor { + template + void operator()(Device d, X x, Out out, bool approximate) const { + if (approximate) { + // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = + (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) + .tanh(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (x + static_cast(GELU_CONSTANT) * x.cube())) + .tanh(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + // gelu(x) = 0.5 * x * (1 + 
erf(x / sqrt(2))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +#endif + } + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_x = EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + + GeluFunctor functor; + functor(dev, eigen_x, eigen_out, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f344b9cc3fe0a4c71470c361f2e8f370bc5908a --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct LogSoftmaxGradFunctor { + void operator()(const Context& context, + const DenseTensor* Y, + const DenseTensor* dY, + DenseTensor* dX, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int n = funcs::SizeToAxis(axis, Y->dims()); + const int d = funcs::SizeFromAxis(axis, Y->dims()); + phi::DDim dim_2d{n, d}; + + auto y = EigenMatrixTemplate::From(*Y, dim_2d); + auto dy = EigenMatrixTemplate::From(*dY, dim_2d); + auto dx = EigenMatrixTemplate::From(*dX, dim_2d); + + const int axis_dim = Y->dims()[axis]; + const int batch_size = y.dimension(kBatchDim); + const int num_classes = y.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + dx.device(*context.eigen_device()) = + dy - + (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); + } +}; + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(x_grad); + if (out.numel() != 0) { + LogSoftmaxGradFunctor()( + dev_ctx, &out, &out_grad, x_grad, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax_grad, + CPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..241742378cc5d012d2816745d0f83fc586089ef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +struct LogSoftmaxFunctor { + void operator()(const Context& context, + const DenseTensor* X, + DenseTensor* Y, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + int axis_dim = X->dims()[axis]; + const int n = funcs::SizeToAxis(axis, X->dims()); + const int d = funcs::SizeFromAxis(axis, X->dims()); + phi::DDim dim_2d{n, d}; + + auto logits = EigenMatrixTemplate::From(*X, dim_2d); + auto log_softmax = EigenMatrixTemplate::From(*Y, dim_2d); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into log_softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + log_softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + log_softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + log_softmax.device(*context.eigen_device()) = + log_softmax - + log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(out); + if (x.numel() != 0) { + LogSoftmaxFunctor()(dev_ctx, &x, out, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..97558cdb31f666fd7c5dd8b15e1d7feef6556a0b --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + const T* alpha_ptr = alpha.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (x_grad) { + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i]; + } + } + } + + index = 0; + if (alpha_grad) { + T* alpha_grad_ptr = dev_ctx.template Alloc(alpha_grad); + memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel()); + + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f389ab9ff459d1935518f35e7884d144bec5020 --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {} diff --git a/paddle/phi/kernels/gelu_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fd70e8d54bc8d004373efd1874f4b07a9ebde6a8 --- /dev/null +++ b/paddle/phi/kernels/gelu_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/gelu_kernel.h b/paddle/phi/kernels/gelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bc106a04031fbcc2a96209e170d60eda8cc7b5e1 --- /dev/null +++ b/paddle/phi/kernels/gelu_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define GELU_CONSTANT 0.044715 + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..2b9be7c6154354f7fd20b316610521a02801243f --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +#ifdef __NVCC__ +template +static __device__ __forceinline__ float FP32FastTanh(float x) { +#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 + if (FastMode) { + float y; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); + return y; + } +#endif + return tanhf(x); +} + +template +static __device__ __forceinline__ float FP32GeluFwd(float x) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + return x * 0.5f * (1.0f + tanh_out); +} + +template +static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * + (0.79788456f + 0.1070322243f * x * x)) + + 0.5f * (1.0f + tanh_out); + return tmp * y_g; +} + +template +static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, + __half* y, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT in_arr = *reinterpret_cast(x + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + float tmp = __half2float(in_arr[i]); + in_arr[i] = __float2half(FP32GeluFwd(tmp)); + } + *reinterpret_cast(y + offset) = in_arr; + } +} + +template +static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * 
VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT x_in_arr = *reinterpret_cast(x + offset); + ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + __half2 tmp_fp16_2; + tmp_fp16_2.x = x_in_arr[i]; + tmp_fp16_2.y = y_g_in_arr[i]; + float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); + x_in_arr[i] = + __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); + } + *reinterpret_cast(x_g + offset) = x_in_arr; + } +} + +static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(y, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluFwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>(x, y, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL + return false; +} + +static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, + const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ + is_aligned(x_g, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluBwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>( \ + x, y_g, x_g, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL + return false; +} +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e21f8d4267bca5363d58b63e0a37d076b4d06af --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } +}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + std::vector ins = {&x, &out_grad}; + std::vector outs = {x_grad}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* x_ptr = reinterpret_cast(x.data()); + const auto* y_g_ptr = reinterpret_cast(out_grad.data()); + auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data()); + if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu_grad, + GPU, + ALL_LAYOUT, + phi::GeluGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce6dda2d6cc6526853cf563779cfe5ad1a21ffe1 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // this function is tanh approximation of gelu + MPType x = static_cast(arg_x); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); + return static_cast(out); + } +}; + +template +struct GeluWithoutApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // actual gelu with approximation = false + MPType x = static_cast(arg_x); + return static_cast(x * normcdf(x)); + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* in_ptr = reinterpret_cast(x.data()); + auto* out_ptr = reinterpret_cast<__half*>(out->data()); + if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + dev_ctx, in_ptr, out_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, + GPU, + ALL_LAYOUT, + phi::GeluKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7b282536558db524c082de11c7ca92b2bd98edc --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context &dev_ctx, + const DenseTensor &out, + const DenseTensor &out_grad, + int axis, + DenseTensor *x_grad) { + dev_ctx.template Alloc(x_grad); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7e34c6c14e7a49f50c016d888f6fb875dca0776 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + DenseTensor *out) { + dev_ctx.template Alloc(out); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..76ee9439a2050b000b5cffd1df47581141a874c7 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define CUDA_NUM_THREADS 1024 + +inline static int PADDLE_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelFirstWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t plane_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluChannelLastWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t channel_index = index % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, + const T *alpha, + T *output, + size_t spatial_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t element_index = index % spatial_size; + T scale = alpha[element_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, + const T *alpha, + T *output, + size_t numel) { + T scale = alpha[0]; + CUDA_KERNEL_LOOP(index, numel) { + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel); +}; + +template +void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel) { + if (channel_last) { + PReluChannelLastWiseKernel<<>>( + input, alpha, output, channel, numel); + } else { + PReluChannelFirstWiseKernel<<>>( + input, alpha, output, channel, numel / batch_size / channel, numel); + } +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel) { + PReluElementWiseKernel<<>>( + input, alpha, output, numel / batch_size, numel); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel) { + PReluScalarKernel<<>>( + input, alpha, output, numel); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class 
PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8661268e82c35f48d9877120574628c4325ae4e --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar }; + +template +__global__ void PReluOpGradKernel(const T* x_ptr, + const T* alpha_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + T* alpha_grad_ptr, + size_t channel_num, + size_t plane_size, + size_t spatial_size, + size_t numel, + PRELU_MODE mode) { + CUDA_KERNEL_LOOP(index, numel) { + T scale; + if (mode == Element) { + size_t element_index = index % spatial_size; + scale = alpha_ptr[element_index]; + } else if (mode == ChannelFirst) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + scale = alpha_ptr[channel_index]; + } else if (mode == ChannelLast) { + size_t channel_index = index % channel_num; + scale = alpha_ptr[channel_index]; + } else { + scale = alpha_ptr[0]; + } + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + if (x_grad_ptr != nullptr) + x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad; + if (alpha_grad_ptr != nullptr) + alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad; + } +} + +template +class PreluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* alpha, + const T* out_grad, + T* x_grad, + T* alpha_grad, + const DDim& input_dims, + PRELU_MODE mode) { + size_t numel = 1; + for (size_t i = 0; i < input_dims.size(); ++i) { + numel *= input_dims[i]; + } + size_t plane_size = numel / input_dims[0] / input_dims[1]; + size_t spatial_size = numel / input_dims[0]; + size_t channel = + mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; + + PReluOpGradKernel< + T><<>>( + x, + alpha, + out_grad, + x_grad, + alpha_grad, + channel, + plane_size, + spatial_size, + numel, + mode); + } +}; + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad ? dev_ctx.template Alloc(x_grad) : nullptr; + T* alpha_grad_ptr = + alpha_grad ? dev_ctx.template Alloc(alpha_grad) : nullptr; + + if (!x_grad && !alpha_grad) return; + + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + std::vector input_shape = phi::vectorize(dim); + auto stream = dev_ctx.stream(); + + T* alpha_grad_tmp_ptr; + DenseTensor alpha_grad_tmp; + if (alpha_grad_ptr == nullptr) { + alpha_grad_tmp_ptr = alpha_grad_ptr; + } else { + DenseTensorMeta alpha_grad_meta( + alpha_grad->dtype(), dim, alpha_grad->layout()); + alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta)); + alpha_grad_tmp_ptr = alpha_grad_tmp.data(); + } + + PRELU_MODE m; + bool channel_last = false; + if (mode == "element") { + m = Element; + } else if (mode == "channel") { + channel_last = data_format == "NHWC"; + m = channel_last ? ChannelLast : ChannelFirst; + } else { + m = PRELU_Scalar; + } + PreluOpGradFunctor prelu_grad; + prelu_grad(stream, + x_ptr, + alpha_ptr, + out_grad_ptr, + x_grad_ptr, + alpha_grad_tmp_ptr, + dim, + m); + + if (alpha_grad_tmp_ptr == nullptr) return; + + std::vector reduce_dims; + for (size_t i = 0; i < dim.size(); i++) { + if (mode == "channel" && !channel_last && i == 1) continue; + if (mode == "channel" && channel_last && i == dim.size() - 1) continue; + if (mode == "element" && i != 0) continue; + reduce_dims.push_back(i); + } + + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + alpha_grad_tmp, + alpha_grad, + kps::IdentityFunctor(), + reduce_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu_grad, + GPU, + ALL_LAYOUT, + phi::PReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8255a7ba2ed96dcdeb8d6e23a4637ce56d636a12 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + const T* alpha_ptr = alpha.data(); + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + + VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + + if (mode == "channel") { + bool channel_last = data_format == "NHWC"; + size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; + PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(dev_ctx.stream(), + x_ptr, + alpha_ptr, + o_ptr, + dim[0], + channel, + channel_last, + numel); + } else if (mode == "element") { + PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise( + dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + } else { + PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, + GPU, + ALL_LAYOUT, + phi::PReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6336bc14105bb55deacbfdc20a69a56c6ceca81a --- /dev/null +++ b/paddle/phi/kernels/log_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2caaa86d46c35888c5aaa944019c070f0dd64e17 --- /dev/null +++ b/paddle/phi/kernels/log_softmax_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..15917e2e1f02e896d12e971e7dfa52685f57a676 --- /dev/null +++ b/paddle/phi/kernels/prelu_grad_kernel.h @@ -0,0 +1,31 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..251332a8158dcbfa45cbb6c183e06789c21894db --- /dev/null +++ b/paddle/phi/kernels/prelu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf4b47bcf5fa9c1fb9d03f6b332c0c867211f5ac --- /dev/null +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"}); +} + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu_grad", + {"X", GradVarName("Out")}, + {"approximate"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1ecc6d56768f069c208a0230722929200f1dfe0 --- /dev/null +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, + phi::LogSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd296c5e95318332523a3cf07e85f1afd6f8a95c --- /dev/null +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prelu_grad", + {"X", "Alpha", GradVarName("Out")}, + {"mode", "data_format"}, + {GradVarName("X"), GradVarName("Alpha")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping);