From ece200b3c13aed849e0c462cec5a5f66e3320613 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Wed, 9 Feb 2022 07:55:17 +0800
Subject: [PATCH] Move norm to pten (#39324)

* add norm cpu

* update code;

* norm bug fix

* move norm op to pten; test=develop

* move norm op to pten; test=develop

* add norm util; test=develop

* fix norm npu bug; test=develop

* fix norm kernel bug; test=develop

* move kernel args to pten; test=develop

* move kernel args to pten sig; test=develop
---
 paddle/fluid/operators/norm_op.cc           |   6 +-
 paddle/fluid/operators/norm_op.cu           | 200 ------------------
 paddle/fluid/operators/norm_op.h            | 143 -------------
 paddle/fluid/operators/norm_op_npu.cc       |   2 +-
 paddle/pten/core/compat/arg_map_context.h   |  12 ++
 paddle/pten/kernels/cpu/norm_grad_kernel.cc |  87 ++++++++
 paddle/pten/kernels/cpu/norm_kernel.cc      |  79 +++++++
 paddle/pten/kernels/funcs/common_shape.h    |  13 ++
 paddle/pten/kernels/gpu/norm_grad_kernel.cu | 120 +++++++++++
 paddle/pten/kernels/gpu/norm_kernel.cu      | 133 ++++++++++++
 paddle/pten/kernels/norm_grad_kernel.h      |  31 +++
 paddle/pten/kernels/norm_kernel.h           |  30 +++
 paddle/pten/ops/compat/norm_sig.cc          |  34 +++
 .../fluid/tests/unittests/test_norm_op.py   |   2 +
 14 files changed, 543 insertions(+), 349 deletions(-)
 delete mode 100644 paddle/fluid/operators/norm_op.cu
 delete mode 100644 paddle/fluid/operators/norm_op.h
 create mode 100644 paddle/pten/kernels/cpu/norm_grad_kernel.cc
 create mode 100644 paddle/pten/kernels/cpu/norm_kernel.cc
 create mode 100644 paddle/pten/kernels/gpu/norm_grad_kernel.cu
 create mode 100644 paddle/pten/kernels/gpu/norm_kernel.cu
 create mode 100644 paddle/pten/kernels/norm_grad_kernel.h
 create mode 100644 paddle/pten/kernels/norm_kernel.h
 create mode 100644 paddle/pten/ops/compat/norm_sig.cc

diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index f6dbe10023..5d394424d5 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/norm_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -115,7 +115,3 @@ REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
                   ops::NormOpGradOpMaker<paddle::framework::OpDesc>,
                   ops::NormOpGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
-REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
-                       ops::NormKernel<CPU, double>);
-REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
-                       ops::NormGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
deleted file mode 100644
index 0c960e10fc..0000000000
--- a/paddle/fluid/operators/norm_op.cu
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/norm_op.h"
-#include "paddle/fluid/platform/bfloat16.h"
-
-namespace paddle {
-namespace operators {
-
-__device__ __forceinline__ platform::float16 square_root(platform::float16 x) {
-  return static_cast<platform::float16>(sqrtf(static_cast<float>(x)));
-}
-
-__device__ __forceinline__ float square_root(float x) { return sqrtf(x); }
-
-__device__ __forceinline__ double square_root(double x) { return sqrt(x); }
-
-template <typename T, int BlockDim>
-__global__ void Normalize(const T* x, const int pre,
-                          const int axis_n,  // dim in axis
-                          const int post, const T eps, T* y, T* out_norm) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  typedef cub::BlockReduce<MT, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  int num = pre * post;
-  for (int i = blockIdx.x; i < num; i += gridDim.x) {
-    int base = (i / post) * post * axis_n + (i % post);
-
-    MT sum = 0.0;
-    __shared__ MT norm;
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      const MT x_ij = static_cast<MT>(x[base + j * post]);
-      sum += x_ij * x_ij;
-    }
-    MT reduce_result = BlockReduce(temp_storage).Sum(sum);
-
-    if (threadIdx.x == 0) {
-      norm = square_root(reduce_result + static_cast<MT>(eps));
-      out_norm[i] = static_cast<T>(norm);
-    }
-    __syncthreads();
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      const int index = base + j * post;
-      y[index] = static_cast<T>((static_cast<MT>(x[index]) / norm));
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NormCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* out_y = ctx.Output<framework::Tensor>("Out");
-
-    auto xdim = in_x->dims();
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
-
-    bool is_test = ctx.Attr<bool>("is_test");
-
-    framework::Tensor* out_norm;
-    framework::Tensor out_norm_tmp;
-    if (is_test) {
-      auto out_dim = in_x->dims();
-      out_dim[axis] = 1;
-      out_norm = &out_norm_tmp;
-      out_norm->Resize(out_dim);
-    } else {
-      out_norm = ctx.Output<framework::Tensor>("Norm");
-    }
-
-    const T* x = in_x->data<T>();
-    T* y = out_y->mutable_data<T>(ctx.GetPlace());
-    T* norm = out_norm->mutable_data<T>(ctx.GetPlace());
-
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto& dev_ctx = ctx.cuda_device_context();
-#ifdef __HIPCC__
-    const int block = 256;
-#else
-    const int block = 512;
-#endif
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid = std::min(max_blocks, pre * post);
-    Normalize<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
-                                                              eps, y, norm);
-  }
-};
-
-template <typename T, int BlockDim>
-__global__ void NormalizeGradient(const T* x, const T* x_norm, const T* y_grad,
-                                  const int pre, const int axis_n,
-                                  const int post, T* x_grad) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  typedef cub::BlockReduce<MT, BlockDim> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage_sum;
-  int num = pre * post;
-  for (int i = blockIdx.x; i < num; i += gridDim.x) {
-    MT sum = 0.0;
-    __shared__ MT row_sum;
-    __shared__ MT row_sqrt_norm;
-    __shared__ MT row_norm;
-
-    auto base = (i / post) * post * axis_n + (i % post);
-
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      int index = base + j * post;
-      sum += static_cast<MT>(x[index]) * static_cast<MT>(y_grad[index]);
-    }
-    MT reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
-
-    if (threadIdx.x == 0) {
-      row_sum = reduce_result;
-      row_sqrt_norm = static_cast<MT>(x_norm[i]);
-      row_norm = row_sqrt_norm * row_sqrt_norm;
-    }
-    __syncthreads();
-    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
-      int index = base + j * post;
-      const MT x_ij = static_cast<MT>(x[index]);
-      const MT dy_ij = static_cast<MT>(y_grad[index]);
-      x_grad[index] =
-          static_cast<T>((dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NormGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
-    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    T* dx = out_dx->mutable_data<T>(ctx.GetPlace());
-    const T* x = in_x->data<T>();
-    const T* x_norm = in_norm->data<T>();
-    const T* dy = in_dy->data<T>();
-
-    auto xdim = in_x->dims();
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto& dev_ctx = ctx.cuda_device_context();
-
-#ifdef __HIPCC__
-    const int block = 256;
-#else
-    const int block = 512;
-#endif
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(max_threads / block, 1);
-    int grid = std::min(max_blocks, pre * post);
-    NormalizeGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x, x_norm, dy, pre, n, post, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(norm,
-                        ops::NormCUDAKernel<CUDA, paddle::platform::float16>,
-                        ops::NormCUDAKernel<CUDA, float>,
-                        ops::NormCUDAKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(
-    norm_grad, ops::NormGradCUDAKernel<CUDA, paddle::platform::float16>,
-    ops::NormGradCUDAKernel<CUDA, float>,
-    ops::NormGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
deleted file mode 100644
index 058c523625..0000000000
--- a/paddle/fluid/operators/norm_op.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class NormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* out_y = ctx.Output<framework::Tensor>("Out");
-
-    auto xdim = in_x->dims();
-    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    bool is_test = ctx.Attr<bool>("is_test");
-
-    framework::Tensor* out_norm;
-    framework::Tensor out_norm_tmp;
-    if (is_test) {
-      auto out_dim = in_x->dims();
-      out_dim[axis] = 1;
-      out_norm = &out_norm_tmp;
-      out_norm->Resize(out_dim);
-    } else {
-      out_norm = ctx.Output<framework::Tensor>("Norm");
-    }
-
-    out_y->mutable_data<T>(ctx.GetPlace());
-    out_norm->mutable_data<T>(ctx.GetPlace());
-
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 3> shape(pre, n, post);
-    Eigen::DSizes<int, 2> norm_shape(pre, post);
-
-    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
-    auto y_e = framework::EigenVector<T>::Flatten(*out_y);
-    auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
-    auto x = x_e.reshape(shape);
-    auto y = y_e.reshape(shape);
-    auto norm = norm_e.reshape(norm_shape);
-
-    Eigen::DSizes<int, 1> rdim(1);
-    // y = x / sqrt((sum(x * x) + epsilon))
-    // norm = sqrt(sum(x * x) + epsilon)
-    auto x2 = x * x;
-    auto sum = x2.sum(rdim) + eps;
-    norm.device(*place) = sum.sqrt();
-
-    // y = x / norm
-    Eigen::DSizes<int, 3> rshape(pre, 1, post);
-    Eigen::DSizes<int, 3> bcast(1, n, 1);
-    y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class NormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
-    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    out_dx->mutable_data<T>(ctx.GetPlace());
-
-    auto xdim = in_x->dims();
-    int axis = ctx.Attr<int>("axis");
-    if (axis < 0) axis = xdim.size() + axis;
-    int pre, n, post;
-    GetDims(xdim, axis, &pre, &n, &post);
-
-    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-
-    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
-    auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
-    auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
-    auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);
-
-    Eigen::DSizes<int, 3> shape(pre, n, post);
-    Eigen::DSizes<int, 3> rshape(pre, 1, post);
-    auto x = x_e.reshape(shape);
-    auto dy = dy_e.reshape(shape);
-    auto norm = norm_e.reshape(rshape);
-    auto dx = dx_e.reshape(shape);
-
-    framework::Tensor rsum;
-    rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
-    auto sum = framework::EigenTensor<T, 2>::From(rsum);
-
-    Eigen::DSizes<int, 1> rdim(1);
-    Eigen::DSizes<int, 3> bcast(1, n, 1);
-
-    // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)]
-    //    = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x))
-    //    = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x))
-    // 1. sum = sum(x*dy)
-    sum.device(*place) = (x * dy).sum(rdim);
-    // 2. dx = x * sum
-    dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
-    // 3. dx / (sum(x*x) + e)
-    // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
-    dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
-    // 4. [dy - dx] / sqrt(sum(x*x))
-    dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc
index ea29ef4fb3..b03e64e778 100644
--- a/paddle/fluid/operators/norm_op_npu.cc
+++ b/paddle/fluid/operators/norm_op_npu.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/norm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/pten/core/compat/arg_map_context.h b/paddle/pten/core/compat/arg_map_context.h
index 6898dd36d6..42ab0f1fcc 100644
--- a/paddle/pten/core/compat/arg_map_context.h
+++ b/paddle/pten/core/compat/arg_map_context.h
@@ -24,6 +24,18 @@ limitations under the License. */
 
 namespace pten {
 
+constexpr char kGradVarSuffix[] = "@GRAD";
+
+constexpr size_t kGradVarSuffixSize = 5U;
+
+inline std::string GradVarName(const std::string& var_name) {
+  std::string result;
+  result.reserve(var_name.size() + kGradVarSuffixSize);
+  result += var_name;
+  result += kGradVarSuffix;
+  return result;
+}
+
 // tuple(input_names, attr_names, output_names)
 using KernelArgsTuple = std::tuple<paddle::SmallVector<std::string>,
                                    paddle::SmallVector<std::string>,
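
A note on the hunk above: pten previously lacked a counterpart of fluid's framework::GradVarName, so the patch adds one. It simply appends the "@GRAD" suffix under which a variable's gradient is registered; kGradVarSuffixSize is strlen("@GRAD"). A minimal standalone sketch of the behavior (the main() harness is illustrative, not part of the patch):

    #include <cassert>
    #include <string>

    // Copy of the helper plus an illustrative main(); the harness is not
    // part of the patch.
    constexpr char kGradVarSuffix[] = "@GRAD";
    constexpr size_t kGradVarSuffixSize = 5U;  // strlen("@GRAD")

    inline std::string GradVarName(const std::string& var_name) {
      std::string result;
      result.reserve(var_name.size() + kGradVarSuffixSize);  // one allocation
      result += var_name;
      result += kGradVarSuffix;
      return result;
    }

    int main() {
      assert(GradVarName("X") == "X@GRAD");
      assert(GradVarName("Out") == "Out@GRAD");
      return 0;
    }

Reserving the exact final size up front keeps the helper to a single allocation, which matters because it runs for every gradient argument lookup.
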
+ +#include "paddle/pten/kernels/norm_grad_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" + +#include "paddle/pten/kernels/funcs/eigen/common.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/pten/kernels/funcs/common_shape.h" +namespace pten { + +template +void NormGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& norm, + int axis, + float epsilon, + bool is_test, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* in_dy = &out_grad; + auto* in_norm = &norm; + auto* out_dx = x_grad; + + ctx.template Alloc(out_dx); + + auto xdim = in_x->dims(); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); + + auto* place = ctx.eigen_device(); + + auto x_e = paddle::framework::EigenVector::Flatten(*in_x); + auto dy_e = paddle::framework::EigenVector::Flatten(*in_dy); + auto norm_e = paddle::framework::EigenVector::Flatten(*in_norm); + auto dx_e = paddle::framework::EigenVector::Flatten(*out_dx); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes rshape(pre, 1, post); + auto x_r = x_e.reshape(shape); + auto dy = dy_e.reshape(shape); + auto norm_r = norm_e.reshape(rshape); + auto dx = dx_e.reshape(shape); + + DenseTensor rsum; + rsum.Resize({pre, post}); + ctx.template Alloc(&rsum); + auto sum = paddle::framework::EigenTensor::From(rsum); + + Eigen::DSizes rdim(1); + Eigen::DSizes bcast(1, n, 1); + + // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)] + // = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x)) + // 1. sum = sum(x*dy) + sum.device(*place) = (x_r * dy).sum(rdim); + // 2. dx = x * sum + dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x_r; + // 3. dx / (sum(x*x) + e) + // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward. + dx.device(*place) = dx / norm_r.pow(2).broadcast(bcast); + // 4. [dy - dx] / sqrt(sum(x*x)) + dx.device(*place) = (dy - dx) / norm_r.broadcast(bcast); +} + +} // namespace pten + +PT_REGISTER_KERNEL( + norm_grad, CPU, ALL_LAYOUT, pten::NormGradKernel, float, double) {} diff --git a/paddle/pten/kernels/cpu/norm_kernel.cc b/paddle/pten/kernels/cpu/norm_kernel.cc new file mode 100644 index 0000000000..ef2cf405c1 --- /dev/null +++ b/paddle/pten/kernels/cpu/norm_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/kernels/norm_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/common_shape.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" + +namespace pten { + +template +void NormKernel(const Context& ctx, + const DenseTensor& x, + int axis, + float epsilon, + bool is_test, + DenseTensor* out, + DenseTensor* norm) { + auto xdim = x.dims(); + T eps = epsilon; + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post); + + DenseTensor* out_norm; + DenseTensor out_norm_tmp; + if (is_test) { + auto out_dim = x.dims(); + out_dim[axis] = 1; + out_norm = &out_norm_tmp; + out_norm->Resize(out_dim); + } else { + out_norm = norm; + } + + ctx.template Alloc(out); + ctx.template Alloc(out_norm); + + auto* place = ctx.eigen_device(); + + Eigen::DSizes shape(pre, n, post); + Eigen::DSizes norm_shape(pre, post); + + auto x_e = paddle::framework::EigenVector::Flatten(x); + auto y_e = paddle::framework::EigenVector::Flatten(*out); + auto norm_e = paddle::framework::EigenVector::Flatten(*out_norm); + auto x_r = x_e.reshape(shape); + auto y = y_e.reshape(shape); + auto norm_reshape = norm_e.reshape(norm_shape); + + Eigen::DSizes rdim(1); + // y = x / sqrt((sum(x * x) + epsilon)) + // norm = sqrt(sum(x * x) + epsilon) + auto x2 = x_r * x_r; + auto sum = x2.sum(rdim) + eps; + norm_reshape.device(*place) = sum.sqrt(); + + // y = x / norm + Eigen::DSizes rshape(pre, 1, post); + Eigen::DSizes bcast(1, n, 1); + y.device(*place) = x_r / norm_reshape.reshape(rshape).broadcast(bcast); +} + +} // namespace pten + +PT_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, pten::NormKernel, float, double) {} diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 6bb45ad199..9a96a5fd45 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -89,5 +89,18 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } +inline void GetPrePostNumel( + const framework::DDim &dim, int axis, int *pre, int *n, int *post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/gpu/norm_grad_kernel.cu b/paddle/pten/kernels/gpu/norm_grad_kernel.cu new file mode 100644 index 0000000000..35701d349a --- /dev/null +++ b/paddle/pten/kernels/gpu/norm_grad_kernel.cu @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h
index 6bb45ad199..9a96a5fd45 100644
--- a/paddle/pten/kernels/funcs/common_shape.h
+++ b/paddle/pten/kernels/funcs/common_shape.h
@@ -89,5 +89,18 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
   }
 }
 
+inline void GetPrePostNumel(
+    const framework::DDim &dim, int axis, int *pre, int *n, int *post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
 }  // namespace funcs
 }  // namespace pten
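
GetPrePostNumel (a copy of the fluid GetDims helper being deleted above) is the shape factorization every kernel in this patch builds on: a tensor reduced along axis is viewed as [pre, n, post], there are pre * post slices, slice i starts at flat offset (i / post) * post * n + (i % post), and consecutive elements of a slice are post apart. A small self-contained sketch (the main() and its printed values are illustrative only):

    #include <cstdio>
    #include <vector>

    // Same computation as funcs::GetPrePostNumel, restated on a plain shape
    // vector; main() is illustrative only.
    void GetPrePostNumel(const std::vector<int>& dim, int axis, int* pre,
                         int* n, int* post) {
      *pre = 1;
      *post = 1;
      *n = dim[axis];
      for (int i = 0; i < axis; ++i) *pre *= dim[i];
      for (int i = axis + 1; i < static_cast<int>(dim.size()); ++i)
        *post *= dim[i];
    }

    int main() {
      int pre, n, post;
      GetPrePostNumel({2, 3, 4}, /*axis=*/1, &pre, &n, &post);
      std::printf("pre=%d n=%d post=%d\n", pre, n, post);  // pre=2 n=3 post=4
      // Row-major element (i0=1, j=2, i2=3) has flat index
      // i0*n*post + j*post + i2 = 23. The kernels reach it from slice
      // i = i0*post + i2 = 7 via base = (i/post)*post*n + (i%post) = 15,
      // then stepping j*post along the reduced axis: 15 + 2*4 = 23.
      const int i = 1 * post + 3;
      const int base = (i / post) * post * n + (i % post);
      std::printf("base=%d elem=%d\n", base, base + 2 * post);  // 15, 23
      return 0;
    }
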
diff --git a/paddle/pten/kernels/gpu/norm_grad_kernel.cu b/paddle/pten/kernels/gpu/norm_grad_kernel.cu
new file mode 100644
index 0000000000..35701d349a
--- /dev/null
+++ b/paddle/pten/kernels/gpu/norm_grad_kernel.cu
@@ -0,0 +1,120 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include "paddle/pten/kernels/norm_grad_kernel.h"
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/pten/common/bfloat16.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+#include "paddle/pten/kernels/funcs/common_shape.h"
+
+namespace pten {
+
+template <typename T, int BlockDim>
+__global__ void NormalizeGradient(const T* x,
+                                  const T* x_norm,
+                                  const T* y_grad,
+                                  const int pre,
+                                  const int axis_n,
+                                  const int post,
+                                  T* x_grad) {
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  typedef cub::BlockReduce<MT, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage_sum;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    MT sum = 0.0;
+    __shared__ MT row_sum;
+    __shared__ MT row_sqrt_norm;
+    __shared__ MT row_norm;
+
+    auto base = (i / post) * post * axis_n + (i % post);
+
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      int index = base + j * post;
+      sum += static_cast<MT>(x[index]) * static_cast<MT>(y_grad[index]);
+    }
+    MT reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
+
+    if (threadIdx.x == 0) {
+      row_sum = reduce_result;
+      row_sqrt_norm = static_cast<MT>(x_norm[i]);
+      row_norm = row_sqrt_norm * row_sqrt_norm;
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      int index = base + j * post;
+      const MT x_ij = static_cast<MT>(x[index]);
+      const MT dy_ij = static_cast<MT>(y_grad[index]);
+      x_grad[index] =
+          static_cast<T>((dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void NormGradKernel(const Context& ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& x,
+                    const DenseTensor& norm,
+                    int axis,
+                    float epsilon,
+                    bool is_test,
+                    DenseTensor* x_grad) {
+  auto* in_x = &x;
+  auto* in_norm = &norm;
+  auto* in_dy = &out_grad;
+  auto* out_dx = x_grad;
+  ctx.template Alloc<T>(out_dx);
+  T* dx = out_dx->data<T>();
+  const T* x_data = in_x->data<T>();
+  const T* x_norm = in_norm->data<T>();
+  const T* dy = in_dy->data<T>();
+
+  auto xdim = in_x->dims();
+  if (axis < 0) axis = xdim.size() + axis;
+  int pre, n, post;
+  funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
+
+#ifdef __HIPCC__
+  const int block = 256;
+#else
+  const int block = 512;
+#endif
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  int grid = std::min(max_blocks, pre * post);
+  NormalizeGradient<T, block><<<grid, block, 0, ctx.stream()>>>(
+      x_data, x_norm, dy, pre, n, post, dx);
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::NormGradKernel,
+                   float,
+                   double,
+                   paddle::platform::float16) {}
diff --git a/paddle/pten/kernels/gpu/norm_kernel.cu b/paddle/pten/kernels/gpu/norm_kernel.cu
new file mode 100644
index 0000000000..6e2ee65231
--- /dev/null
+++ b/paddle/pten/kernels/gpu/norm_kernel.cu
@@ -0,0 +1,133 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include "paddle/pten/kernels/norm_kernel.h"
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/pten/common/float16.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+#include "paddle/pten/kernels/funcs/common_shape.h"
+
+namespace pten {
+
+__device__ __forceinline__ dtype::float16 square_root(dtype::float16 x) {
+  return static_cast<dtype::float16>(sqrtf(static_cast<float>(x)));
+}
+
+__device__ __forceinline__ float square_root(float x) { return sqrtf(x); }
+
+__device__ __forceinline__ double square_root(double x) { return sqrt(x); }
+
+template <typename T, int BlockDim>
+__global__ void Normalize(const T* x,
+                          const int pre,
+                          const int axis_n,  // dim in axis
+                          const int post,
+                          const T eps,
+                          T* y,
+                          T* out_norm) {
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  typedef cub::BlockReduce<MT, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  int num = pre * post;
+  for (int i = blockIdx.x; i < num; i += gridDim.x) {
+    int base = (i / post) * post * axis_n + (i % post);
+
+    MT sum = 0.0;
+    __shared__ MT norm;
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      const MT x_ij = static_cast<MT>(x[base + j * post]);
+      sum += x_ij * x_ij;
+    }
+    MT reduce_result = BlockReduce(temp_storage).Sum(sum);
+
+    if (threadIdx.x == 0) {
+      norm = square_root(reduce_result + static_cast<MT>(eps));
+      out_norm[i] = static_cast<T>(norm);
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
+      const int index = base + j * post;
+      y[index] = static_cast<T>((static_cast<MT>(x[index]) / norm));
+    }
+  }
+}
+
+template <typename T, typename Context>
+void NormKernel(const Context& ctx,
+                const DenseTensor& x,
+                int axis,
+                float epsilon,
+                bool is_test,
+                DenseTensor* out,
+                DenseTensor* norm) {
+  auto* in_x = &x;
+  auto* out_y = out;
+
+  auto xdim = in_x->dims();
+  if (axis < 0) axis = xdim.size() + axis;
+  T eps = static_cast<T>(epsilon);
+
+  DenseTensor* out_norm;
+  DenseTensor out_norm_tmp;
+  if (is_test) {
+    auto out_dim = in_x->dims();
+    out_dim[axis] = 1;
+    out_norm = &out_norm_tmp;
+    out_norm->Resize(out_dim);
+  } else {
+    out_norm = norm;
+  }
+
+  const T* x_ptr = in_x->data<T>();
+  ctx.template Alloc<T>(out_y);
+  ctx.template Alloc<T>(out_norm);
+
+  T* y = out_y->data<T>();
+  T* norm_ptr = out_norm->data<T>();
+
+  int pre, n, post;
+  funcs::GetPrePostNumel(xdim, axis, &pre, &n, &post);
+
+#ifdef __HIPCC__
+  const int block = 256;
+#else
+  const int block = 512;
+#endif
+  int max_threads = ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  int grid = std::min(max_blocks, pre * post);
+  Normalize<T, block><<<grid, block, 0, ctx.stream()>>>(
+      x_ptr, pre, n, post, eps, y, norm_ptr);
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::NormKernel,
+                   float,
+                   double,
+                   paddle::platform::float16) {}
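
Like the fluid kernels they replace, both GPU kernels accumulate through MPTypeTrait<T>::Type, which is float when T is float16. This matters because a sum of squares can exceed float16's maximum finite value well before the final square root would shrink it back into range. A toy illustration with made-up values:

    #include <cstdio>

    // Made-up values, for illustration only: float16's largest finite value
    // is 65504, so even a short sum of squares overflows in half precision,
    // while the float accumulator (the MT type for float16) holds it exactly.
    int main() {
      const float x = 64.0f;   // imagine 32 float16 inputs of value 64.0
      float sum = 0.0f;        // widened accumulator, as in the kernels
      for (int j = 0; j < 32; ++j) sum += x * x;
      std::printf("sum = %.0f (float16 max is 65504)\n", sum);  // 131072
      return 0;
    }
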
diff --git a/paddle/pten/kernels/norm_grad_kernel.h b/paddle/pten/kernels/norm_grad_kernel.h
new file mode 100644
index 0000000000..e04aec3783
--- /dev/null
+++ b/paddle/pten/kernels/norm_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void NormGradKernel(const Context& ctx,
+                    const DenseTensor& out_grad,
+                    const DenseTensor& x,
+                    const DenseTensor& norm,
+                    int axis,
+                    float epsilon,
+                    bool is_test,
+                    DenseTensor* x_grad);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/norm_kernel.h b/paddle/pten/kernels/norm_kernel.h
new file mode 100644
index 0000000000..50a0d3fa34
--- /dev/null
+++ b/paddle/pten/kernels/norm_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void NormKernel(const Context& ctx,
+                const DenseTensor& x,
+                int axis,
+                float epsilon,
+                bool is_test,
+                DenseTensor* out,
+                DenseTensor* norm);
+
+}  // namespace pten
diff --git a/paddle/pten/ops/compat/norm_sig.cc b/paddle/pten/ops/compat/norm_sig.cc
new file mode 100644
index 0000000000..b236ae784e
--- /dev/null
+++ b/paddle/pten/ops/compat/norm_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/compat/op_utils.h"
+
+namespace pten {
+
+KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "norm", {"X"}, {"axis", "epsilon", "is_test"}, {"Out", "Norm"});
+}
+
+KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("norm_grad",
+                         {GradVarName("Out"), "X", "Norm"},
+                         {"axis", "epsilon", "is_test"},
+                         {GradVarName("X")});
+}
+
+}  // namespace pten
+
+PT_REGISTER_ARG_MAPPING_FN(norm, pten::NormOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(norm_grad, pten::NormGradOpArgumentMapping);
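
These two mappings are what route the unchanged fluid norm operator to the new pten kernels: op inputs, attributes, and outputs are matched by name, in order, onto the kernel signatures declared in norm_kernel.h and norm_grad_kernel.h above. Spelled out (this table is illustrative, not patch code):

    // Forward op "norm" onto NormKernel (norm_kernel.h):
    //   Input  "X"        -> const DenseTensor& x
    //   Attr   "axis"     -> int axis
    //   Attr   "epsilon"  -> float epsilon
    //   Attr   "is_test"  -> bool is_test
    //   Output "Out"      -> DenseTensor* out
    //   Output "Norm"     -> DenseTensor* norm
    // Backward op "norm_grad" onto NormGradKernel (norm_grad_kernel.h):
    //   Input  "Out@GRAD" -> const DenseTensor& out_grad
    //   Input  "X"        -> const DenseTensor& x
    //   Input  "Norm"     -> const DenseTensor& norm
    //   Attrs as above
    //   Output "X@GRAD"   -> DenseTensor* x_grad

The "@GRAD" names come from the GradVarName helper added to arg_map_context.h earlier in this patch.
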
+ +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "norm", {"X"}, {"axis", "epsilon", "is_test"}, {"Out", "Norm"}); +} + +KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("norm_grad", + {GradVarName("Out"), "X", "Norm"}, + {"axis", "epsilon", "is_test"}, + {GradVarName("X")}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(norm, pten::NormOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(norm_grad, pten::NormGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 6b424e03cc..626de9b12b 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -154,4 +154,6 @@ class API_NormTest(unittest.TestCase): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() -- GitLab