/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" #define WARPSIZE 32 namespace ops = paddle::operators; namespace plat = paddle::platform; namespace paddle { namespace operators { template struct SameDimsElemwiseAdd { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { AddRangeFunctor functor(x->data(), y->data(), z->data()); auto& dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, x->numel()); for_range(functor); } }; template <> struct SameDimsElemwiseAdd { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); const half* x2 = reinterpret_cast(x->data()); const half* y2 = reinterpret_cast(y->data()); half* z2 = reinterpret_cast(z->data()); SameDimsElemwiseAddCUDAKernel<<< grid_size, block_size, 0, ctx.template device_context().stream()>>>( x2, y2, z2, size); } }; template static __global__ void SimpleElemwiseAddGradCUDAKernel(const T* dout, int64_t size, T* dx, T* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { dx[col] = dout[col]; dy[col] = dout[col]; col += blockDim.x * gridDim.x; } } template typename std::enable_if< std::is_same::value>::type ElementwiseAddGrad(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); dim3 grid_size = dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); SimpleElemwiseAddGradCUDAKernel< T><<().stream()>>>( dout->data(), size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); } inline static bool UseReduceFirstAxisRank1(const framework::DDim& dout_dims, const framework::DDim& x_dims, const framework::DDim& y_dims, const int axis) { int start_axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); if (y_dims[y_dims.size() - 1] == 1) { return false; } if (y_dims.size() > 1) { for (int i = 0; i < y_dims.size() - 1; ++i) { if (y_dims[i] != 1) { return false; } } return true; } else if (start_axis == x_dims.size() - 1) { return true; } return false; } inline static bool UseReduceFirstAxisRank2(const framework::DDim& dout_dims, const framework::DDim& x_dims, const framework::DDim& y_dims, const int axis) { int start_axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); if (y_dims.size() < 2 || x_dims[x_dims.size() - 2] != y_dims[y_dims.size() - 2] || x_dims[x_dims.size() - 1] != y_dims[y_dims.size() - 1]) { return false; } if (start_axis == x_dims.size() - 2) { return true; } else if (start_axis == 0) { for (int i = 0; i < y_dims.size() - 2; ++i) { if (y_dims[i] != 1) { return false; } } return true; } return false; } inline static bool UseReduceSecondAxisRank2(const framework::DDim& dout_dims, const framework::DDim& x_dims, const framework::DDim& y_dims, const int axis, int* start, int* end) { if (x_dims.size() != y_dims.size() || y_dims.size() < 3) { return false; } auto y_dims_vec = framework::vectorize(y_dims); auto start_iter = std::find(y_dims_vec.begin(), y_dims_vec.end(), 1); auto end_iter = std::find(y_dims_vec.rbegin(), y_dims_vec.rend(), 1); if (start_iter == y_dims_vec.end() || start_iter == y_dims_vec.end() - 1) { return false; } else { *start = std::distance(y_dims_vec.begin(), start_iter); *end = y_dims_vec.size() - 1 - std::distance(y_dims_vec.rbegin(), end_iter); for (int i = *start; i <= *end; ++i) { if (y_dims[i] != 1) { return false; } } return true; } } template __global__ __launch_bounds__(1024) void ReduceFirstAixsKernel( const T* in, T* out, const int64_t num_rows, const int64_t num_cols, OP op, T init) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; T sum = init; if (row < num_rows && col < num_cols) sum = in[row * num_cols + col]; __shared__ __align__( alignof(T)) char partial_sums_raw[WARPSIZE * (WARPSIZE + 1) * sizeof(T)]; T* partial_sums = reinterpret_cast(partial_sums_raw); row += gridDim.y * blockDim.y; if (col < num_cols) { for (; row < num_rows; row += gridDim.y * blockDim.y) { sum = op(sum, in[row * num_cols + col]); } } partial_sums[threadIdx.x * (WARPSIZE + 1) + threadIdx.y] = sum; __syncthreads(); if (threadIdx.y == 0 && col < num_cols) { T s = partial_sums[threadIdx.x * (WARPSIZE + 1)]; const int numRowsThisBlock = min(static_cast(blockDim.y), num_rows - blockIdx.y * blockDim.y); for (int row = 1; row < numRowsThisBlock; ++row) { T t = partial_sums[threadIdx.x * (WARPSIZE + 1) + row]; s = op(s, t); } out[col * gridDim.y + blockIdx.y] = s; } } template static void ElemwiseYGradRank1CUDA(const framework::ExecutionContext& ctx, const framework::Tensor& dout, const int rows, const int cols, framework::Tensor* dx, framework::Tensor* dy) { dim3 block_dim(WARPSIZE, std::min(rows, 1024 / WARPSIZE)); dim3 grid_dim((cols + (WARPSIZE - 1)) / WARPSIZE, 1, 1); if (dx) { dx->mutable_data(ctx.GetPlace()); framework::TensorCopy( dout, ctx.GetPlace(), ctx.template device_context(), dx); } if (dy) { dy->mutable_data(ctx.GetPlace()); const T* dout_data = dout.data(); T* dy_data = dy->data(); auto stream = ctx.template device_context().stream(); ReduceFirstAixsKernel<<>>( dout_data, dy_data, rows, cols, AddFunctor(), static_cast(0)); } } template __global__ __launch_bounds__(1024) void ReduceFirstOrSecondAxisKernel( const T* in, T* out, const int num_planes, const int num_rows, const int num_cols, OP op, T init) { const int gid = threadIdx.x + blockIdx.x * blockDim.x; const int elems_per_plane = num_rows * num_cols; const int plane = gid / num_cols; const int col = gid % num_cols; if (plane >= num_planes) return; if (num_rows == 1) { out[plane * elems_per_plane + col] = in[plane * elems_per_plane + col]; return; } T sum = op(in[plane * elems_per_plane + col], in[plane * elems_per_plane + num_cols + col]); for (int row = 2; row < num_rows; ++row) { sum = op(sum, in[plane * elems_per_plane + row * num_cols + col]); } out[plane * num_cols + col] = sum; } template static void ElemwiseYGradRank2CUDA(const framework::ExecutionContext& ctx, const framework::Tensor& dout, const int planes, const int rows, const int cols, framework::Tensor* dx, framework::Tensor* dy) { int num_threads = 128; int num_blocks = (rows + num_threads - 1) / num_threads; if (planes != 1) { num_blocks = (planes * cols + num_threads - 1) / num_threads; } if (dx) { dx->mutable_data(ctx.GetPlace()); framework::TensorCopy( dout, ctx.GetPlace(), ctx.template device_context(), dx); } if (dy) { dy->mutable_data(ctx.GetPlace()); const T* dout_data = dout.data(); T* dy_data = dy->data(); auto stream = ctx.template device_context().stream(); ReduceFirstOrSecondAxisKernel<<>>( dout_data, dy_data, planes, rows, cols, AddFunctor(), static_cast(0)); } } template static bool ElemwiseGradUseReduce(const framework::ExecutionContext& ctx, const int axis, const framework::DDim x_dims, const framework::DDim y_dims, const framework::Tensor& dout, framework::Tensor* dx, framework::Tensor* dy) { int start = 0; int end = 0; auto x_dims_vec = framework::vectorize(x_dims); if (UseReduceFirstAxisRank1(dout.dims(), x_dims, y_dims, axis)) { int rows = std::accumulate(x_dims_vec.begin(), x_dims_vec.end() - 1, 1, std::multiplies()); int cols = dx->dims()[dx->dims().size() - 1]; if (cols > 512 && cols < 4096) { ElemwiseYGradRank1CUDA(ctx, dout, rows, cols, dx, dy); return true; } } if (UseReduceFirstAxisRank2(dout.dims(), x_dims, y_dims, axis)) { int rows = std::accumulate(x_dims_vec.begin(), x_dims_vec.end() - 2, 1, std::multiplies()); int cols = dx->dims()[dx->dims().size() - 1] * dx->dims()[dx->dims().size() - 2]; if (cols > 4096) { ElemwiseYGradRank2CUDA(ctx, dout, 1, rows, cols, dx, dy); return true; } } if (UseReduceSecondAxisRank2(dout.dims(), x_dims, y_dims, axis, &start, &end)) { int planes = std::accumulate(x_dims_vec.begin(), x_dims_vec.begin() + start, 1, std::multiplies()); int rows = std::accumulate(x_dims_vec.begin() + start, x_dims_vec.begin() + end + 1, 1, std::multiplies()); int cols = std::accumulate(x_dims_vec.begin() + end + 1, x_dims_vec.end(), 1, std::multiplies()); if (rows / (planes * cols) < 16) { ElemwiseYGradRank2CUDA(ctx, dout, planes, rows, cols, dx, dy); return true; } } return false; } template class ElementwiseAddGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); // skip out auto* out = dout; int axis = ctx.Attr("axis"); // Special case when dy is not needed and dx doesn't reduce if (dx != nullptr && dy == nullptr && dx->dims() == dout->dims()) { VLOG(4) << "Special case when dy is not needed and dx doesn't " "reduce"; framework::TensorCopy( *dout, ctx.GetPlace(), ctx.template device_context(), dx); } else if (dx == nullptr && dy != nullptr && dy->dims() == dout->dims()) { VLOG(4) << "Special case when dx is not needed and dy doesn't " "reduce"; framework::TensorCopy( *dout, ctx.GetPlace(), ctx.template device_context(), dy); } else if (dx && dy && (dx->dims() == dy->dims())) { ElementwiseAddGrad(ctx, x, y, out, dout, dx, dy); } else if (dx && dx->dims() == dout->dims() && ElemwiseGradUseReduce( ctx, axis, x->dims(), y->dims(), *dout, dx, dy)) { } else if (dy && dy->dims() == dout->dims() && ElemwiseGradUseReduce( ctx, axis, x->dims(), y->dims(), *dout, dy, dx)) { } else { DefaultElementwiseAddGrad(ctx, x, y, out, dout, dx, dy); } } }; } // namespace operators } // namespace paddle REGISTER_OP_CUDA_KERNEL( elementwise_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel);