// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <thrust/device_vector.h>
#include "paddle/fluid/framework/array.h"
#include "paddle/fluid/operators/stack_op.h"

namespace paddle {
namespace operators {

// Each output element at linear index `idx` is produced by viewing the
// output as [pre, n, post]: `i` indexes the pre dimension, `which_x`
// selects the input tensor, and `x_index` is the offset within it.
template <typename T, typename VecXType>
__global__ void StackCUDAKernel(VecXType x, T* y, int total_num, int n,
                                int post) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < total_num) {
    int i = idx / (n * post);
    int which_x = idx / post - i * n;
    int x_index = i * post + idx % post;
    y[idx] = x[which_x][x_index];
  }
}

// Gradient kernel: scatters each element of dy back into the matching
// input gradient tensor, inverting the mapping used in StackCUDAKernel.
template <typename T, typename VecDxType>
__global__ void StackGradCUDAKernel(VecDxType dx, const T* dy, int total_num,
                                    int n, int post) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < total_num) {
    int i = idx / (n * post);
    int which_x = idx / post - i * n;
    int x_index = i * post + idx % post;
    dx[which_x][x_index] = dy[idx];
  }
}

struct GPUStackFunctor {
  template <typename DeviceContext, typename T>
  void operator()(const DeviceContext& ctx, const std::vector<const T*>& x,
                  T* y, int pre, int n, int post) const {
    int total_num = pre * post * n;
    int threads = 512;
    int grid = (total_num + threads - 1) / threads;

    // For a small number of inputs, pass the pointers by value in a
    // fixed-size array to avoid an extra device allocation and copy.
    constexpr auto kMaxThreshold = 16;
    if (n <= kMaxThreshold) {
      framework::Array<const T*, kMaxThreshold> arr;
      for (int i = 0; i < n; ++i) arr[i] = x[i];
      StackCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(arr, y, total_num,
                                                          n, post);
    } else {
      VLOG(10) << "Stack more than " << kMaxThreshold
               << " tensors may be slow on GPU.";
      thrust::device_vector<const T*> dev_x(x);
      StackCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(dev_x.data().get(),
                                                          y, total_num, n,
                                                          post);
    }
  }
};

struct GPUStackGradFunctor {
  template <typename DeviceContext, typename T>
  void operator()(const DeviceContext& ctx, std::vector<T*>& dx,  // NOLINT
                  const T* dy, int pre, int n, int post) const {
    int total_num = pre * post * n;
    int threads = 512;
    int grid = (total_num + threads - 1) / threads;

    constexpr auto kMaxThreshold = 16;
    if (n <= kMaxThreshold) {
      framework::Array<T*, kMaxThreshold> arr;
      for (int i = 0; i < n; ++i) arr[i] = dx[i];
      StackGradCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(
          arr, dy, total_num, n, post);
    } else {
      VLOG(10) << "Stack more than " << kMaxThreshold
               << " tensors may be slow on GPU.";
      thrust::device_vector<T*> dev_dx(dx);
      StackGradCUDAKernel<<<grid, threads, 0, ctx.stream()>>>(
          dev_dx.data().get(), dy, total_num, n, post);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace plat = paddle::platform;
namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(stack,
                        ops::StackKernel<plat::CUDADeviceContext, float>,
                        ops::StackKernel<plat::CUDADeviceContext, double>);

REGISTER_OP_CUDA_KERNEL(stack_grad,
                        ops::StackGradKernel<plat::CUDADeviceContext, float>,
                        ops::StackGradKernel<plat::CUDADeviceContext, double>);
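
// ---------------------------------------------------------------------------
// Illustrative sketch only (a hypothetical helper, not part of the operator):
// the index math in StackCUDAKernel can be checked against a plain host-side
// reference. Viewing the output of stacking n tensors as [pre, n, post], the
// loop below reproduces the kernel's per-element mapping; e.g. stacking
// n = 3 tensors of shape [2, 4] along axis 1 gives pre = 2, post = 4, and an
// output of shape [2, 3, 4].
//
//   void StackCPURef(const std::vector<const float*>& x, float* y, int pre,
//                    int n, int post) {
//     for (int idx = 0; idx < pre * n * post; ++idx) {
//       int i = idx / (n * post);             // index into the pre dimension
//       int which_x = idx / post - i * n;     // which input tensor
//       int x_index = i * post + idx % post;  // offset within that input
//       y[idx] = x[which_x][x_index];
//     }
//   }
// ---------------------------------------------------------------------------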