// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/flip_kernel.h"

namespace phi {

template <typename T>
__global__ void flip_cuda_kernel(const int N,
                                 const T* in_data,
                                 T* out_data,
                                 int64_t* x_shape,
                                 int64_t* x_stride,
                                 int* flip_dims,
                                 int flip_dims_size,
                                 int total_dims) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;
  }

  // Decompose the linear output index into per-dimension coordinates using
  // the strides, flip the coordinate along every dim in flip_dims, and
  // accumulate the corresponding source offset.
  int cur_indices = idx, rem = 0, dst_offset = 0;
  for (int i = 0; i < total_dims; ++i) {
    int64_t temp = cur_indices;
    cur_indices = cur_indices / x_stride[i];
    rem = temp - cur_indices * x_stride[i];
    // flip the index if this dim is in flip_dims
    for (int j = 0; j < flip_dims_size; ++j) {
      if (i == flip_dims[j]) {
        cur_indices = x_shape[i] - 1 - cur_indices;
      }
    }
    dst_offset += cur_indices * x_stride[i];
    cur_indices = rem;
  }
  out_data[idx] = in_data[dst_offset];
}

template <typename T, typename Context>
void FlipKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const std::vector<int>& axis,
                DenseTensor* out) {
  const auto gplace = dev_ctx.GetPlace();
  auto cplace = phi::CPUPlace();
  std::vector<int> flip_dims = axis;

  auto* in_data = x.data<T>();
  auto* out_data = dev_ctx.template Alloc<T>(out);

  const int flip_dims_size = static_cast<int>(flip_dims.size());
  auto x_dims = x.dims();
  const int total_dims = x_dims.size();
  const int N = x.numel();

  int block_size = 512;
  dim3 dim_block(block_size);
  dim3 dim_grid((N + block_size - 1) / block_size);

  // Normalize negative axes to their positive equivalents.
  for (size_t i = 0; i < flip_dims.size(); ++i) {
    if (flip_dims[i] < 0) {
      flip_dims[i] += total_dims;
    }
  }

  auto x_stride = phi::stride(x_dims);
  std::vector<int64_t> x_dims_v = phi::vectorize(x_dims);
  std::vector<int64_t> x_stride_v = phi::vectorize(x_stride);

  // Copy shapes, strides, and flip dims to device memory for the kernel.
  int bytes = total_dims * sizeof(int64_t);
  auto x_strides_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
  int64_t* x_strides_array_gpu =
      reinterpret_cast<int64_t*>(x_strides_array_tmp->ptr());
  paddle::memory::Copy(gplace,
                       x_strides_array_gpu,
                       cplace,
                       x_stride_v.data(),
                       bytes,
                       dev_ctx.stream());

  auto x_shape_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
  int64_t* x_shape_array_gpu =
      reinterpret_cast<int64_t*>(x_shape_array_tmp->ptr());
  paddle::memory::Copy(gplace,
                       x_shape_array_gpu,
                       cplace,
                       x_dims_v.data(),
                       bytes,
                       dev_ctx.stream());

  bytes = flip_dims_size * sizeof(int);
  auto flip_dims_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
  int* flip_dims_array_gpu =
      reinterpret_cast<int*>(flip_dims_array_tmp->ptr());
  paddle::memory::Copy(gplace,
                       flip_dims_array_gpu,
                       cplace,
                       flip_dims.data(),
                       bytes,
                       dev_ctx.stream());

  flip_cuda_kernel<T>
      <<<dim_grid, dim_block, 0, dev_ctx.stream()>>>(N,
                                                     in_data,
                                                     out_data,
                                                     x_shape_array_gpu,
                                                     x_strides_array_gpu,
                                                     flip_dims_array_gpu,
                                                     flip_dims_size,
                                                     total_dims);
}

}  // namespace phi

PD_REGISTER_KERNEL(flip,
                   GPU,
                   ALL_LAYOUT,
                   phi::FlipKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   int,
                   int64_t,
                   bool,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}