diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 0ce55b7cf7331889addd238322977c3d51d4e0ca..9a522359628eb16d306d81ca9b39e648c4f9085a 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -13,7 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/activation_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bcd336732960e1013e5fab5a6c85e11c88b5d26..5f2097f3330507e621a0e56485b9fdc73d880bab 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -42,7 +42,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" @@ -982,7 +982,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( #pragma unroll for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { // reduce sum with wrap - val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i); + val += phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, val, i); } size_t idx_j = j + threadIdx.y; @@ -1004,7 +1004,8 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( #pragma unroll for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { // reduce sum with wrap - inter_val += platform::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i); + inter_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i); } if (threadIdx.x == 0 && (idx_j < w)) d_intermediate[idx_j] = inter_val; } @@ -1160,14 +1161,14 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; if (BcastY) { if (dy) { - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } } } else { if (dx) { - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (threadIdx.x == 0) { dx[j] = val; } @@ -1175,7 +1176,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( } if (!SameShapeOfIntermediateOutAndOut) { if (d_intermediate) { - inter_val = paddle::platform::reduceSum(inter_val, tid, h); + inter_val = phi::backends::gpu::reduceSum(inter_val, tid, h); if (threadIdx.x == 0) { d_intermediate[j] = inter_val; } diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index a13bfcf12ea8d2dc6e09339388aff1f017f13098..ef5087f0534e1f6b5d5e5f3c40b77d85c413f825 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -22,9 +22,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 6da533aa77f3c097c1da9acca8e0257a42bf2661..664e20b686d7ed26eacfd062d4c7de337124107c 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 1b8dc4bb324ca768648df16a0b60157e17aa2c56..0fbc14436e9146f6474ef3fda4f4fb876f9de4a9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -22,10 +22,10 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 74ba0b54afd4550fb8d407d5c692a72b38d9680f..c6cfc8698351196b9559b1aa4d66885064373879 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 8f13424ce49b54a495d753dc84baa2fb06ea791c..9cb3f19ab17409b2bf25c0148b428530ba9d7766 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fused_gate_attention.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 777ee83c38dc6d4aabba4294a92c9a1b68f5ba39..79fc561698989babed35887840362d51f4343603 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -26,9 +26,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index d0a2935197a8cd87360c4ab643e520407e4389fc..6b2ba1670a3b72be947496609476cf673ca94962 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,7 +21,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 86d01f6dece4c2371e7369b856f7db5485bf5354..3d1bd7490795dd40b82ba6629a314ee02201eb6a 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -25,8 +25,8 @@ namespace cub = hipcub; #include #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" @@ -55,7 +55,7 @@ static __forceinline__ __device__ U WarpReduceSum(U val) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } return val; } diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 696ddb5a059ed1ada75228281a501629bda5ff64..400f10558e15515cf02ede27da965654162e0e20 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 3c4253ef800aaad5f9e9637b2adc50257e34a636..34595180c9d721a6ddea7f0d4f0ba66ca02236a5 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -242,7 +242,7 @@ __global__ void RowConvGradFilterImproved(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); @@ -307,7 +307,7 @@ __global__ void RowConvGradFilter(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index e95bca3c2791ef0d8758b087d5badd4e296308c9..f210f46ea4376bb0e23ad277ce7b8586beb71ede 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -26,9 +26,9 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #define FINAL_MASK 0xffffffff @@ -283,8 +283,10 @@ __forceinline__ __device__ Pair WarpReduce(Pair input, if (largest) { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { - T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset); - int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset); + T tmp_val = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset); + int tmp_id = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset); if (input.v < tmp_val || (input.v == tmp_val && input.id > tmp_id)) { input.v = tmp_val; input.id = tmp_id; @@ -293,8 +295,10 @@ __forceinline__ __device__ Pair WarpReduce(Pair input, } else { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { - T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset); - int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset); + T tmp_val = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset); + int tmp_id = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset); if (input.v > tmp_val || (input.v == tmp_val && input.id > tmp_id)) { input.v = tmp_val; input.id = tmp_id; @@ -357,7 +361,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); if (tid_max / 32 == wid) { - if (platform::CudaShuffleSync(mask, *beam, tid_max % 32, 32) == MaxLength) + if (phi::backends::gpu::CudaShuffleSync(mask, *beam, tid_max % 32, 32) == + MaxLength) break; } } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h deleted file mode 100644 index c1db9c6770c2464e6b65b756602beb9b10549885..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. -#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace platform { - -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down_sync(mask, val, static_cast(delta), width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor_sync(mask, val, width); -} - -template <> -__forceinline__ __device__ float16 -CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - return float16(__shfl_down_sync( - mask, val.to_half(), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ bfloat16 -CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { -#if defined(PADDLE_CUDA_BF16) - return bfloat16(__shfl_down_sync(mask, - static_cast(val), - static_cast(delta), - width)); -#else - PADDLE_ENFORCE( - false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = static_cast(__shfl_down_sync( - mask, static_cast(val.real), static_cast(delta), width)); - float imag = static_cast(__shfl_down_sync( - mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, - paddle::platform::complex val, - int delta, - int width) { - double real = - static_cast(__shfl_down_sync(mask, - static_cast(val.real), - static_cast(delta), - width)); - double imag = - static_cast(__shfl_down_sync(mask, - static_cast(val.imag), - static_cast(delta), - width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, - int width) { - return float16(__shfl_xor_sync(mask, val.to_half(), width)); -} - -template <> -__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, - bfloat16 val, - int width) { -#if defined(PADDLE_CUDA_BF16) - return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - float imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - double imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl_sync(mask, val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index a3fff0dbed8e2e38e3f97026ad5ff04a85df6bc7..f20c89f97a4f5251de46bbb4649bf8e254f80462 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -22,9 +22,9 @@ #include #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" using paddle::platform::float16; @@ -214,7 +214,7 @@ static __forceinline__ __device__ T WarpReduceSum(T val) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } return val; } diff --git a/paddle/fluid/platform/device/gpu/gpu_device_function.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h deleted file mode 100644 index a8daa5e87fdc38ebe3efd4de767d35869e607eff..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/device/gpu/gpu_device_function.h +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h" -#else -#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h" -#endif - -#endif diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h deleted file mode 100644 index a8ce5f1a1827bba7983bb32a1b463c5a91cdf521..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. -#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace platform { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ float16 -CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - return float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ bfloat16 -CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { - return bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, - paddle::platform::complex val, - int delta, - int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, - int width) { - return float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, - bfloat16 val, - int width) { - return bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace platform -} // namespace paddle