Unverified commit 9918bf9c, authored by Wang Xin, committed by GitHub

[PHI decoupling] remove "gpu_primitives.h" in fluid (#48063)

* remove "gpu_primitives.h" in fluid namespace

* fix PR-CI-GpuPS fail

* fix PR-CI-GpuPS fail
Parent commit: a33d563c
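Essentially every hunk below applies the same mechanical migration: the fluid header "paddle/fluid/platform/device/gpu/gpu_primitives.h" is replaced by "paddle/phi/backends/gpu/gpu_primitives.h", and the helpers it provides (CudaAtomicAdd, CudaAtomicMax, fastAtomicAdd, VectorizedAtomicAddPerBlock, PADDLE_CUDA_NUM_THREADS) are re-qualified from paddle::platform:: to phi::. As a minimal sketch of the post-migration pattern (not code from this PR; the kernel name and arguments are invented for illustration):

    // Hypothetical kernel showing the phi-qualified primitives this PR switches to.
    #include "paddle/phi/backends/gpu/gpu_primitives.h"  // was paddle/fluid/platform/device/gpu/gpu_primitives.h

    template <typename T>
    __global__ void ExampleScatterAdd(T* dst, const T* src, const int* index, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        // was: paddle::platform::CudaAtomicAdd(&dst[index[i]], src[i]);
        phi::CudaAtomicAdd(&dst[index[i]], src[i]);
      }
    }

    // Launch configuration constant, was platform::PADDLE_CUDA_NUM_THREADS:
    // const int threads = phi::PADDLE_CUDA_NUM_THREADS;

Apart from the namespace change, some re-wrapped lines, and a few added // NOLINT annotations, the call sites are untouched, so kernel behavior is unchanged.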
@@ -13,12 +13,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HETERPS
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
@@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest,
 int x = low;
 int y = i - (x ? len[x - 1] : 0);
 float* feature_value_ptr =
-(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
+(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT
 int mf_dim = gpu_dim[x] - 3;
 gpu_accessor.Select(
 dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
@@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N,
 return;
 }
-float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
+float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT
 uint64_t(max_val_size));
 switch (off) {
 case 0:
@@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest,
 }
 int x = low;
 int y = i - (x ? len[low - 1] : 0);
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
-cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
+cur[gpu_accessor.common_push_value.SlotIndex()] =
+(float)slot_vector[x]; // NOLINT
 int mf_dim = mf_dim_vector[x];
 cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
@@ -170,31 +171,29 @@ __global__ void PushMergeCopyAtomic(const size_t N,
 int y = i - slot_lens[x];
 const float* ptr = src[x] + y * hidden;
-float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
+float* cur =
+(float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT
 int mf_dim = slot_dims[x] - 3;
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
-paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
-*(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off));
 break;
 case 1:
-paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
-*(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off));
 break;
 case 2:
-paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
+phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
 *(ptr + off) * -1. * bs);
 break;
 default:
 int embedx_idx = off - 3;
 if (mf_dim < embedx_idx) {
 return;
 }
-paddle::platform::CudaAtomicAdd(
-&cur[accessor.EmbedxGIndex() + embedx_idx],
-*(ptr + off) * -1. * bs);
+phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx],
+*(ptr + off) * -1. * bs);
 break;
 }
 }
@@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N,
 int i = idx / hidden;
 int off = idx % hidden;
 // filter 0 keys
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
 if (total_keys[i] == 0) {
 switch (off) {
@@ -262,7 +261,7 @@ __global__ void PushMergeCopy(const size_t N,
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
 SUM_GRAD_VALUE
 cur[accessor.ShowIndex()] = val;
@@ -331,8 +330,8 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
 const uint64_t total_length,
 const int batch_size,
 size_t grad_value_size,
-std::vector<int>& slot_vector,
-std::vector<int>& slot_mf_dim_vector) {
+std::vector<int>& slot_vector, // NOLINT
+std::vector<int>& slot_mf_dim_vector) { // NOLINT
 auto stream = dynamic_cast<phi::GPUContext*>(
 paddle::platform::DeviceContextPool::Instance().Get(place))
 ->stream();
...
@@ -22,12 +22,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
...
@@ -20,8 +20,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
 #include "paddle/fluid/operators/fused_token_prune_op.cu.h"
@@ -149,7 +149,7 @@ __global__ void ReduceSum2<half>(
 }
 if (tid == 0) {
-platform::fastAtomicAdd<platform::float16>(
+phi::fastAtomicAdd<platform::float16>(
 reinterpret_cast<platform::float16*>(dst),
 static_cast<size_t>(batch * max_seq_len + col),
 static_cast<size_t>(bsz * max_seq_len),
...
@@ -23,7 +23,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -23,8 +23,8 @@ We retain the following license from the original files:
 #include "paddle/fluid/operators/assign_pos_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 DECLARE_bool(avoid_op_randomness);
@@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count,
 CUDA_KERNEL_LOOP(i, limit) {
 int number_idx = numbers[i];
 if (number_idx > -1) {
-int p = platform::CudaAtomicAdd(cum_count + number_idx, -1);
+int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
 out[p - 1] = i;
 }
 }
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 namespace paddle {
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/bilateral_slice_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,11 +16,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/center_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
 __global__ void ComputeDifferent(T *centers_diff,
@@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers,
 const T *diff = centers_diff + idy * D;
 T *cent = centers + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
+phi::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table,
 auto id = ids[row];
 if (id >= start_idx && id < end_idx) {
 auto real_idx = id - start_idx;
-paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
-output[i]);
+phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
 }
 }
 }
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_shift_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -16,12 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/cvm_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -29,7 +29,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
 using DataLayout = phi::DataLayout;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
...
@@ -32,7 +32,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -41,7 +41,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 static inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
@@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 // compute gradient of input
 if (bottom_data_diff) {
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y0 * width + x0,
-q00 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y1 * width + x0,
-q01 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y0 * width + x1,
-q10 * diff_val);
-platform::CudaAtomicAdd(
-bottom_data_diff + bottom_index + y1 * width + x1,
-q11 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0,
+q00 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0,
+q01 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1,
+q10 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1,
+q11 * diff_val);
 }
 // compute gradient of trans
@@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 u00 * (1 - dist_x)) *
 trans_std * diff_val;
 diff_y *= roi_height;
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
 }
 }
 }
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/box_clip_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -11,7 +11,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -26,8 +26,8 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 namespace paddle {
@@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads,
 const int* batch_ids,
 int* length_lod) {
 CUDA_KERNEL_LOOP(i, nthreads) {
-platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
+phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
 }
 }
...
@@ -14,13 +14,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 using Tensor = phi::DenseTensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 #define CUDA_BLOCK_SIZE 16
 template <typename T>
...
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 using paddle::platform::float16;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 namespace paddle {
 namespace operators {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math.h"
...
@@ -43,7 +43,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #endif
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 namespace paddle {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -35,7 +35,7 @@ class ReduceAdd {
 typename tensor_t,
 std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
 __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
-platform::CudaAtomicAdd(self_data, *src_data);
+phi::CudaAtomicAdd(self_data, *src_data);
 }
 template <typename tensor_t,
 std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>
...
@@ -41,8 +41,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/graph_khop_sampler_imp.h"
 #include "paddle/fluid/operators/graph_khop_sampler_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 constexpr int WARP_SIZE = 32;
@@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed,
 const int num = curand(&rng) % (idx + 1);
 #endif
 if (num < k) {
-paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num,
-idx);
+phi::CudaAtomicMax(output_idxs + out_row_start + num, idx);
 }
 }
 #ifdef PADDLE_WITH_CUDA
...
@@ -22,7 +22,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/group_norm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
 typedef cub::WarpReduce<T> WarpReduce;
 typename WarpReduce::TempStorage temp_storage;
 value = WarpReduce(temp_storage).Sum(value);
-if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
+if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value);
 }
 template <typename T>
@@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
 if (flags & kHasScale) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
+phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
 #else
 CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data);
 #endif
 }
 if (flags & kHasBias) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
+phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
 #else
 CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data);
 #endif
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in,
 in_img_idx * num_channels + channel_id];
 }
 const T out_pos = out[out_id_h * output_w + out_id_w];
-platform::CudaAtomicAdd(in_pos, out_pos);
+phi::CudaAtomicAdd(in_pos, out_pos);
 }
 }
@@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in,
 const T* out_pos = &out[out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]);
 }
 }
 }
@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w],
 h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
 h1lambda * w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels],
 h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
 h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
 h1lambda * w1lambda * out_pos[0]);
 }
@@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
+phi::CudaAtomicAdd(&in_pos1[0],
 d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id],
+phi::CudaAtomicAdd(&in_pos1[w_id],
 d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
 d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
 d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
+phi::CudaAtomicAdd(&in_pos2[0],
 d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id],
+phi::CudaAtomicAdd(&in_pos2[w_id],
 d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
 d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
 d1lambda * h1lambda * w1lambda * out_pos[0]);
 } else {
 int in_pos1_idx = out_id_h * input_w +
 in_img_idt * in_img_h * in_img_w * num_channels +
@@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
+phi::CudaAtomicAdd(&in_pos1[0],
 d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos1[w_id * num_channels],
 d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
 d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
 d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
+phi::CudaAtomicAdd(&in_pos2[0],
 d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
+phi::CudaAtomicAdd(&in_pos2[w_id * num_channels],
 d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
 d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
 d1lambda * h1lambda * w1lambda * out_pos[0]);
 }
@@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in,
 in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
 access_x * num_channels + channel_id];
 }
-platform::CudaAtomicAdd(&in_pos[0],
+phi::CudaAtomicAdd(&in_pos[0],
 (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
 }
 }
 }
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/limit_by_capacity_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl(
 wid = i / n_expert;
 eid = i % n_expert;
 auto proposal = expc[wid * n_expert + eid];
-auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
+auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1));
 if (cap_left >= proposal) {
 out[wid * n_expert + eid] = proposal;
 } else if (cap_left >= 0) {
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 #ifdef PADDLE_WITH_CUDA
-paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
 #else
 for (int i = idx; i < D; i += blockDim.x) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 #endif
 idy += blockDim.y * gridDim.x;
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 for (size_t i = 0; i < cols; ++i) {
 T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
 z_data * y[i] * reciprocal_y_norm_square);
-platform::CudaAtomicAdd(dy + i, dy_data);
+phi::CudaAtomicAdd(dy + i, dy_data);
 }
 }
 }
...
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
 namespace math {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data,
 if (height_offset >= 0 && height_offset < im_height &&
 width_offset >= 0 && width_offset < im_width) {
-paddle::platform::CudaAtomicAdd(im_data + im_offset,
-col_data[col_offset]);
+phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
 }
 }
 }
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/sequence_pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 namespace math {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockSize>
 __global__ void SequenceScaleKernel(T* seq,
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/mean_iou_op.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void CountCUDAKernel(const int num_classes,
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/number_count_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers,
 #endif
 }
 if (threadIdx.x % WARP_SIZE == 0) {
-platform::CudaAtomicAdd(number_count + i, x);
+phi::CudaAtomicAdd(number_count + i, x);
 }
 }
 }
...
@@ -14,11 +14,11 @@
 #include "paddle/fluid/operators/one_hot_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename InT, typename OutT>
 __global__ void FillOutputKernel(const InT* p_in_data,
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
 // Since index in rows of SelectedRows can be duplicate, we have to use
 // Atomic Operation to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 tensor_out_ptr + index,
 -static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
 }
...
@@ -16,13 +16,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void Pad2DConstNCHW(const int nthreads,
@@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, 2 * in_height - in_h - 2);
 in_w = min(in_w, 2 * in_width - in_w - 2);
-platform::CudaAtomicAdd(
-&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+d_out_data[out_index]);
 }
 }
@@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, in_height * 2 - in_h - 2);
 in_w = min(in_w, in_width * 2 - in_w - 2);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
 d_out_data[out_index]);
 }
@@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size,
 nc /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
-&d_in_data[(nc * in_height + in_h) * in_width + in_w],
-d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+d_out_data[out_index]);
 }
 }
@@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size,
 n /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
 &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
 d_out_data[out_index]);
 }
...
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #endif
 namespace paddle {
@@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff,
 const T coeff) {
 bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
 if (!overflow) {
-paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
+phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
 }
 }
 #else
@@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff,
 #if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 DEVICE void AccumulateRois(T* offset, T data) {
-paddle::platform::CudaAtomicAdd(offset, data);
+phi::CudaAtomicAdd(offset, data);
 }
 #else
 template <typename T>
...
@@ -21,7 +21,7 @@
 // Licensed under the Apache License, Version 2.0 (the "License").
 #include "paddle/fluid/operators/prune_gate_by_capacity_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
@@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data,
 const int64_t batch_size) {
 CUDA_KERNEL_LOOP(i, batch_size) {
 auto orig_cap =
-platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
+phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
 if (orig_cap <= 0) {
 new_gate_idx_data[i] = -1;
 } else {
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/pull_box_extended_sparse_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "xpu/kernel/math.h" // NOLINT
 #else
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #endif
 #include "paddle/fluid/operators/pull_box_sparse_op.h"
@@ -46,9 +46,13 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(pull_box_sparse,
KP,
plat::XPUPlace,
ops::PullBoxSparseKernel<float>); ops::PullBoxSparseKernel<float>);
REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(push_box_sparse,
KP,
plat::XPUPlace,
ops::PushBoxSparseKernel<float>); ops::PushBoxSparseKernel<float>);
#else #else
REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>); REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
#include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/operators/pull_gpups_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h" #include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/operators/random_routing_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
......
...@@ -16,11 +16,11 @@ ...@@ -16,11 +16,11 @@
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -16,11 +16,11 @@ limitations under the License. */ ...@@ -16,11 +16,11 @@ limitations under the License. */
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, ...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length; for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) { tid_x += blockDim.x) {
platform::CudaAtomicAdd( phi::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x], &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) * dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length + x_item_length +
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -28,8 +28,8 @@ limitations under the License. */ ...@@ -28,8 +28,8 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#define FINAL_MASK 0xffffffff #define FINAL_MASK 0xffffffff
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, ...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input,
if (GetLaneId() == 0) { if (GetLaneId() == 0) {
#pragma unroll #pragma unroll
for (uint32_t i = 0; i < RadixSize; ++i) { for (uint32_t i = 0; i < RadixSize; ++i) {
platform::CudaAtomicAdd(&shared_mem[i], counts[i]); phi::CudaAtomicAdd(&shared_mem[i], counts[i]);
} }
} }
......
...@@ -16,9 +16,9 @@ limitations under the License. */ ...@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h"
......
...@@ -24,17 +24,15 @@ ...@@ -24,17 +24,15 @@
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
using paddle::platform::float16; using paddle::platform::float16;
using paddle::platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T> template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
CUDA_KERNEL_LOOP(i, num) { CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); }
paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
}
} }
template <typename T> template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
#define CUDA_ATOMIC_WRAPPER(op, T) \
__device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
#define USE_CUDA_ATOMIC(op, T) \
CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
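// For reference, USE_CUDA_ATOMIC(Add, float) expands through
// CUDA_ATOMIC_WRAPPER into the following device wrapper (the expansion is
// shown here only as an illustrative sketch of the macro mechanics):
//
//   __device__ __forceinline__ float CudaAtomicAdd(float *address,
//                                                  const float val) {
//     return atomicAdd(address, val);
//   }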
// Default thread count per block (or block size).
// TODO(typhoonzero): need to benchmark against setting this value
// to 1024.
constexpr int PADDLE_CUDA_NUM_THREADS = 512;
// For atomicAdd.
USE_CUDA_ATOMIC(Add, float);
USE_CUDA_ATOMIC(Add, int);
USE_CUDA_ATOMIC(Add, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT
CUDA_ATOMIC_WRAPPER(Add, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
return CudaAtomicAdd(
reinterpret_cast<unsigned long long int *>(address), // NOLINT
static_cast<unsigned long long int>(val)); // NOLINT
}
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
unsigned long long int *address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
old = atomicCAS(address_as_ull,
assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
#ifdef PADDLE_CUDA_FP16
// NOTE(dzhwinter): CUDA does not have atomicCAS for half.
// Instead, treat the half address as part of an unsigned 32-bit value and
// do the atomicCAS on that. Depending on whether the half is stored in the
// high or the low 16 bits, a different sum and CAS are performed.
// Since most warp threads will fail the atomicCAS, this implementation
// should be avoided under high concurrency; it is slower than converting
// the value to 32 bits and doing a full atomicCAS.
// The value is converted to float for the add arithmetic, and the result
// is then stored back into a uint32.
inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
float16 low_half;
// the float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
float16 high_half;
// the float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
return *reinterpret_cast<float16 *>(&x);
}
static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
return *reinterpret_cast<__half *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, float16) {
return CUDAFP16ToPDFP16(
atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, float16) {
// The packed float16 value may exist in either the lower or the higher
// 16 bits of the 32-bit address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
template <typename T, bool IsAvailable, typename NVType, typename NVVec2Type>
struct VecAtomicAddHelperBase {
static constexpr auto kIsAvailable = IsAvailable;
using NVT = NVType;
using NVVec2T = NVVec2Type;
};
template <typename T>
struct VecAtomicAddHelper : VecAtomicAddHelperBase<T, false, void, void> {};
#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
template <>
struct VecAtomicAddHelper<platform::float16>
: VecAtomicAddHelperBase<platform::float16, true, __half, __half2> {};
#endif
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
template <>
struct VecAtomicAddHelper<platform::bfloat16>
: VecAtomicAddHelperBase<platform::bfloat16,
true,
__nv_bfloat16,
__nv_bfloat162> {};
#endif
// The performance of "atomicAdd(half*)" is poor, while "atomicAdd(half2*)"
// is good, so for the fp16 type we use "atomicAdd(half2*)" to speed it up.
template <typename T,
typename std::enable_if<VecAtomicAddHelper<T>::kIsAvailable>::type * =
nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *tensor,
size_t index,
const size_t numel,
T value) {
// Check whether the address is aligned to a 2-element vector (e.g. __half2).
using NVT = typename VecAtomicAddHelper<T>::NVT;
using NVVec2T = typename VecAtomicAddHelper<T>::NVVec2T;
NVT *target_addr = reinterpret_cast<NVT *>(tensor + index);
bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(NVVec2T) == 0);
if (aligned_half2 && index < (numel - 1)) {
NVVec2T value2;
value2.x = *reinterpret_cast<NVT *>(&value);
value2.y = 0.0;
atomicAdd(reinterpret_cast<NVVec2T *>(target_addr), value2);
} else if (!aligned_half2 && index > 0) {
NVVec2T value2;
value2.x = 0.0;
value2.y = *reinterpret_cast<NVT *>(&value);
atomicAdd(reinterpret_cast<NVVec2T *>(target_addr - 1), value2);
} else {
atomicAdd(reinterpret_cast<NVT *>(tensor) + index,
*reinterpret_cast<NVT *>(&value));
}
}
template <typename T,
typename std::enable_if<!VecAtomicAddHelper<T>::kIsAvailable>::type
* = nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *arr,
size_t index,
const size_t numel,
T value) {
CudaAtomicAdd(arr + index, value);
}
#endif
// NOTE(zhangbo): CUDA does not have atomicCAS for __nv_bfloat16.
inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
bfloat16 low_half;
// the bfloat16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
bfloat16 high_half;
// the bfloat16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
return *reinterpret_cast<bfloat16 *>(&x);
}
static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
return *reinterpret_cast<__nv_bfloat16 *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
PDBF16ToCUDABF16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
// The packed bfloat16 value may exist in either the lower or the higher
// 16 bits of the 32-bit address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The bfloat16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The bfloat16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex<float>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex<double>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
// For atomicMax
USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT
if (*address >= val) {
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
do {
assumed = old;
if (assumed >= val) {
break;
}
old = atomicCAS(address, assumed, val);
} while (assumed != old);
}
#endif
CUDA_ATOMIC_WRAPPER(Max, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
long long int res = *address; // NOLINT
while (val > res) {
long long int old = res; // NOLINT
res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT
(unsigned long long int)old, // NOLINT
(unsigned long long int)val); // NOLINT
if (res == old) {
break;
}
}
return res;
}
CUDA_ATOMIC_WRAPPER(Max, float) {
if (*address >= val) {
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
if (__int_as_float(assumed) >= val) {
break;
}
old = atomicCAS(address_as_i, assumed, __float_as_int(val));
} while (assumed != old);
return __int_as_float(old);
}
CUDA_ATOMIC_WRAPPER(Max, double) {
if (*address >= val) {
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
if (__longlong_as_double(assumed) >= val) {
break;
}
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
} while (assumed != old);
return __longlong_as_double(old);
}
#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(max(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(max(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
CUDA_ATOMIC_WRAPPER(Max, float16) {
if (*address >= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
// For atomicMin
USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here
// because unsigned long long int is not necessarily uint64_t.
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT
if (*address <= val) {
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
do {
assumed = old;
if (assumed <= val) {
break;
}
old = atomicCAS(address, assumed, val);
} while (assumed != old);
// Return the previous value on every path; the original code fell off the
// end of this non-void wrapper, which is undefined behavior.
return old;
}
#endif
CUDA_ATOMIC_WRAPPER(Min, int64_t) {
// Here, we check that long long int is the same size as int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64");
long long int res = *address; // NOLINT
while (val < res) {
long long int old = res; // NOLINT
res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT
(unsigned long long int)old, // NOLINT
(unsigned long long int)val); // NOLINT
if (res == old) {
break;
}
}
return res;
}
CUDA_ATOMIC_WRAPPER(Min, float) {
if (*address <= val) {
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
int old = *address_as_i, assumed;
do {
assumed = old;
if (__int_as_float(assumed) <= val) {
break;
}
old = atomicCAS(address_as_i, assumed, __float_as_int(val));
} while (assumed != old);
return __int_as_float(old);
}
CUDA_ATOMIC_WRAPPER(Min, double) {
if (*address <= val) {
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int *>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; // NOLINT
do {
assumed = old;
if (__longlong_as_double(assumed) <= val) {
break;
}
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
} while (assumed != old);
return __longlong_as_double(old);
}
#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(min(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(min(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
CUDA_ATOMIC_WRAPPER(Min, float16) {
if (*address <= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
#ifdef PADDLE_WITH_CUDA
/*
* One thread block handles the elementwise atomicAdd of a vector of length len.
* @in: [x1, x2, x3, ...]
* @out:[y1+x1, y2+x2, y3+x3, ...]
* */
template <typename T,
typename std::enable_if<!VecAtomicAddHelper<T>::kIsAvailable>::type
* = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
for (int i = tid; i < len; i += threads_per_block) {
CudaAtomicAdd(&out[i], in[i]);
}
}
// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
template <typename T,
typename std::enable_if<VecAtomicAddHelper<T>::kIsAvailable>::type * =
nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
int i = 0;
int loops = len / 2 * 2;
using NVT = typename VecAtomicAddHelper<T>::NVT;
using NVVec2T = typename VecAtomicAddHelper<T>::NVVec2T;
bool aligned_half2 =
(reinterpret_cast<std::uintptr_t>(out) % sizeof(NVT) == 0);
if (aligned_half2) {
for (i = tid * 2; i < loops; i += threads_per_block * 2) {
NVVec2T value2;
T value_1 = in[i];
T value_2 = in[i + 1];
value2.x = *reinterpret_cast<NVT *>(&value_1);
value2.y = *reinterpret_cast<NVT *>(&value_2);
atomicAdd(reinterpret_cast<NVVec2T *>(&out[i]), value2);
}
for (; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
} else {
for (int i = tid; i < len; i += threads_per_block) {
fastAtomicAdd(out, i, len, in[i]);
}
}
}
#endif
} // namespace platform
} // namespace paddle
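For reference, a minimal sketch of how a kernel consumes the fp16 helpers defined in the header above; the kernel name and launch shape are illustrative only, it assumes PADDLE_CUDA_FP16 is defined, and after this commit the same helpers are reached through paddle/phi/backends/gpu/gpu_primitives.h in the phi namespace:

__global__ void AccumulateFp16Sketch(const paddle::platform::float16* src,
                                     paddle::platform::float16* dst,
                                     size_t numel) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < numel) {
    // Uses atomicAdd(__half2*) when the destination address allows it,
    // otherwise falls back to the CAS-based CudaAtomicAdd above.
    paddle::platform::fastAtomicAdd(dst, i, numel, src[i]);
  }
}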
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......