Unverified commit 9918bf9c, authored by Wang Xin, committed by GitHub

[PHI decoupling] remove "gpu_primitives.h" in fluid (#48063)

* remove "gpu_primitives.h" in fluid namespace

* fix PR-CI-GpuPS fail

* fix PR-CI-GpuPS fail
Parent a33d563c
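The change is mechanical and repeats across every touched file: the fluid header "paddle/fluid/platform/device/gpu/gpu_primitives.h" is swapped for "paddle/phi/backends/gpu/gpu_primitives.h", and the helpers it provides (PADDLE_CUDA_NUM_THREADS, CudaAtomicAdd, CudaAtomicMax, fastAtomicAdd, VectorizedAtomicAddPerBlock) are now qualified with phi:: instead of paddle::platform::. A minimal before/after sketch of that pattern follows; the kernel and its names are illustrative only and are not taken from this PR:

// Before (fluid): a sketch of a kernel built against the old header.
//   #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
//   ...
//   paddle::platform::CudaAtomicAdd(&dst[idx[i]], src[i]);

// After (phi): the same hypothetical kernel against the phi header.
#include "paddle/phi/backends/gpu/gpu_primitives.h"

__global__ void ScatterAddKernel(float* dst, const float* src,
                                 const int* idx, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // The atomic helper now lives in the phi namespace.
    phi::CudaAtomicAdd(&dst[idx[i]], src[i]);
  }
}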
@@ -13,12 +13,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HETERPS
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
@@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest,
 int x = low;
 int y = i - (x ? len[x - 1] : 0);
 float* feature_value_ptr =
-    (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
+    (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));  // NOLINT
 int mf_dim = gpu_dim[x] - 3;
 gpu_accessor.Select(
     dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
@@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N,
 return;
 }
-float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
+float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *  // NOLINT
     uint64_t(max_val_size));
 switch (off) {
 case 0:
@@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest,
 }
 int x = low;
 int y = i - (x ? len[low - 1] : 0);
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size);  // NOLINT
-cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
+cur[gpu_accessor.common_push_value.SlotIndex()] =
+    (float)slot_vector[x];  // NOLINT
 int mf_dim = mf_dim_vector[x];
 cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
@@ -170,21 +171,20 @@ __global__ void PushMergeCopyAtomic(const size_t N,
 int y = i - slot_lens[x];
 const float* ptr = src[x] + y * hidden;
-float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
+float* cur =
+    (float*)((char*)dest + d_restore_idx[i] * grad_value_size);  // NOLINT
 int mf_dim = slot_dims[x] - 3;
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x];  // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
-paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
-    *(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off));
 break;
 case 1:
-paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
-    *(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off));
 break;
 case 2:
-paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
+phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
     *(ptr + off) * -1. * bs);
 break;
 default:
@@ -192,8 +192,7 @@ __global__ void PushMergeCopyAtomic(const size_t N,
 if (mf_dim < embedx_idx) {
 return;
 }
-paddle::platform::CudaAtomicAdd(
-    &cur[accessor.EmbedxGIndex() + embedx_idx],
+phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx],
     *(ptr + off) * -1. * bs);
 break;
 }
@@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N,
 int i = idx / hidden;
 int off = idx % hidden;
 // filter 0 keys
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size);  // NOLINT
 if (total_keys[i] == 0) {
 switch (off) {
@@ -262,7 +261,7 @@ __global__ void PushMergeCopy(const size_t N,
 switch (off) {
 case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x];  // NOLINT
 cur[accessor.MfDimIndex()] = mf_dim;
 SUM_GRAD_VALUE
 cur[accessor.ShowIndex()] = val;
@@ -331,8 +330,8 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
 const uint64_t total_length,
 const int batch_size,
 size_t grad_value_size,
-std::vector<int>& slot_vector,
-std::vector<int>& slot_mf_dim_vector) {
+std::vector<int>& slot_vector,  // NOLINT
+std::vector<int>& slot_mf_dim_vector) {  // NOLINT
 auto stream = dynamic_cast<phi::GPUContext*>(
     paddle::platform::DeviceContextPool::Instance().Get(place))
     ->stream();
...
@@ -22,12 +22,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace framework {
-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
 #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
 #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
...
@@ -20,8 +20,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
 #include "paddle/fluid/operators/fused_token_prune_op.cu.h"
@@ -149,7 +149,7 @@ __global__ void ReduceSum2<half>(
 }
 if (tid == 0) {
-platform::fastAtomicAdd<platform::float16>(
+phi::fastAtomicAdd<platform::float16>(
     reinterpret_cast<platform::float16*>(dst),
     static_cast<size_t>(batch * max_seq_len + col),
     static_cast<size_t>(bsz * max_seq_len),
...
@@ -23,7 +23,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -23,8 +23,8 @@ We retain the following license from the original files:
 #include "paddle/fluid/operators/assign_pos_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 DECLARE_bool(avoid_op_randomness);
@@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count,
 CUDA_KERNEL_LOOP(i, limit) {
 int number_idx = numbers[i];
 if (number_idx > -1) {
-int p = platform::CudaAtomicAdd(cum_count + number_idx, -1);
+int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
 out[p - 1] = i;
 }
 }
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 namespace paddle {
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/bilateral_slice_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,11 +16,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/center_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
 __global__ void ComputeDifferent(T *centers_diff,
@@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers,
 const T *diff = centers_diff + idy * D;
 T *cent = centers + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
+phi::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table,
 auto id = ids[row];
 if (id >= start_idx && id < end_idx) {
 auto real_idx = id - start_idx;
-paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
-    output[i]);
+phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
 }
 }
 }
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_shift_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -16,12 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/cvm_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -29,7 +29,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
 using DataLayout = phi::DataLayout;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
...
@@ -32,7 +32,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -41,7 +41,7 @@ namespace operators {
 using Tensor = phi::DenseTensor;
 using LoDTensor = phi::DenseTensor;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 static inline int GET_BLOCKS(const int N) {
 return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
@@ -447,17 +447,13 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 // compute gradient of input
 if (bottom_data_diff) {
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y0 * width + x0,
-    q00 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y1 * width + x0,
-    q01 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y0 * width + x1,
-    q10 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y1 * width + x1,
-    q11 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0,
+    q00 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0,
+    q01 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1,
+    q10 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1,
+    q11 * diff_val);
 }
@@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
 u00 * (1 - dist_x)) *
     trans_std * diff_val;
 diff_y *= roi_height;
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
 }
 }
 }
...
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/box_clip_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -11,7 +11,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -26,8 +26,8 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 namespace paddle {
@@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads,
 const int* batch_ids,
 int* length_lod) {
 CUDA_KERNEL_LOOP(i, nthreads) {
-platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
+phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
 }
 }
...
@@ -14,13 +14,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 using Tensor = phi::DenseTensor;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 #define CUDA_BLOCK_SIZE 16
 template <typename T>
...
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 using paddle::platform::float16;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 namespace paddle {
 namespace operators {
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math.h"
...
@@ -43,7 +43,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #endif
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 namespace paddle {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -35,7 +35,7 @@ class ReduceAdd {
 typename tensor_t,
 std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
 __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
-platform::CudaAtomicAdd(self_data, *src_data);
+phi::CudaAtomicAdd(self_data, *src_data);
 }
 template <typename tensor_t,
 std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>
...
@@ -41,8 +41,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/graph_khop_sampler_imp.h"
 #include "paddle/fluid/operators/graph_khop_sampler_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 constexpr int WARP_SIZE = 32;
@@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed,
 const int num = curand(&rng) % (idx + 1);
 #endif
 if (num < k) {
-paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num,
-    idx);
+phi::CudaAtomicMax(output_idxs + out_row_start + num, idx);
 }
 }
 #ifdef PADDLE_WITH_CUDA
...
@@ -22,7 +22,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/group_norm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
 typedef cub::WarpReduce<T> WarpReduce;
 typename WarpReduce::TempStorage temp_storage;
 value = WarpReduce(temp_storage).Sum(value);
-if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
+if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value);
 }
 template <typename T>
@@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
 if (flags & kHasScale) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
+phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
 #else
 CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data);
 #endif
 }
 if (flags & kHasBias) {
 #if CUDA_VERSION >= 11070
-platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
+phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
 #else
 CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data);
 #endif
...
@@ -14,7 +14,7 @@
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in,
 in_img_idx * num_channels + channel_id];
 }
 const T out_pos = out[out_id_h * output_w + out_id_w];
-platform::CudaAtomicAdd(in_pos, out_pos);
+phi::CudaAtomicAdd(in_pos, out_pos);
 }
 }
@@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in,
 const T* out_pos = &out[out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-    w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]);
 }
 }
 }
@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 if (data_layout == DataLayout::kNCHW) {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
-    h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
-    h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w],
+    h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
+    h1lambda * w1lambda * out_pos[0]);
 } else {
-platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
-    h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
-    h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
-    &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
-    h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[w_id * num_channels],
+    h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
+    h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(
+    &in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
+    h1lambda * w1lambda * out_pos[0]);
 }
@@ -638,21 +637,21 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
-    d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id],
-    d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
-    d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
-    d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
-    d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id],
-    d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
-    d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
-    d1lambda * h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[0],
+    d2lambda * h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[w_id],
+    d2lambda * h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
+    d2lambda * h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
+    d2lambda * h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[0],
+    d1lambda * h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[w_id],
+    d1lambda * h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
+    d1lambda * h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
+    d1lambda * h1lambda * w1lambda * out_pos[0]);
 } else {
 int in_pos1_idx = out_id_h * input_w +
@@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in,
 const T* out_pos = &out[out_id_h * output_w + out_id_w];
 // trilinear interpolation grad
-platform::CudaAtomicAdd(&in_pos1[0],
-    d2lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
-    d2lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
-    d2lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
-    &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
-    d2lambda * h1lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[0],
-    d1lambda * h2lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
-    d1lambda * h2lambda * w1lambda * out_pos[0]);
-platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
-    d1lambda * h1lambda * w2lambda * out_pos[0]);
-platform::CudaAtomicAdd(
-    &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
-    d1lambda * h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[0],
+    d2lambda * h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[w_id * num_channels],
+    d2lambda * h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
+    d2lambda * h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(
+    &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
+    d2lambda * h1lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[0],
+    d1lambda * h2lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[w_id * num_channels],
+    d1lambda * h2lambda * w1lambda * out_pos[0]);
+phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
+    d1lambda * h1lambda * w2lambda * out_pos[0]);
+phi::CudaAtomicAdd(
+    &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
+    d1lambda * h1lambda * w1lambda * out_pos[0]);
 }
@@ -903,7 +902,7 @@ __global__ void KeBicubicInterpBw(T* in,
 in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
     access_x * num_channels + channel_id];
 }
-platform::CudaAtomicAdd(&in_pos[0],
-    (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
+phi::CudaAtomicAdd(&in_pos[0],
+    (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
 }
 }
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/limit_by_capacity_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl(
 wid = i / n_expert;
 eid = i % n_expert;
 auto proposal = expc[wid * n_expert + eid];
-auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
+auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1));
 if (cap_left >= proposal) {
 out[wid * n_expert + eid] = proposal;
 } else if (cap_left >= 0) {
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 idy += BlockDimY * GridDimX;
 }
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table,
 const T *out = output + idy * D;
 T *tab = table + id * D;
 #ifdef PADDLE_WITH_CUDA
-paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
 #else
 for (int i = idx; i < D; i += blockDim.x) {
-paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+phi::CudaAtomicAdd(&tab[i], out[i]);
 }
 #endif
 idy += blockDim.y * gridDim.x;
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 for (size_t i = 0; i < cols; ++i) {
 T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
     z_data * y[i] * reciprocal_y_norm_square);
-platform::CudaAtomicAdd(dy + i, dy_data);
+phi::CudaAtomicAdd(dy + i, dy_data);
 }
 }
 }
...
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math.h"
 namespace paddle {
 namespace operators {
 namespace math {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data,
 if (height_offset >= 0 && height_offset < im_height &&
     width_offset >= 0 && width_offset < im_width) {
-paddle::platform::CudaAtomicAdd(im_data + im_offset,
-    col_data[col_offset]);
+phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
 }
 }
 }
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/sequence_pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
...
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
 namespace math {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, int BlockSize>
 __global__ void SequenceScaleKernel(T* seq,
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
...
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/mean_iou_op.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void CountCUDAKernel(const int num_classes,
...
@@ -22,8 +22,8 @@
 #include "paddle/fluid/operators/number_count_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers,
 #endif
 }
 if (threadIdx.x % WARP_SIZE == 0) {
-platform::CudaAtomicAdd(number_count + i, x);
+phi::CudaAtomicAdd(number_count + i, x);
 }
 }
 }
...
@@ -14,11 +14,11 @@
 #include "paddle/fluid/operators/one_hot_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename InT, typename OutT>
 __global__ void FillOutputKernel(const InT* p_in_data,
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 namespace paddle {
 namespace operators {
@@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
 // Since index in rows of SelectedRows can be duplicate, we have to use
 // Atomic Operation to avoid concurrent write error.
-paddle::platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
     tensor_out_ptr + index,
     -static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
 }
...
@@ -16,13 +16,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T>
 __global__ void Pad2DConstNCHW(const int nthreads,
@@ -257,8 +257,7 @@ __global__ void Pad2DGradReflectNCHW(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, 2 * in_height - in_h - 2);
 in_w = min(in_w, 2 * in_width - in_w - 2);
-platform::CudaAtomicAdd(
-    &d_in_data[(nc * in_height + in_h) * in_width + in_w],
-    d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+    d_out_data[out_index]);
 }
 }
@@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size,
 in_w = max(in_w, -in_w);
 in_h = min(in_h, in_height * 2 - in_h - 2);
 in_w = min(in_w, in_width * 2 - in_w - 2);
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
     &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
     d_out_data[out_index]);
 }
@@ -313,8 +312,7 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size,
 nc /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
-    &d_in_data[(nc * in_height + in_h) * in_width + in_w],
-    d_out_data[out_index]);
+phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
+    d_out_data[out_index]);
 }
 }
@@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size,
 n /= out_height;
 const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
 const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-platform::CudaAtomicAdd(
+phi::CudaAtomicAdd(
     &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
     d_out_data[out_index]);
 }
...
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#endif #endif
namespace paddle { namespace paddle {
...@@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff, ...@@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff,
const T coeff) { const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) { if (!overflow) {
paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
} }
} }
#else #else
...@@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, ...@@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff,
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
template <typename T> template <typename T>
DEVICE void AccumulateRois(T* offset, T data) { DEVICE void AccumulateRois(T* offset, T data) {
paddle::platform::CudaAtomicAdd(offset, data); phi::CudaAtomicAdd(offset, data);
} }
#else #else
template <typename T> template <typename T>
......
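Note: prroi_pool.h is compiled for both CPU and GPU builds, so the accumulation primitive is selected at preprocessing time — phi::CudaAtomicAdd under __NVCC__/__HIPCC__, a plain in-place add otherwise. A condensed sketch of that dispatch (names simplified, illustrative only):

#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/backends/gpu/gpu_primitives.h"
template <typename T>
__device__ void AccumulateSketch(T* offset, T data) {
  // GPU build: many threads may target the same address, use the atomic.
  phi::CudaAtomicAdd(offset, data);
}
#else
template <typename T>
void AccumulateSketch(T* offset, T data) {
  // CPU build: execution is sequential here, a plain add is enough.
  *offset += data;
}
#endif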
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
// Licensed under the Apache License, Version 2.0 (the "License"). // Licensed under the Apache License, Version 2.0 (the "License").
#include "paddle/fluid/operators/prune_gate_by_capacity_op.h" #include "paddle/fluid/operators/prune_gate_by_capacity_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
...@@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data, ...@@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data,
const int64_t batch_size) { const int64_t batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) { CUDA_KERNEL_LOOP(i, batch_size) {
auto orig_cap = auto orig_cap =
platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
if (orig_cap <= 0) { if (orig_cap <= 0) {
new_gate_idx_data[i] = -1; new_gate_idx_data[i] = -1;
} else { } else {
......
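Note: the kernel above relies on CudaAtomicAdd returning the value stored *before* the update — each token atomically decrements its expert's remaining capacity, and a non-positive old value means the expert was already full. A standalone sketch of that counter idiom (hypothetical names, assuming an int64_t capacity array):

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Hypothetical kernel: remaining_capacity[e] starts at the capacity of
// expert e; every token tries to claim one slot and is dropped (-1) if the
// expert was already full when it arrived.
__global__ void ClaimExpertSlotSketch(const int64_t* expert_id,
                                      int64_t* remaining_capacity,
                                      int64_t* new_gate_idx,
                                      int64_t batch_size) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= batch_size) return;
  // CudaAtomicAdd returns the value held *before* this thread's decrement.
  int64_t before = phi::CudaAtomicAdd(&remaining_capacity[expert_id[i]],
                                      static_cast<int64_t>(-1));
  new_gate_idx[i] = (before <= 0) ? -1 : expert_id[i];
}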
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" #include "paddle/fluid/operators/pull_box_extended_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -37,7 +37,7 @@ limitations under the License. */ ...@@ -37,7 +37,7 @@ limitations under the License. */
#include "xpu/kernel/math.h" // NOLINT #include "xpu/kernel/math.h" // NOLINT
#else #else
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#endif #endif
#include "paddle/fluid/operators/pull_box_sparse_op.h" #include "paddle/fluid/operators/pull_box_sparse_op.h"
...@@ -46,9 +46,13 @@ namespace ops = paddle::operators; ...@@ -46,9 +46,13 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(pull_box_sparse,
KP,
plat::XPUPlace,
ops::PullBoxSparseKernel<float>); ops::PullBoxSparseKernel<float>);
REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, REGISTER_OP_KERNEL(push_box_sparse,
KP,
plat::XPUPlace,
ops::PushBoxSparseKernel<float>); ops::PushBoxSparseKernel<float>);
#else #else
REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>); REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
#include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/operators/pull_gpups_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h" #include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/operators/random_routing_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
......
...@@ -16,11 +16,11 @@ ...@@ -16,11 +16,11 @@
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -16,11 +16,11 @@ limitations under the License. */ ...@@ -16,11 +16,11 @@ limitations under the License. */
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor;
template <typename T> template <typename T>
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, ...@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length; for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) { tid_x += blockDim.x) {
platform::CudaAtomicAdd( phi::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x], &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) * dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length + x_item_length +
......
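Note: in this backward kernel all expanded copies of a row fold back into the same slice of dx, so the accumulation has to be atomic. A reduced sketch of the idea for a single row repeated `repeat` times (hypothetical kernel, launched with a 2-D thread block such as dim3(32, 8)):

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Hypothetical reduced form: fold `repeat` expanded copies of one row of
// d_out back into the single original row d_x. Copies overlap on the same
// destination elements, hence the atomic add.
template <typename T>
__global__ void FoldRepeatsSketch(const T* d_out,  // [repeat, width]
                                  T* d_x,          // [width]
                                  int repeat,
                                  int width) {
  for (int r = threadIdx.y; r < repeat; r += blockDim.y) {
    for (int c = threadIdx.x; c < width; c += blockDim.x) {
      phi::CudaAtomicAdd(&d_x[c], d_out[r * width + c]);
    }
  }
}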
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -28,8 +28,8 @@ limitations under the License. */ ...@@ -28,8 +28,8 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#define FINAL_MASK 0xffffffff #define FINAL_MASK 0xffffffff
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, ...@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input,
if (GetLaneId() == 0) { if (GetLaneId() == 0) {
#pragma unroll #pragma unroll
for (uint32_t i = 0; i < RadixSize; ++i) { for (uint32_t i = 0; i < RadixSize; ++i) {
platform::CudaAtomicAdd(&shared_mem[i], counts[i]); phi::CudaAtomicAdd(&shared_mem[i], counts[i]);
} }
} }
......
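Note: in the radix top-k pass each warp first reduces its per-digit counts into registers; only lane 0 then flushes them into the block-shared histogram, so the shared-memory atomics drop to one per warp per digit. A reduced sketch of that flush step only (assumes counts[] already holds the warp-reduced totals and, for brevity, a warp size of 32):

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Sketch: write one warp's per-digit totals into the shared histogram.
template <int RadixSize>
__device__ void FlushWarpCountsSketch(const int (&counts)[RadixSize],
                                      int* shared_hist) {
  if ((threadIdx.x & 31) == 0) {  // lane 0 of the warp (assumes warp size 32)
#pragma unroll
    for (int d = 0; d < RadixSize; ++d) {
      // Warps of the same block flush concurrently, hence the atomic.
      phi::CudaAtomicAdd(&shared_hist[d], counts[d]);
    }
  }
}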
...@@ -16,9 +16,9 @@ limitations under the License. */ ...@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h"
......
...@@ -24,17 +24,15 @@ ...@@ -24,17 +24,15 @@
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
using paddle::platform::float16; using paddle::platform::float16;
using paddle::platform::PADDLE_CUDA_NUM_THREADS; using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T> template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
CUDA_KERNEL_LOOP(i, num) { CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); }
paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
}
} }
template <typename T> template <typename T>
......
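Note: this test kernel is the minimal use of the migrated primitives — phi::PADDLE_CUDA_NUM_THREADS sizes the block and phi::CudaAtomicAdd does the accumulation. A hedged sketch of a typical launch for such a kernel (illustrative helper, not the test's actual harness; the grid-stride loop is spelled out instead of using CUDA_KERNEL_LOOP):

#include "paddle/phi/backends/gpu/gpu_primitives.h"

using phi::PADDLE_CUDA_NUM_THREADS;

template <typename T>
__global__ void AtomicAddSketch(const T* a, T* b, size_t n) {
  // Grid-stride loop, equivalent in spirit to CUDA_KERNEL_LOOP above.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<size_t>(blockDim.x) * gridDim.x) {
    phi::CudaAtomicAdd(&b[i], a[i]);
  }
}

// Illustrative launcher: one thread per element, PADDLE_CUDA_NUM_THREADS
// threads per block, on the caller's stream.
template <typename T>
void LaunchAtomicAddSketch(const T* a, T* b, size_t n, cudaStream_t stream) {
  int blocks = static_cast<int>((n + PADDLE_CUDA_NUM_THREADS - 1) /
                                PADDLE_CUDA_NUM_THREADS);
  AtomicAddSketch<T><<<blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(a, b, n);
}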
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......