Unverified commit 9918bf9c authored by Wang Xin, committed by GitHub

[PHI decoupling] remove "gpu_primitives.h" in fluid (#48063)

* remove "gpu_primitives.h" in fluid namespace

* fix PR-CI-GpuPS failure

* fix PR-CI-GpuPS failure
Parent a33d563c
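The change follows one mechanical pattern across every file below: drop the fluid header "paddle/fluid/platform/device/gpu/gpu_primitives.h" in favor of "paddle/phi/backends/gpu/gpu_primitives.h", and requalify the primitives it provides (CudaAtomicAdd, CudaAtomicMax, fastAtomicAdd, VectorizedAtomicAddPerBlock, PADDLE_CUDA_NUM_THREADS) from paddle::platform:: to phi::. A minimal sketch of the after-state follows; AccumulateKernel is a hypothetical kernel invented for illustration, not code from this PR:

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Hypothetical kernel: atomically sums src[0..n) into *dst using the
// relocated phi atomic helper.
__global__ void AccumulateKernel(const float* src, float* dst, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    // Previously: paddle::platform::CudaAtomicAdd(dst, src[i]);
    phi::CudaAtomicAdd(dst, src[i]);
  }
}

// Launch configuration reuses the relocated thread-count constant.
// Previously: platform::PADDLE_CUDA_NUM_THREADS.
const int kThreads = phi::PADDLE_CUDA_NUM_THREADS;

Call sites keep their semantics; only the include path and namespace move, which is why the diff is wide but behavior-neutral.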
......@@ -13,12 +13,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace framework {
const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
......@@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest,
int x = low;
int y = i - (x ? len[x - 1] : 0);
float* feature_value_ptr =
(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT
int mf_dim = gpu_dim[x] - 3;
gpu_accessor.Select(
dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
......@@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N,
return;
}
float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT
uint64_t(max_val_size));
switch (off) {
case 0:
......@@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest,
}
int x = low;
int y = i - (x ? len[low - 1] : 0);
float* cur = (float*)((char*)dest + i * grad_value_size);
float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
cur[gpu_accessor.common_push_value.SlotIndex()] =
(float)slot_vector[x]; // NOLINT
int mf_dim = mf_dim_vector[x];
cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;
......@@ -170,31 +171,29 @@ __global__ void PushMergeCopyAtomic(const size_t N,
int y = i - slot_lens[x];
const float* ptr = src[x] + y * hidden;
float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
float* cur =
(float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT
int mf_dim = slot_dims[x] - 3;
switch (off) {
case 0:
cur[accessor.SlotIndex()] = (float)slot_vector[x];
cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
cur[accessor.MfDimIndex()] = mf_dim;
paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
*(ptr + off));
phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off));
break;
case 1:
paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
*(ptr + off));
phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off));
break;
case 2:
paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
*(ptr + off) * -1. * bs);
phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
*(ptr + off) * -1. * bs);
break;
default:
int embedx_idx = off - 3;
if (mf_dim < embedx_idx) {
return;
}
paddle::platform::CudaAtomicAdd(
&cur[accessor.EmbedxGIndex() + embedx_idx],
*(ptr + off) * -1. * bs);
phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx],
*(ptr + off) * -1. * bs);
break;
}
}
......@@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N,
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
float* cur = (float*)((char*)dest + i * grad_value_size);
float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT
if (total_keys[i] == 0) {
switch (off) {
......@@ -262,7 +261,7 @@ __global__ void PushMergeCopy(const size_t N,
switch (off) {
case 0:
cur[accessor.SlotIndex()] = (float)slot_vector[x];
cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
cur[accessor.MfDimIndex()] = mf_dim;
SUM_GRAD_VALUE
cur[accessor.ShowIndex()] = val;
......@@ -331,8 +330,8 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) {
std::vector<int>& slot_vector, // NOLINT
std::vector<int>& slot_mf_dim_vector) { // NOLINT
auto stream = dynamic_cast<phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
......
......@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace framework {
const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
......
......@@ -20,8 +20,8 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
#include "paddle/fluid/operators/fused_token_prune_op.cu.h"
......@@ -149,7 +149,7 @@ __global__ void ReduceSum2<half>(
}
if (tid == 0) {
platform::fastAtomicAdd<platform::float16>(
phi::fastAtomicAdd<platform::float16>(
reinterpret_cast<platform::float16*>(dst),
static_cast<size_t>(batch * max_seq_len + col),
static_cast<size_t>(bsz * max_seq_len),
......
......@@ -23,7 +23,7 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -23,8 +23,8 @@ We retain the following license from the original files:
#include "paddle/fluid/operators/assign_pos_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
DECLARE_bool(avoid_op_randomness);
......@@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count,
CUDA_KERNEL_LOOP(i, limit) {
int number_idx = numbers[i];
if (number_idx > -1) {
int p = platform::CudaAtomicAdd(cum_count + number_idx, -1);
int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
out[p - 1] = i;
}
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/batch_fc_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/bilateral_slice_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -16,11 +16,11 @@ limitations under the License. */
#include "paddle/fluid/operators/center_loss_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void ComputeDifferent(T *centers_diff,
......@@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers,
const T *diff = centers_diff + idy * D;
T *cent = centers + id * D;
for (int i = idx; i < D; i += BlockDimX) {
paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
phi::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
}
idy += BlockDimY * GridDimX;
}
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table,
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
output[i]);
phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
}
}
}
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_split_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -16,12 +16,12 @@ limitations under the License. */
#include "paddle/fluid/operators/cvm_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/data_norm_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
......@@ -29,7 +29,7 @@ namespace operators {
using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
......
......@@ -32,7 +32,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -41,7 +41,7 @@ namespace operators {
using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
static inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
......@@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
// compute gradient of input
if (bottom_data_diff) {
platform::CudaAtomicAdd(
bottom_data_diff + bottom_index + y0 * width + x0,
q00 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + bottom_index + y1 * width + x0,
q01 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + bottom_index + y0 * width + x1,
q10 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + bottom_index + y1 * width + x1,
q11 * diff_val);
phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0,
q00 * diff_val);
phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0,
q01 * diff_val);
phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1,
q10 * diff_val);
phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1,
q11 * diff_val);
}
// compute gradient of trans
......@@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(
u00 * (1 - dist_x)) *
trans_std * diff_val;
diff_y *= roi_height;
platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
}
}
}
......
......@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/dequantize_log_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -26,8 +26,8 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
namespace paddle {
......@@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads,
const int* batch_ids,
int* length_lod) {
CUDA_KERNEL_LOOP(i, nthreads) {
platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
}
}
......
......@@ -14,13 +14,13 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
#define CUDA_BLOCK_SIZE 16
template <typename T>
......
......@@ -15,12 +15,12 @@ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
using paddle::platform::float16;
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
namespace paddle {
namespace operators {
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math.h"
......
......@@ -43,7 +43,7 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
#endif
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace paddle {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/gather_scatter_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -35,7 +35,7 @@ class ReduceAdd {
typename tensor_t,
std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
__device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
platform::CudaAtomicAdd(self_data, *src_data);
phi::CudaAtomicAdd(self_data, *src_data);
}
template <typename tensor_t,
std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>
......
......@@ -41,8 +41,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/graph_khop_sampler_imp.h"
#include "paddle/fluid/operators/graph_khop_sampler_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
constexpr int WARP_SIZE = 32;
......@@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed,
const int num = curand(&rng) % (idx + 1);
#endif
if (num < k) {
paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num,
idx);
phi::CudaAtomicMax(output_idxs + out_row_start + num, idx);
}
}
#ifdef PADDLE_WITH_CUDA
......
......@@ -22,7 +22,7 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/group_norm_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
typedef cub::WarpReduce<T> WarpReduce;
typename WarpReduce::TempStorage temp_storage;
value = WarpReduce(temp_storage).Sum(value);
if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value);
}
template <typename T>
......@@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
if (flags & kHasScale) {
#if CUDA_VERSION >= 11070
platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data);
#else
CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data);
#endif
}
if (flags & kHasBias) {
#if CUDA_VERSION >= 11070
platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data);
#else
CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data);
#endif
......
......@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in,
in_img_idx * num_channels + channel_id];
}
const T out_pos = out[out_id_h * output_w + out_id_w];
platform::CudaAtomicAdd(in_pos, out_pos);
phi::CudaAtomicAdd(in_pos, out_pos);
}
}
......@@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in,
const T* out_pos = &out[out_id_w];
if (data_layout == DataLayout::kNCHW) {
platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
} else {
platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]);
}
}
}
......@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in,
const T* out_pos = &out[out_id_h * output_w + out_id_w];
if (data_layout == DataLayout::kNCHW) {
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[h_id * in_img_w],
h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id],
h1lambda * w1lambda * out_pos[0]);
} else {
platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[w_id * num_channels],
h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels],
h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(
&in_pos[h_id * in_img_w * num_channels + w_id * num_channels],
h1lambda * w1lambda * out_pos[0]);
}
......@@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in,
const T* out_pos = &out[out_id_h * output_w + out_id_w];
// trilinear interpolation grad
platform::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[w_id],
d2lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
d2lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
d2lambda * h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[w_id],
d1lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
d1lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
d1lambda * h1lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[w_id],
d2lambda * h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w],
d2lambda * h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id],
d2lambda * h1lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[w_id],
d1lambda * h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w],
d1lambda * h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id],
d1lambda * h1lambda * w1lambda * out_pos[0]);
} else {
int in_pos1_idx = out_id_h * input_w +
in_img_idt * in_img_h * in_img_w * num_channels +
......@@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in,
const T* out_pos = &out[out_id_h * output_w + out_id_w];
// trilinear interpolation grad
platform::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[w_id * num_channels],
d2lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
d2lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(&in_pos1[0],
d2lambda * h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[w_id * num_channels],
d2lambda * h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels],
d2lambda * h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(
&in_pos1[h_id * in_img_w * num_channels + w_id * num_channels],
d2lambda * h1lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[w_id * num_channels],
d1lambda * h2lambda * w1lambda * out_pos[0]);
platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
d1lambda * h1lambda * w2lambda * out_pos[0]);
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(&in_pos2[0],
d1lambda * h2lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[w_id * num_channels],
d1lambda * h2lambda * w1lambda * out_pos[0]);
phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels],
d1lambda * h1lambda * w2lambda * out_pos[0]);
phi::CudaAtomicAdd(
&in_pos2[h_id * in_img_w * num_channels + w_id * num_channels],
d1lambda * h1lambda * w1lambda * out_pos[0]);
}
......@@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in,
in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels +
access_x * num_channels + channel_id];
}
platform::CudaAtomicAdd(&in_pos[0],
(out_pos[0] * y_coeffs[j] * x_coeffs[i]));
phi::CudaAtomicAdd(&in_pos[0],
(out_pos[0] * y_coeffs[j] * x_coeffs[i]));
}
}
}
......
......@@ -22,8 +22,8 @@
#include "paddle/fluid/operators/limit_by_capacity_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl(
wid = i / n_expert;
eid = i % n_expert;
auto proposal = expc[wid * n_expert + eid];
auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1));
if (cap_left >= proposal) {
out[wid * n_expert + eid] = proposal;
} else if (cap_left >= 0) {
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table,
const T *out = output + idy * D;
T *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
phi::CudaAtomicAdd(&tab[i], out[i]);
}
idy += BlockDimY * GridDimX;
}
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_v2_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table,
const T *out = output + idy * D;
T *tab = table + id * D;
#ifdef PADDLE_WITH_CUDA
paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
#else
for (int i = idx; i < D; i += blockDim.x) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
phi::CudaAtomicAdd(&tab[i], out[i]);
}
#endif
idy += blockDim.y * gridDim.x;
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/cos_sim_functor.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
for (size_t i = 0; i < cols; ++i) {
T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
z_data * y[i] * reciprocal_y_norm_square);
platform::CudaAtomicAdd(dy + i, dy_data);
phi::CudaAtomicAdd(dy + i, dy_data);
}
}
}
......
......@@ -15,10 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math.h"
namespace paddle {
namespace operators {
namespace math {
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data,
if (height_offset >= 0 && height_offset < im_height &&
width_offset >= 0 && width_offset < im_width) {
paddle::platform::CudaAtomicAdd(im_data + im_offset,
col_data[col_offset]);
phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
}
}
}
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/maxouting.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/math/sequence_pooling.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sequence_scale.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
namespace math {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T, int BlockSize>
__global__ void SequenceScaleKernel(T* seq,
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/unpooling.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -15,13 +15,13 @@ limitations under the License. */
#include "paddle/fluid/operators/mean_iou_op.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T>
__global__ void CountCUDAKernel(const int num_classes,
......
......@@ -22,8 +22,8 @@
#include "paddle/fluid/operators/number_count_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers,
#endif
}
if (threadIdx.x % WARP_SIZE == 0) {
platform::CudaAtomicAdd(number_count + i, x);
phi::CudaAtomicAdd(number_count + i, x);
}
}
}
......
......@@ -14,11 +14,11 @@
#include "paddle/fluid/operators/one_hot_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename InT, typename OutT>
__global__ void FillOutputKernel(const InT* p_in_data,
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
// Since index in rows of SelectedRows can be duplicate, we have to use
// Atomic Operation to avoid concurrent write error.
paddle::platform::CudaAtomicAdd(
phi::CudaAtomicAdd(
tensor_out_ptr + index,
-static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
}
......
......@@ -16,13 +16,13 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T>
__global__ void Pad2DConstNCHW(const int nthreads,
......@@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size,
in_w = max(in_w, -in_w);
in_h = min(in_h, 2 * in_height - in_h - 2);
in_w = min(in_w, 2 * in_width - in_w - 2);
platform::CudaAtomicAdd(
&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
}
}
......@@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size,
in_w = max(in_w, -in_w);
in_h = min(in_h, in_height * 2 - in_h - 2);
in_w = min(in_w, in_width * 2 - in_w - 2);
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(
&d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
d_out_data[out_index]);
}
......@@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size,
nc /= out_height;
const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
platform::CudaAtomicAdd(
&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w],
d_out_data[out_index]);
}
}
......@@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size,
n /= out_height;
const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(
&d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c],
d_out_data[out_index]);
}
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#endif
namespace paddle {
......@@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff,
const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) {
paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
}
}
#else
......@@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff,
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T>
DEVICE void AccumulateRois(T* offset, T data) {
paddle::platform::CudaAtomicAdd(offset, data);
phi::CudaAtomicAdd(offset, data);
}
#else
template <typename T>
......
......@@ -21,7 +21,7 @@
// Licensed under the Apache License, Version 2.0 (the "License").
#include "paddle/fluid/operators/prune_gate_by_capacity_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......@@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data,
const int64_t batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) {
auto orig_cap =
platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1);
if (orig_cap <= 0) {
new_gate_idx_data[i] = -1;
} else {
......
......@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/pull_box_extended_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "xpu/kernel/math.h" // NOLINT
#else
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#endif
#include "paddle/fluid/operators/pull_box_sparse_op.h"
......@@ -46,9 +46,13 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_XPU_KP
REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(pull_box_sparse,
KP,
plat::XPUPlace,
ops::PullBoxSparseKernel<float>);
REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(push_box_sparse,
KP,
plat::XPUPlace,
ops::PushBoxSparseKernel<float>);
#else
REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
......
......@@ -14,11 +14,11 @@
#include "paddle/fluid/operators/pull_gpups_sparse_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor;
template <typename T>
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/random_routing_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
......
......@@ -16,11 +16,11 @@
#include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor;
template <typename T>
......
......@@ -16,11 +16,11 @@ limitations under the License. */
#include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = phi::DenseTensor;
template <typename T>
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......@@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
platform::CudaAtomicAdd(
phi::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length +
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -11,7 +11,7 @@
#include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
......
......@@ -28,8 +28,8 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#define FINAL_MASK 0xffffffff
#ifdef __HIPCC__
......@@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input,
if (GetLaneId() == 0) {
#pragma unroll
for (uint32_t i = 0; i < RadixSize; ++i) {
platform::CudaAtomicAdd(&shared_mem[i], counts[i]);
phi::CudaAtomicAdd(&shared_mem[i], counts[i]);
}
}
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/autotune/auto_tune_base.h"
......
......@@ -24,17 +24,15 @@
#define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
using paddle::platform::float16;
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using phi::PADDLE_CUDA_NUM_THREADS;
template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
CUDA_KERNEL_LOOP(i, num) {
paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
}
CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); }
}
template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......