Unverified commit ea9684f1, authored by Yiqun Liu, committed by GitHub

Optimize the CPU -> GPU memcpy and avoid explicit sync in some operators. (#40933)

Parent: 3a6201af
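The hunks below apply two recurring changes across the phi GPU kernels touched by this commit: shape metadata (the input/output dims) is now passed to the CUDA kernels by value in a fixed-size Dim<DDim::kMaxRank>, replacing a device allocation plus a CPU -> GPU memory::Copy, and explicit stream synchronizations after kernel launches are dropped in favor of CUDA stream ordering. A minimal standalone sketch of the pass-by-value idea, with hypothetical names (DimsByValue, PrintNumel) standing in for the Paddle types:

```cpp
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for phi's Dim<DDim::kMaxRank>: a small fixed-size
// array passed to the kernel by value, so it travels with the launch
// parameters instead of needing cudaMalloc + cudaMemcpyHostToDevice.
constexpr int kMaxRank = 9;

struct DimsByValue {
  int64_t d[kMaxRank];
  int rank;
};

__global__ void PrintNumel(DimsByValue dims) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    int64_t numel = 1;
    for (int i = 0; i < dims.rank; ++i) numel *= dims.d[i];
    printf("numel = %lld\n", static_cast<long long>(numel));
  }
}

int main() {
  DimsByValue dims;
  dims.rank = 3;
  dims.d[0] = 2; dims.d[1] = 3; dims.d[2] = 4;
  // No explicit host->GPU copy and no extra sync point for the dims:
  // they are part of the kernel's argument block.
  PrintNumel<<<1, 32>>>(dims);
  cudaDeviceSynchronize();  // only so the demo's printf is flushed
  return 0;
}
```

Kernel arguments are copied into the launch itself, so a kMaxRank-sized int64_t array rides along with the launch and needs neither a device buffer nor the extra synchronization point implied by the old copy path.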
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/utils/dim.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace phi {
@@ -44,7 +43,7 @@ __global__ void GatherCUDAKernel(const T* params,
 template <typename T, typename IndexT = int>
 __global__ void GatherNdCUDAKernel(const T* input,
-                                   const int64_t* input_dims,
+                                   const Dim<DDim::kMaxRank> input_dims,
                                    const IndexT* indices,
                                    T* output,
                                    size_t remain_size,
@@ -149,19 +148,11 @@ void GPUGatherNd(const phi::GPUContext& ctx,
     slice_size *= input_dims[i];
   }
   // source dim
-  std::vector<int64_t> v_input_dims(input_dims_size);
+  Dim<DDim::kMaxRank> g_input_dims;
   for (int i = 0; i < input_dims_size; ++i) {
-    v_input_dims[i] = input_dims[i];
+    g_input_dims[i] = input_dims[i];
   }
-  phi::DenseTensor input_dims_tensor;
-  input_dims_tensor.Resize({input_dims_size});
-  auto* g_input_dims = ctx.Alloc<int64_t>(&input_dims_tensor);
-  int64_t bytes = input_dims_size * sizeof(int64_t);
-  paddle::memory::Copy(
-      gplace, g_input_dims, cplace, v_input_dims.data(), bytes, ctx.stream());
   int block = 512;
   int64_t n = slice_size * remain_numel;
   int64_t grid = (n + block - 1) / block;
...
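Inside GatherNdCUDAKernel the by-value dims are used to fold each end_size-long index tuple into a flat offset into the input. A hedged standalone sketch of that offset arithmetic follows (hypothetical names; DimsByValue again stands in for Dim<DDim::kMaxRank>, and the body mirrors the usual gather-nd formula rather than quoting the Paddle source):

```cpp
#include <cstdint>
#include <cuda_runtime.h>

constexpr int kMaxRank = 9;
struct DimsByValue { int64_t d[kMaxRank]; };  // stands in for Dim<DDim::kMaxRank>

// Each thread copies one element of one gathered slice. A launch would use
// grid = (remain_size * slice_size + block - 1) / block, as in the hunk above.
__global__ void GatherNdSketch(const float* input,
                               const DimsByValue input_dims,
                               const int64_t* indices,  // [remain_size, end_size]
                               float* output,
                               size_t remain_size,
                               size_t slice_size,
                               size_t end_size) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i >= remain_size * slice_size) return;
  size_t indices_i = i / slice_size;  // which index tuple
  size_t slice_i = i % slice_size;    // position inside the gathered slice
  int64_t gather_i = 0;
  int64_t temp = slice_size;
  // Fold the index tuple into a flat offset, innermost indexed dim first.
  for (int64_t j = end_size - 1; j >= 0; --j) {
    gather_i += indices[indices_i * end_size + j] * temp;
    temp *= input_dims.d[j];
  }
  output[i] = input[gather_i + slice_i];
}
```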
@@ -77,7 +77,7 @@ template <typename T, typename IndexT = int>
 __global__ void ScatterNdCUDAKernel(const T* update,
                                     const IndexT* indices,
                                     T* output,
-                                    const int64_t* output_dims,
+                                    const Dim<DDim::kMaxRank> output_dims,
                                     size_t remain_size,
                                     size_t slice_size,
                                     size_t end_size) {
@@ -222,23 +222,12 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
     slice_size *= output_dims[i];
   }
   const size_t slice_bytes = slice_size * sizeof(T);
-  // put output_dims int CUDA
-  // gplace and cplace
-  const auto gplace = ctx.GetPlace();
-  auto cplace = phi::CPUPlace();
-  std::vector<int64_t> v_output_dims(output_dims_size);
+  Dim<DDim::kMaxRank> g_output_dims;
   for (int i = 0; i < output_dims_size; ++i) {
-    v_output_dims[i] = output_dims[i];
+    g_output_dims[i] = output_dims[i];
   }
-  phi::DenseTensor out_dims_tensor;
-  out_dims_tensor.Resize({output_dims_size});
-  auto* g_output_dims = ctx.Alloc<int64_t>(&out_dims_tensor);
-  int64_t bytes = output_dims_size * sizeof(int64_t);
-  paddle::memory::Copy(
-      gplace, g_output_dims, cplace, v_output_dims.data(), bytes, ctx.stream());
   int block = 512;
   int64_t n = slice_size * remain_numel;
   int64_t grid = (n + block - 1) / block;
...
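GPUScatterNdAdd is the mirror image: the same fold of an index tuple into a flat offset, now into the output, with the update accumulated atomically so that duplicate index tuples add up instead of racing. A hedged sketch under the same assumptions as the gather sketch above (hypothetical names, float only, plain atomicAdd rather than the wrapper the real kernel uses):

```cpp
#include <cstdint>
#include <cuda_runtime.h>

constexpr int kMaxRank = 9;
struct DimsByValue { int64_t d[kMaxRank]; };

__global__ void ScatterNdAddSketch(const float* update,
                                   const int64_t* indices,  // [remain_size, end_size]
                                   float* output,
                                   const DimsByValue output_dims,
                                   size_t remain_size,
                                   size_t slice_size,
                                   size_t end_size) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i >= remain_size * slice_size) return;
  size_t indices_i = i / slice_size;
  size_t slice_i = i % slice_size;
  int64_t gather_i = 0;
  int64_t temp = slice_size;
  for (int64_t j = end_size - 1; j >= 0; --j) {
    gather_i += indices[indices_i * end_size + j] * temp;
    temp *= output_dims.d[j];
  }
  // Atomic add: several index tuples may target the same output slice.
  atomicAdd(output + gather_i + slice_i, update[i]);
}
```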
@@ -109,7 +109,6 @@ void IndexSelectGradKernel(const Context& ctx,
         stride,
         size,
         delta);
-    phi::backends::gpu::GpuStreamSync(stream);
   } else {
     const int* index_data = index.data<int>();
     index_select_grad_cuda_kernel<T, int><<<
@@ -124,7 +123,6 @@ void IndexSelectGradKernel(const Context& ctx,
         stride,
         size,
         delta);
-    phi::backends::gpu::GpuStreamSync(stream);
   }
 }
...
@@ -82,7 +82,6 @@ void IndexSelectKernel(const Context& ctx,
                            PADDLE_CUDA_NUM_THREADS,
                            0,
                            stream>>>(in_data, out_data, index_data, numel, stride, size, delta);
-    phi::backends::gpu::GpuStreamSync(stream);
   } else {
     const int* index_data = index.data<int>();
     index_select_cuda_kernel<
@@ -92,7 +91,6 @@ void IndexSelectKernel(const Context& ctx,
                  0,
                  stream>>>(
         in_data, out_data, index_data, numel, stride, size, delta);
-    phi::backends::gpu::GpuStreamSync(stream);
   }
 }
...
@@ -26,7 +26,7 @@ void ScatterNdAddKernel(const Context &ctx,
                         const DenseTensor &index,
                         const DenseTensor &updates,
                         DenseTensor *out) {
-  Copy(ctx, x, ctx.GetPlace(), true, out);
+  Copy(ctx, x, ctx.GetPlace(), false, out);
   const auto &index_type = index.dtype();
   bool index_type_match =
       index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
...
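The edits above are about synchronization rather than data layout. Removing phi::backends::gpu::GpuStreamSync(stream) after the index_select forward/backward launches, and switching the Copy that seeds out in ScatterNdAddKernel from blocking = true to blocking = false, both rely on the same guarantee: work enqueued on one CUDA stream executes in issue order, so a consumer on that stream already waits for its producer, and the host only needs to synchronize when it reads the result itself. A minimal standalone illustration with the plain CUDA runtime API (hypothetical kernels, not Paddle code):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void FillKernel(float* data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

__global__ void ScaleKernel(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  const int block = 256;
  const int grid = (n + block - 1) / block;

  float *d_src = nullptr, *d_dst = nullptr;
  cudaMalloc(&d_src, n * sizeof(float));
  cudaMalloc(&d_dst, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Produce d_src; no sync after the launch.
  FillKernel<<<grid, block, 0, stream>>>(d_src, n, 1.0f);
  // 2. Non-blocking, stream-ordered copy (the analogue of Copy(..., false, out)):
  //    it starts only after FillKernel finishes, yet the host does not wait.
  cudaMemcpyAsync(d_dst, d_src, n * sizeof(float), cudaMemcpyDeviceToDevice, stream);
  // 3. Consume d_dst on the same stream; ordering is already guaranteed.
  ScaleKernel<<<grid, block, 0, stream>>>(d_dst, n, 2.0f);

  // The host synchronizes only when it actually reads the result.
  float h_first = 0.0f;
  cudaMemcpyAsync(&h_first, d_dst, sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  printf("d_dst[0] = %f\n", h_first);  // prints 2.0

  cudaFree(d_src);
  cudaFree(d_dst);
  cudaStreamDestroy(stream);
  return 0;
}
```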
@@ -29,33 +29,32 @@ namespace cub = hipcub;
 #include "paddle/phi/core/kernel_registry.h"
 namespace phi {
-template <typename T1, typename T2, typename OutT>
+template <typename MaskT, typename IndexT, typename OutT>
 struct IndexFunctor {
-  T2 stride[phi::DDim::kMaxRank];
-  int dims;
+  IndexT strides[phi::DDim::kMaxRank];
+  int rank;
   explicit IndexFunctor(const phi::DDim &in_dims) {
-    dims = in_dims.size();
-    std::vector<T2> strides_in_tmp;
-    strides_in_tmp.resize(dims, 1);
-    // get strides according to in_dims
-    for (T2 i = 1; i < dims; i++) {
-      strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i];
+    rank = in_dims.size();
+    // Get strides according to in_dims
+    strides[0] = 1;
+    for (IndexT i = 1; i < rank; i++) {
+      strides[i] = strides[i - 1] * in_dims[rank - i];
     }
-    memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2));
   }
   HOSTDEVICE inline void operator()(OutT *out,
-                                    const T1 *mask,
-                                    const T2 *index,
+                                    const MaskT *mask,
+                                    const IndexT *index,
                                     const int num) {
     int store_fix = 0;
     for (int idx = 0; idx < num; idx++) {
       if (mask[idx]) {
-        T2 data_index = index[idx];
+        IndexT data_index = index[idx];
         // get index
-        for (int rank_id = dims - 1; rank_id >= 0; --rank_id) {
-          out[store_fix] = static_cast<OutT>(data_index / stride[rank_id]);
-          data_index = data_index % stride[rank_id];
+        for (int rank_id = rank - 1; rank_id >= 0; --rank_id) {
+          out[store_fix] = static_cast<OutT>(data_index / strides[rank_id]);
+          data_index = data_index % strides[rank_id];
           store_fix++;
         }
       }
...
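The rewritten IndexFunctor computes the row-major strides directly into its fixed-size member array (no temporary std::vector and memcpy, so constructing the functor does no heap allocation) and renames T1/T2 to MaskT/IndexT. The strides are stored innermost-first (strides[0] == 1), which is why the device loop walks rank_id from rank - 1 down to 0, emitting coordinates in dimension order. A small host-only sketch of the same arithmetic (hypothetical example values, plain C++):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Compute row-major strides (innermost first) and decompose a flat index
// into per-dimension coordinates, mirroring IndexFunctor's math.
int main() {
  const std::vector<int64_t> dims = {2, 3, 4};  // shape of a 2x3x4 tensor
  const int rank = static_cast<int>(dims.size());

  // strides[0] = 1, strides[1] = dims[2] = 4, strides[2] = dims[2] * dims[1] = 12
  std::vector<int64_t> strides(rank, 1);
  for (int i = 1; i < rank; ++i) {
    strides[i] = strides[i - 1] * dims[rank - i];
  }

  // Decompose flat index 17: 17 = 1*12 + 1*4 + 1*1, i.e. coordinates (1, 1, 1).
  int64_t data_index = 17;
  for (int rank_id = rank - 1; rank_id >= 0; --rank_id) {
    int64_t coord = data_index / strides[rank_id];
    data_index %= strides[rank_id];
    printf("%lld ", static_cast<long long>(coord));
  }
  printf("\n");  // prints: 1 1 1
  return 0;
}
```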