Unverified commit b1faa562, authored by Yiqun Liu, committed by GitHub

Implement a common segmented array. (#49450)

* Implement a common PointerArray.

* Polish codes.

* Add including of header file.

* Add the branch of kFix8.

* Fix compiling error.

* Add alignas hint to fix the performance drop.

* Optimize the H2D copy in stack_grad.

* Rename the macro.

* Fix align hint for different compilers.

* Polish the define of PADDLE_ALIGN.

* Fix compiling error.

* Remove the align hint on windows.
Parent 24f5c46e
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/fast_divmod.h"
namespace phi {
namespace funcs {
template <typename IndexT>
struct GeneralDivMod {
 public:
  explicit GeneralDivMod(IndexT d) { divmoder = phi::funcs::FastDivMod(d); }
  __device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) {
    return divmoder.Divmod(val);
  }

  phi::funcs::FastDivMod divmoder;
};

template <>
struct GeneralDivMod<int64_t> {
 public:
  using DivModT = phi::AlignedVector<int64_t, 2>;
  explicit GeneralDivMod(int64_t d) { divisor = d; }
  __device__ inline DivModT div_mod(int64_t val) {
    DivModT data;
    data[0] = val / divisor;
    data[1] = val - data[0] * divisor;
    return data;
  }

  int64_t divisor;
};

#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
#endif

enum class SegmentedArraySize {
  kVariableLength = 0,
  kFixed4 = 4,
  kFixed8 = 8,
  kFixed16 = 16,
  kFixed32 = 32,
  kFixed64 = 64,
};

template <typename T, SegmentedArraySize Size>
struct PADDLE_ALIGN(256) ConstPointerArray {
 public:
  const T* data[static_cast<int>(Size)];

  void Set(const std::vector<const T*>& ptrs, const T** dev_ptr = nullptr) {
    for (auto i = 0; i < ptrs.size(); ++i) {
      data[i] = ptrs[i];
    }
  }
};

template <typename T>
struct PADDLE_ALIGN(256)
    ConstPointerArray<T, SegmentedArraySize::kVariableLength> {
 public:
  const T** data{nullptr};

  void Set(const std::vector<const T*>& ptrs, const T** dev_ptr = nullptr) {
    data = dev_ptr;
  }
};

template <typename T, SegmentedArraySize Size>
struct PADDLE_ALIGN(256) PointerArray {
 public:
  T* data[static_cast<int>(Size)];

  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
    for (auto i = 0; i < ptrs.size(); ++i) {
      data[i] = ptrs[i];
    }
  }
};

template <typename T>
struct PADDLE_ALIGN(256) PointerArray<T, SegmentedArraySize::kVariableLength> {
 public:
  T** data{nullptr};

  void Set(const std::vector<T*>& ptrs, T** dev_ptr = nullptr) {
    data = dev_ptr;
  }
};

#undef PADDLE_ALIGN

template <typename Context>
struct ArraySetterBase {
 protected:
  void* AllocAndCopy(const Context& ctx, void* src, size_t num_bytes) {
    allocation = paddle::memory::Alloc(
        ctx.GetPlace(),
        num_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
    paddle::memory::Copy(ctx.GetPlace(),
                         allocation->ptr(),
                         phi::CPUPlace(),
                         src,
                         num_bytes,
                         ctx.stream());
    return allocation->ptr();
  }

  phi::Allocator::AllocationPtr allocation{nullptr};
};

template <typename Context, typename T, SegmentedArraySize Size>
struct ConstPointerArraySetter : public ArraySetterBase<Context> {
 public:
  ConstPointerArray<T, Size> array;

  ConstPointerArraySetter(const Context& ctx,
                          const std::vector<const DenseTensor*>& t) {
    ptrs.resize(t.size());
    for (int i = 0; i < t.size(); ++i) {
      ptrs[i] = t[i]->data<T>();
    }

    const T** dev_ptr = nullptr;
    if (Size == SegmentedArraySize::kVariableLength) {
      size_t num_bytes = t.size() * sizeof(T*);
      dev_ptr =
          reinterpret_cast<const T**>(ArraySetterBase<Context>::AllocAndCopy(
              ctx, reinterpret_cast<void*>(ptrs.data()), num_bytes));
    }

    array.Set(ptrs, dev_ptr);
  }

 private:
  std::vector<const T*> ptrs;
};

template <typename Context, typename T, SegmentedArraySize Size>
struct PointerArraySetter : public ArraySetterBase<Context> {
 public:
  PointerArray<T, Size> array;

  PointerArraySetter(const Context& ctx, std::vector<DenseTensor*>* t) {
    ptrs.resize(t->size());
    for (int i = 0; i < t->size(); ++i) {
      if (t->at(i) && (t->at(i)->numel() > 0)) {
        ptrs[i] = ctx.template Alloc<T>(t->at(i));
      } else {
        ptrs[i] = nullptr;
      }
    }

    T** dev_ptr = nullptr;
    if (Size == SegmentedArraySize::kVariableLength) {
      size_t num_bytes = t->size() * sizeof(T*);
      dev_ptr = reinterpret_cast<T**>(ArraySetterBase<Context>::AllocAndCopy(
          ctx, reinterpret_cast<void*>(ptrs.data()), num_bytes));
    }

    array.Set(ptrs, dev_ptr);
  }

 private:
  std::vector<T*> ptrs;
};

inline SegmentedArraySize CalcArraySize(int n) {
  if (n <= 4) {
    return SegmentedArraySize::kFixed4;
  } else if (n <= 8) {
    return SegmentedArraySize::kFixed8;
  } else if (n <= 16) {
    return SegmentedArraySize::kFixed16;
  } else if (n <= 32) {
    return SegmentedArraySize::kFixed32;
  } else if (n <= 64) {
    return SegmentedArraySize::kFixed64;
  } else {
    return SegmentedArraySize::kVariableLength;
  }
}
}  // namespace funcs

#define _SEGMENTED_ARRAY_KERNEL_CASE(size, ...) \
  case (size): {                                \
    constexpr auto kArraySize = (size);         \
    __VA_ARGS__;                                \
  } break

#define _SEGMENTED_ARRAY_KERNEL_DEFAULT(size, ...) \
  default: {                                       \
    constexpr auto kArraySize = (size);            \
    __VA_ARGS__;                                   \
  } break

#define SEGMENTED_ARRAY_KERNEL_HELPER(...)                                    \
  _SEGMENTED_ARRAY_KERNEL_CASE(funcs::SegmentedArraySize::kFixed4,            \
                               ##__VA_ARGS__);                                \
  _SEGMENTED_ARRAY_KERNEL_CASE(funcs::SegmentedArraySize::kFixed8,            \
                               ##__VA_ARGS__);                                \
  _SEGMENTED_ARRAY_KERNEL_CASE(funcs::SegmentedArraySize::kFixed16,           \
                               ##__VA_ARGS__);                                \
  _SEGMENTED_ARRAY_KERNEL_CASE(funcs::SegmentedArraySize::kFixed32,           \
                               ##__VA_ARGS__);                                \
  _SEGMENTED_ARRAY_KERNEL_CASE(funcs::SegmentedArraySize::kFixed64,           \
                               ##__VA_ARGS__);                                \
  _SEGMENTED_ARRAY_KERNEL_DEFAULT(funcs::SegmentedArraySize::kVariableLength, \
                                  ##__VA_ARGS__);
} // namespace phi
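For reference, below is a minimal usage sketch of the header above. It is not part of this commit; the kernel and function names are hypothetical, and it assumes the code sits inside namespace phi next to the real kernels that follow. The idea it illustrates: CalcArraySize picks a fixed capacity from the number of inputs, SEGMENTED_ARRAY_KERNEL_HELPER instantiates a launcher with that capacity as kArraySize, and ConstPointerArraySetter either fills the by-value pointer array or, for kVariableLength, copies the pointers into a device buffer.

// Hypothetical example (not part of the committed header): copies the first
// element of every input tensor into `dev_out`, a device pointer with at
// least x.size() elements.
template <typename T, typename ArrayT>
__global__ void GatherFirstElementKernel(ArrayT array, int num, T* dev_out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num) {
    // Works for both the fixed-size array and the device pointer-to-pointer.
    dev_out[i] = array.data[i][0];
  }
}

template <typename Context, typename T, funcs::SegmentedArraySize Size>
void LaunchGatherFirstElement(const Context& ctx,
                              const std::vector<const DenseTensor*>& x,
                              T* dev_out) {
  // Fills array.data on the host; only the kVariableLength specialization
  // performs a host-to-device copy.
  funcs::ConstPointerArraySetter<Context, T, Size> setter(ctx, x);
  int num = static_cast<int>(x.size());
  int blocks = (num + 127) / 128;
  GatherFirstElementKernel<T, decltype(setter.array)>
      <<<blocks, 128, 0, ctx.stream()>>>(setter.array, num, dev_out);
}

template <typename Context, typename T>
void GatherFirstElement(const Context& ctx,
                        const std::vector<const DenseTensor*>& x,
                        T* dev_out) {
  // Dispatch on the number of inputs; kArraySize is defined by the macro.
  switch (funcs::CalcArraySize(static_cast<int>(x.size()))) {
    SEGMENTED_ARRAY_KERNEL_HELPER(
        LaunchGatherFirstElement<Context, T, kArraySize>(ctx, x, dev_out));
  }
}

Passing the fixed-size, 256-byte-aligned struct by value puts small pointer lists directly into kernel arguments, so the extra host-to-device copy is only paid in the variable-length fallback.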
@@ -16,16 +16,17 @@
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/segmented_array.h"
 
 namespace phi {
 
-template <typename T, typename IndexT>
-__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
-                                        int pre_dim_size,
-                                        int split_dim_size,
-                                        int suf_dim_size,
-                                        int num_split,
-                                        T** output_ptrs) {
+template <typename T, typename IndexT, typename ArrayT>
+__global__ void UnStackCudaKernel(const T* __restrict__ input,
+                                  IndexT pre_dim_size,
+                                  IndexT split_dim_size,
+                                  IndexT suf_dim_size,
+                                  IndexT num_split,
+                                  ArrayT array) {
   assert(blockDim.y == 1);
   assert(blockDim.z == 1);
   // In this case they are equal
@@ -40,7 +41,7 @@ __global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
     IndexT j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size;
     IndexT k = offset % suf_dim_size;
 
-    T* output = output_ptrs[j / each_dim_size];
+    T* output = array.data[j / each_dim_size];
     if (output == nullptr) {
       return;
     }
@@ -50,12 +51,12 @@ __global__ void UnStackHelperCUDAKernel(const T* __restrict__ input,
   }
 }
 
-template <typename T, typename IndexT>
-__global__ void StackGradKernelForLastDim(const T* __restrict__ in_data,
-                                          const IndexT cols,
-                                          const IndexT rows,
-                                          const IndexT tile_x_num,
-                                          T** out_datas) {
+template <typename T, typename IndexT, typename ArrayT>
+__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data,
+                                            const IndexT cols,
+                                            const IndexT rows,
+                                            const IndexT tile_x_num,
+                                            ArrayT array) {
   constexpr int buffer_size = 512;
   __shared__ T s_buf[buffer_size];
 
@@ -71,112 +72,112 @@ __global__ void StackGradKernelForLastDim(const T* __restrict__ in_data,
     }
     __syncthreads();
     if (is_valid) {
-      if (out_datas[col_idx] != nullptr) {
-        out_datas[col_idx][row_idx] = s_buf[s_idx];
+      if (array.data[col_idx]) {
+        array.data[col_idx][row_idx] = s_buf[s_idx];
       }
     }
   }
 }
 
-template <typename Context, typename T, typename IndexT>
-void LaunchStackGradCUDAKernel(const Context& ctx,
-                               const DenseTensor& out,
-                               std::vector<DenseTensor*>* x_grad_ptr,
-                               const int axis,
-                               const int64_t dy_pre) {
-  auto x_grad = *x_grad_ptr;
-  int out_num = out.dims()[axis];
-  PADDLE_ENFORCE_EQ(
-      out_num,
-      x_grad.size(),
-      phi::errors::InvalidArgument(
-          "Output x_grad size shall be equal to output num, but output num "
-          "received in stack_grad op is:%d, and x_grad size is:%d.",
-          out_num,
-          x_grad.size()));
-  std::vector<T*> outputs(out_num);
-  for (size_t j = 0; j < out_num; ++j) {
-    if (x_grad[j] == nullptr || x_grad[j]->numel() == 0UL) {
-      outputs[j] = nullptr;
-    } else {
-      outputs[j] = ctx.template Alloc<T>(x_grad[j]);
-    }
-  }
-  auto tmp_out_data = paddle::memory::Alloc(
-      ctx.GetPlace(),
-      out_num * sizeof(T*),
-      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  paddle::memory::Copy(ctx.GetPlace(),
-                       tmp_out_data->ptr(),
-                       phi::CPUPlace(),
-                       reinterpret_cast<void*>(outputs.data()),
-                       out_num * sizeof(T*),
-                       ctx.stream());
-  if (axis == (out.dims().size() - 1)) {
+template <typename Context,
+          typename T,
+          typename IndexT,
+          funcs::SegmentedArraySize Size>
+void LaunchUnStackKernel(const Context& ctx,
+                         const IndexT pre_dim,
+                         const IndexT split_dim,
+                         const IndexT suf_dim,
+                         const IndexT num_splits,
+                         const DenseTensor& out_grad,
+                         std::vector<DenseTensor*>* x_grad) {
+  // each x_grad should have same shape
+  auto dout_ptr = out_grad.data<T>();
+  funcs::PointerArraySetter<Context, T, Size> setter(ctx, x_grad);
+
+  if (suf_dim == 1) {
+    // For the case axis == (out_grad.dims().size() - 1)
     constexpr int kThreads = 512;
     constexpr int kWarpSize = 32;
     constexpr int kMaxOut = 16;
-    int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
-    bool is_small_num = out_num < kMaxOut;
 
-    if (is_small_num) {
-      tid_y = out_num;
+    int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
+    if (split_dim < kMaxOut) {
+      tid_y = split_dim;
       tid_x =
-          std::min(backends::gpu::RoundToNextHighPowOfTwo(dy_pre, kWarpSize),
+          std::min(backends::gpu::RoundToNextHighPowOfTwo(pre_dim, kWarpSize),
                    kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y));
     } else {
       tid_y = kMaxOut;
      tid_x = kWarpSize;
-      bid_y = backends::gpu::DivUp<int>(out_num, kMaxOut);
+      bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
    }
-    int tile_x_num = backends::gpu::DivUp<int>(dy_pre, tid_x);
+    int tile_x_num = backends::gpu::DivUp<int>(pre_dim, tid_x);
    bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
    dim3 blocks(tid_x, tid_y, 1);
    dim3 grids(bid_x, bid_y, 1);
 
-    StackGradKernelForLastDim<T, IndexT><<<grids, blocks, 0, ctx.stream()>>>(
-        out.data<T>(),
-        out_num,
-        dy_pre,
-        tile_x_num,
-        reinterpret_cast<T**>(tmp_out_data->ptr()));
+    UnStackCudaKernelForLastDim<T, IndexT, decltype(setter.array)>
+        <<<grids, blocks, 0, ctx.stream()>>>(
+            dout_ptr, split_dim, pre_dim, tile_x_num, setter.array);
  } else {
-    int dy_suf = out.numel() / (out_num * dy_pre);
-    auto config =
-        backends::gpu::GetGpuLaunchConfig1D(ctx, dy_pre * out_num * dy_suf);
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        ctx, pre_dim * split_dim * suf_dim);
 
-    UnStackHelperCUDAKernel<T, IndexT>
-        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-            out.data<T>(),
-            dy_pre,
-            out_num,
-            dy_suf,
-            out_num,
-            reinterpret_cast<T**>(tmp_out_data->ptr()));
+    UnStackCudaKernel<T, IndexT, decltype(setter.array)>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           ctx.stream()>>>(
+            dout_ptr, pre_dim, split_dim, suf_dim, num_splits, setter.array);
  }
}
 
 template <typename T, typename Context>
-void StackGradKernel(const Context& dev_ctx,
-                     const DenseTensor& out,
+void StackGradKernel(const Context& ctx,
+                     const DenseTensor& out_grad,
                      int axis,
                      std::vector<DenseTensor*> x_grad) {
-  const auto& dy_dims = out.dims();
-  int actual_axis = axis < 0 ? axis + dy_dims.size() : axis;
-  bool use_int32 = out.numel() < std::numeric_limits<int32_t>::max();
+  if (axis < 0) axis += out_grad.dims().size();
+
+  int64_t split_dim = out_grad.dims()[axis];
+  PADDLE_ENFORCE_EQ(
+      split_dim,
+      x_grad.size(),
+      phi::errors::InvalidArgument(
+          "Output x_grad size should be equal to the split_dim, but"
+          " received split_dim is:%d x_grad size is:%d.",
+          split_dim,
+          x_grad.size()));
 
-  int64_t dy_pre = 1;
-  for (int i = 0; i < actual_axis; ++i) {
-    dy_pre *= dy_dims[i];
+  auto dout_dims = out_grad.dims();
+  int64_t dout_pre = 1;
+  for (int i = 0; i < axis; ++i) {
+    dout_pre *= dout_dims[i];
   }
-  if (use_int32) {
-    LaunchStackGradCUDAKernel<Context, T, int32_t>(
-        dev_ctx, out, &x_grad, actual_axis, dy_pre);
+  int64_t dout_suf = out_grad.numel() / (split_dim * dout_pre);
+
+  if (out_grad.numel() < std::numeric_limits<int32_t>::max()) {
+    switch (funcs::CalcArraySize(split_dim)) {
+      SEGMENTED_ARRAY_KERNEL_HELPER(
+          LaunchUnStackKernel<Context, T, int32_t, kArraySize>(ctx,
+                                                               dout_pre,
+                                                               split_dim,
+                                                               dout_suf,
+                                                               split_dim,
+                                                               out_grad,
+                                                               &x_grad));
+    }
   } else {
-    LaunchStackGradCUDAKernel<Context, T, int64_t>(
-        dev_ctx, out, &x_grad, actual_axis, dy_pre);
+    switch (funcs::CalcArraySize(split_dim)) {
+      SEGMENTED_ARRAY_KERNEL_HELPER(
+          LaunchUnStackKernel<Context, T, int64_t, kArraySize>(ctx,
                                                               dout_pre,
                                                               split_dim,
                                                               dout_suf,
                                                               split_dim,
                                                               out_grad,
                                                               &x_grad));
+    }
   }
 }
...
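To make the index decomposition in the stack_grad diff above concrete, here is a small worked example with hypothetical shapes (not taken from the commit):

// out_grad.dims() = [5, 3, 4], axis = 1, numel = 60
// dout_pre  = 5                  (product of dims before axis)
// split_dim = 3                  (dims()[axis]; must equal x_grad.size())
// dout_suf  = 60 / (3 * 5) = 4   (remaining elements per (pre, split) pair)
// Since dout_suf != 1, the general UnStackCudaKernel path runs; when axis is
// the last dimension, dout_suf == 1 and UnStackCudaKernelForLastDim is used.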
@@ -15,86 +15,15 @@
 #include "paddle/phi/kernels/stack_kernel.h"
 
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
-#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/fast_divmod.h"
+#include "paddle/phi/kernels/funcs/segmented_array.h"
 
 namespace phi {
 
-template <typename IndexT>
-struct DivmodWarpper {
- public:
-  void SetDivisor(IndexT divisor) {
-    divmoder = phi::funcs::FastDivMod(divisor);
-  }
-  __device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) {
-    return divmoder.Divmod(val);
-  }
-
- private:
-  phi::funcs::FastDivMod divmoder;
-};
-
-template <>
-struct DivmodWarpper<int64_t> {
- public:
-  using DivModT = phi::AlignedVector<int64_t, 2>;
-  void SetDivisor(int64_t divisor) { dividen_ = divisor; }
-  __device__ inline DivModT div_mod(int64_t val) {
-    DivModT data;
-    data[0] = val / dividen_;
-    data[1] = val - data[0] * dividen_;
-    return data;
-  }
-
- private:
-  int64_t dividen_;
-};
-
-template <typename T, typename IndexT, int Size>
-struct PointerArray : public DivmodWarpper<IndexT> {
- public:
-  const T* data[Size];
-  PointerArray(const std::vector<const DenseTensor*>& x,
-               int num,
-               IndexT divisor) {
-    this->SetDivisor(divisor);
-    for (auto i = 0; i < num; ++i) {
-      data[i] = x[i]->data<T>();
-    }
-  }
-};
-
-template <typename Context, typename T, typename IndexT>
-struct PointerToPointer : public DivmodWarpper<IndexT> {
- public:
-  T** data{nullptr};
-  PointerToPointer(const Context& ctx,
-                   const std::vector<const DenseTensor*>& x,
-                   IndexT num,
-                   IndexT divisor,
-                   paddle::memory::AllocationPtr* dev_ins_ptr) {
-    this->SetDivisor(divisor);
-    std::vector<const T*> x_datas(num);
-    for (int i = 0; i < num; ++i) {
-      x_datas[i] = x[i]->data<T>();
-    }
-    *dev_ins_ptr = paddle::memory::Alloc(
-        ctx.GetPlace(),
-        num * sizeof(T*),
-        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    paddle::memory::Copy(ctx.GetPlace(),
-                         (*dev_ins_ptr)->ptr(),
-                         phi::CPUPlace(),
-                         reinterpret_cast<void*>(x_datas.data()),
-                         num * sizeof(T*),
-                         ctx.stream());
-    data = reinterpret_cast<T**>((*dev_ins_ptr)->ptr());
-  }
-};
-
-template <typename T, typename IndexT, typename WrapT>
-__global__ void StackCUDAKernel(WrapT input_warpper,
+template <typename T, typename IndexT, typename ArrayT>
+__global__ void StackCUDAKernel(ArrayT array,
+                                funcs::GeneralDivMod<IndexT> divmoder,
                                 IndexT split_size,
                                 IndexT rows,
                                 IndexT cols,
@@ -106,85 +35,69 @@ __global__ void StackCUDAKernel(WrapT input_warpper,
   for (; grid_x < cols; grid_x += grid_x_stride) {
     IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;
 
-    auto divmod_rslt = input_warpper.div_mod(grid_x);
-    const T* input_ptr = input_warpper.data[divmod_rslt[0]];
+    auto divmod_rslt = divmoder.div_mod(grid_x);
+    IndexT split = divmod_rslt[0];       // grid_x / split_size
+    IndexT col_offset = divmod_rslt[1];  // grid_x % split_size
+    const T* input_ptr = array.data[split];
 #pragma unroll
     for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
-          input_ptr[grid_y * split_size + divmod_rslt[1]];
+          input_ptr[grid_y * split_size + col_offset];
     }
   }
 }
 
-template <typename T, typename IndexT, typename Context>
-void LaunchStackCUDAKernelWithIndexType(
-    const Context& ctx,
-    const IndexT x_col,
-    const IndexT x_row,
-    const IndexT out_col,
-    const phi::backends::gpu::GpuLaunchConfig& cfg,
-    const std::vector<const DenseTensor*>& x,
-    T* dst_data) {
-  int num = static_cast<int>(x.size());
-#define IMPL_STACK_CUDA_KERNEL_CASE(size_, ...)               \
-  case size_: {                                               \
-    PointerArray<T, IndexT, size_> ptr_array(x, num, x_col);  \
-    __VA_ARGS__;                                              \
-  } break;
-
-#define IMPL_STACK_CUDA_KERNEL_HELPER(...)          \
-  IMPL_STACK_CUDA_KERNEL_CASE(4, ##__VA_ARGS__);    \
-  IMPL_STACK_CUDA_KERNEL_CASE(8, ##__VA_ARGS__);    \
-  IMPL_STACK_CUDA_KERNEL_CASE(16, ##__VA_ARGS__);   \
-  IMPL_STACK_CUDA_KERNEL_CASE(32, ##__VA_ARGS__);   \
-  IMPL_STACK_CUDA_KERNEL_CASE(64, ##__VA_ARGS__);   \
-  IMPL_STACK_CUDA_KERNEL_CASE(128, ##__VA_ARGS__);
-
-  switch (phi::backends::gpu::RoundToNextHighPowOfTwo(num, 4)) {
-    IMPL_STACK_CUDA_KERNEL_HELPER(
-        StackCUDAKernel<T, IndexT, decltype(ptr_array)>
-        <<<cfg.block_per_grid, cfg.thread_per_block, 0, ctx.stream()>>>(
-            ptr_array, x_col, x_row, out_col, dst_data));
-    default: {
-      paddle::memory::AllocationPtr dev_ins_ptr{nullptr};
-      PointerToPointer<Context, T, IndexT> ptr_array(
-          ctx, x, num, x_col, &dev_ins_ptr);
-      StackCUDAKernel<T, IndexT, decltype(ptr_array)>
-          <<<cfg.block_per_grid, cfg.thread_per_block, 0, ctx.stream()>>>(
-              ptr_array, x_col, x_row, out_col, dst_data);
-    }
-  }
-#undef IMPL_STACK_CUDA_KERNEL_HELPER
-#undef IMPL_STACK_CUDA_KERNEL_CASE
+template <typename Context,
+          typename T,
+          typename IndexT,
+          funcs::SegmentedArraySize Size>
+void LaunchStackKernel(const Context& ctx,
+                       const IndexT x_col,
+                       const IndexT x_row,
+                       const IndexT out_col,
+                       const std::vector<const DenseTensor*>& x,
+                       DenseTensor* out) {
+  T* out_ptr = ctx.template Alloc<T>(out);
+  auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row);
+
+  funcs::ConstPointerArraySetter<Context, T, Size> setter(ctx, x);
+  funcs::GeneralDivMod<IndexT> divmoder(x_col);
+  StackCUDAKernel<T, IndexT, decltype(setter.array)>
+      <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+          setter.array, divmoder, x_col, x_row, out_col, out_ptr);
 }
 
 template <typename T, typename Context>
-void StackKernel(const Context& dev_ctx,
+void StackKernel(const Context& ctx,
                  const std::vector<const DenseTensor*>& x,
                  int axis,
                  DenseTensor* out) {
   if (axis < 0) axis += (x[0]->dims().size() + 1);
   int num = static_cast<int>(x.size());
-  T* dst_data = dev_ctx.template Alloc<T>(out);
 
   // Split x dim from axis to matrix
-  int64_t x_row = 1, x_col = 1;
+  int64_t x_row = 1;
   for (int i = 0; i < axis; ++i) {
     x_row *= x[0]->dims()[i];
   }
-  x_col = x[0]->numel() / x_row;
+  int64_t x_col = x[0]->numel() / x_row;
   int64_t out_col = x_col * num;
-  auto config =
-      phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
 
   if (out->numel() < std::numeric_limits<int32_t>::max()) {
-    LaunchStackCUDAKernelWithIndexType<T, int32_t, Context>(
-        dev_ctx, x_col, x_row, out_col, config, x, dst_data);
+    switch (funcs::CalcArraySize(num)) {
+      SEGMENTED_ARRAY_KERNEL_HELPER(
+          LaunchStackKernel<Context, T, int32_t, kArraySize>(
+              ctx, x_col, x_row, out_col, x, out));
+    }
   } else {
-    LaunchStackCUDAKernelWithIndexType<T, int64_t, Context>(
-        dev_ctx, x_col, x_row, out_col, config, x, dst_data);
+    switch (funcs::CalcArraySize(num)) {
+      SEGMENTED_ARRAY_KERNEL_HELPER(
+          LaunchStackKernel<Context, T, int64_t, kArraySize>(
+              ctx, x_col, x_row, out_col, x, out));
    }
   }
 }
 }  // namespace phi
 
 PD_REGISTER_KERNEL(stack,
...
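The forward StackKernel in the diff above applies the same flattening in reverse. A worked example with hypothetical shapes (not taken from the commit):

// x = 6 input tensors, each of shape [2, 3, 4], axis = 1
// x_row   = 2              (product of dims before axis)
// x_col   = 24 / 2 = 12    (elements per input per row)
// out_col = 12 * 6 = 72    (x_col * num); out has shape [2, 6, 3, 4]
// GeneralDivMod(x_col) then maps each output column grid_x to
// (input index, column inside that input) inside StackCUDAKernel.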