Unverified commit cef884e5, authored by Pei Yang, committed by GitHub

refine concat cuda kernel, test=develop (#2081)

Parent 977a66fc
@@ -21,134 +21,25 @@ namespace kernels {
 namespace cuda {
 using Tensor = lite::Tensor;
-template <typename T>
-__global__ void ConcatKernel(const T** inputs,
-                             const int* input_cols,
-                             int col_size,
-                             const int output_rows,
-                             const int output_cols,
-                             T* output) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = input_cols[0];
-  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = input_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = input_cols[curr_segment + 1];
-    }
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-    const T* input_ptr = inputs[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
-      output[tid_y * output_cols + tid_x] =
-          input_ptr[tid_y * segment_width + local_col];
-  }
-}
-
-template <typename T>
-__device__ void ConcatKernelDetail(const T** inputs_data,
-                                   const int fixed_in_col,
-                                   const int out_rows,
-                                   const int out_cols,
-                                   T* output_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * 1.0 / fixed_in_col;
-    int in_offset = tid_x - split * fixed_in_col;
-    const T* input_ptr = inputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
-      output_data[tid_y * out_cols + tid_x] =
-          input_ptr[tid_y * fixed_in_col + in_offset];
-    }
-  }
-  // for (int i = 0; i < 4; i++){
-  //   printf("input[0][%d] = %.1f\n", i, inputs_data[0][i]);
-  //   printf("output[%d] = %.1f\n", i, output_data[i]);
-  // }
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0,
-                             const T* input_addr1,
-                             const int fixed_in_col,
-                             const int out_rows,
-                             const int out_cols,
-                             T* output_data) {
-  const T* inputs_data[2];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  ConcatKernelDetail<T>(
-      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0,
-                             const T* input_addr1,
-                             const T* input_addr2,
-                             const int fixed_in_col,
-                             const int out_rows,
-                             const int out_cols,
-                             T* output_data) {
-  const T* inputs_data[3];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  ConcatKernelDetail<T>(
-      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0,
-                             const T* input_addr1,
-                             const T* input_addr2,
-                             const T* input_addr3,
-                             const int fixed_in_col,
-                             const int out_rows,
-                             const int out_cols,
-                             T* output_data) {
-  const T* inputs_data[4];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  inputs_data[3] = input_addr3;
-  ConcatKernelDetail<T>(
-      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T** inputs_data,
-                             const int in_num,
-                             const int fixed_in_col,
-                             const int out_rows,
-                             const int out_cols,
-                             T* output_data) {
-  ConcatKernelDetail<T>(
-      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
-}
-
-static inline void GetBlockDims(const CUDAContext& context,
-                                int num_rows,
-                                int num_cols,
-                                dim3* block_dims,
-                                dim3* grid_dims) {
-  // Set the thread block and grid according to CurrentDeviceId
-  const int kThreadsPerBlock = 1024;
-  int block_cols = kThreadsPerBlock;
-  if (num_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((num_cols + 31) >> 5) << 5;
-  }
-  int block_rows = kThreadsPerBlock / block_cols;
-  *block_dims = dim3(block_cols, block_rows, 1);
-  int grid_cols = (num_cols + block_cols - 1) / block_cols;
-  int grid_rows = std::max(num_rows / block_rows, 1);
-  *grid_dims = dim3(grid_cols, grid_rows, 1);
-}
+template <typename Dtype>
+__global__ void Concat(const int num,
+                       const Dtype* in_data,
+                       const int num_concats,
+                       const int concat_size,
+                       const int top_concat_axis,
+                       const int bottom_concat_axis,
+                       const int offset_concat_axis,
+                       Dtype* out_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < num) {
+    const int total_concat_size = concat_size * bottom_concat_axis;
+    const int concat_num = index / total_concat_size;
+    const int concat_index = index % total_concat_size;
+    const int top_index =
+        concat_index +
+        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+    out_data[top_index] = in_data[index];
+  }
+}
 
 void ConcatCompute::Run() {
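Note (not part of the commit): the new kernel assigns one thread per element of the input currently being copied and computes, from that flat index, where the element lands in the concatenated output. The host-side C++ sketch below replays the same index arithmetic sequentially on a small two-input case; the names merely mirror the kernel arguments and the helper itself is illustrative only.

// Illustrative host-side replay of the Concat kernel's index mapping.
// num_concats        = product of dims before the concat axis (outer size)
// concat_size        = product of dims after the concat axis (inner size)
// top_concat_axis    = output extent along the concat axis
// bottom_concat_axis = this input's extent along the concat axis
// offset_concat_axis = extents of the inputs already copied
#include <cstdio>
#include <vector>

void ConcatOneInputRef(const std::vector<float>& in_data,
                       int num_concats,
                       int concat_size,
                       int top_concat_axis,
                       int bottom_concat_axis,
                       int offset_concat_axis,
                       std::vector<float>* out_data) {
  const int total_concat_size = concat_size * bottom_concat_axis;
  const int num = num_concats * total_concat_size;  // elements in this input
  for (int index = 0; index < num; ++index) {       // one CUDA thread per index
    const int concat_num = index / total_concat_size;
    const int concat_index = index % total_concat_size;
    const int top_index =
        concat_index +
        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
    (*out_data)[top_index] = in_data[index];
  }
}

int main() {
  // Concatenate a {1, 2, 2} and a {1, 3, 2} tensor along axis 1 -> {1, 5, 2}.
  std::vector<float> a = {0, 1, 2, 3};
  std::vector<float> b = {10, 11, 12, 13, 14, 15};
  std::vector<float> out(10, 0.f);
  ConcatOneInputRef(a, /*num_concats=*/1, /*concat_size=*/2,
                    /*top_concat_axis=*/5, /*bottom_concat_axis=*/2,
                    /*offset_concat_axis=*/0, &out);
  ConcatOneInputRef(b, 1, 2, 5, 3, /*offset_concat_axis=*/2, &out);
  for (float v : out) printf("%.0f ", v);  // 0 1 2 3 10 11 12 13 14 15
  printf("\n");
  return 0;
}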
@@ -158,105 +49,40 @@ void ConcatCompute::Run() {
   std::vector<Tensor*> input = param.x;
   Tensor* output = param.output;
+  auto* output_data = output->mutable_data<float>(TARGET(kCUDA));
   int axis = param.axis;
-  int in_num = input.size();
-  int in_row = 1;
-  auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    in_row *= dim_0[i];
-  }
-  int in_col = input[0]->numel() / in_row;
-  int out_row = in_row, out_col = 0;
-
-  std::vector<const float*> inputs_data(in_num);
-  std::vector<int> inputs_col(in_num + 1);
-  inputs_col[0] = 0;
-  bool has_same_shape = true;
-  for (int i = 0; i < in_num; ++i) {
-    int t_cols = input[i]->numel() / in_row;
-    if (has_same_shape) {
-      if (t_cols != in_col) has_same_shape = false;
-    }
-    out_col += t_cols;
-    inputs_col[i + 1] = out_col;
-    inputs_data[i] = input[i]->data<float>();
-  }
-
-  dim3 block_dims;
-  dim3 grid_dims;
-  GetBlockDims(ctx, out_row, out_col, &block_dims, &grid_dims);
-
-  const float** dev_ins_data = nullptr;
-  if (!has_same_shape || in_num < 2 || in_num > 4) {
-    float* tmp_dev_ins_data = nullptr;
-    CHECK(cudaSuccess ==
-          cudaMalloc(&tmp_dev_ins_data, inputs_data.size() * sizeof(float*)));
-    CHECK(cudaSuccess == cudaMemcpy(tmp_dev_ins_data,
-                                    static_cast<void*>(inputs_data.data()),
-                                    inputs_data.size() * sizeof(float*),
-                                    cudaMemcpyHostToDevice));
-    dev_ins_data = reinterpret_cast<const float**>(tmp_dev_ins_data);
-  }
-
-  if (has_same_shape) {
-    if (in_num == 2) {
-      ConcatKernel<float><<<grid_dims, block_dims, 0, stream>>>(
-          inputs_data[0],
-          inputs_data[1],
-          in_col,
-          out_row,
-          out_col,
-          output->mutable_data<float>());
-    } else if (in_num == 3) {
-      ConcatKernel<float><<<grid_dims, block_dims, 0, stream>>>(
-          inputs_data[0],
-          inputs_data[1],
-          inputs_data[2],
-          in_col,
-          out_row,
-          out_col,
-          output->mutable_data<float>());
-    } else if (in_num == 4) {
-      ConcatKernel<float><<<grid_dims, block_dims, 0, stream>>>(
-          inputs_data[0],
-          inputs_data[1],
-          inputs_data[2],
-          inputs_data[3],
-          in_col,
-          out_row,
-          out_col,
-          output->mutable_data<float>());
-    } else {
-      ConcatKernel<float><<<grid_dims, block_dims, 0, stream>>>(
-          dev_ins_data,
-          in_num,
-          in_col,
-          out_row,
-          out_col,
-          output->mutable_data<float>());
-      cudaFree(dev_ins_data);
-    }
-  } else {
-    int* tmp_dev_ins_col_data = nullptr;
-    CHECK(cudaSuccess ==
-          cudaMalloc(&tmp_dev_ins_col_data, inputs_col.size() * sizeof(int)));
-    CHECK(cudaSuccess == cudaMemcpy(tmp_dev_ins_col_data,
-                                    static_cast<void*>(inputs_col.data()),
-                                    inputs_col.size() * sizeof(int),
-                                    cudaMemcpyHostToDevice));
-    int* dev_ins_col_data = static_cast<int*>(tmp_dev_ins_col_data);
-    ConcatKernel<float><<<grid_dims, block_dims, 0, stream>>>(
-        dev_ins_data,
-        dev_ins_col_data,
-        static_cast<int>(inputs_col.size()),
-        out_row,
-        out_col,
-        output->mutable_data<float>());
-    cudaFree(dev_ins_data);
-    cudaFree(dev_ins_col_data);
-  }
+  int inner_size = 1;
+  int outer_size = 1;
+  auto input_dims = input[0]->dims();
+  for (int i = 0; i < axis; i++) {
+    outer_size *= input_dims[i];
+  }
+  for (int i = axis + 1; i < input_dims.size(); i++) {
+    inner_size *= input_dims[i];
+  }
+  int all_concat_axis = param.output->dims()[axis];
+  int in_num = input.size();
+  int offset_concat_axis = 0;
+  for (int i = 0; i < in_num; i++) {
+    auto* input_data = input[i]->data<float>();
+    int input_concat_axis = input[i]->dims()[axis];
+    int input_concat_size = input_concat_axis * inner_size;
+    int num = input_concat_size * outer_size;
+    int threads = 1024;
+    int blocks = (num + threads - 1) / threads;
+    Concat<<<blocks, threads, 0, stream>>>(num,
+                                           input_data,
+                                           outer_size,
+                                           inner_size,
+                                           all_concat_axis,
+                                           input_concat_axis,
+                                           offset_concat_axis,
+                                           output_data);
+    offset_concat_axis += input_concat_axis;
+  }
   cudaError_t error = cudaGetLastError();
   if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
 }
 }  // namespace cuda
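Note (not part of the commit): on the host side the rewritten Run() launches the kernel once per input. outer_size is the product of the dimensions before the concat axis, inner_size the product of the dimensions after it, and offset_concat_axis accumulates how far along the concat axis the previous inputs have already written. As an illustration only, the sketch below derives the per-input launch parameters for the shapes used in the updated test: {1, 3, 38, 38}, {1, 4, 38, 38}, {1, 5, 38, 38}, and {1, 6, 38, 38} concatenated along axis 1 into {1, 18, 38, 38}.

// Illustrative sketch of the launch-parameter arithmetic in the new Run().
#include <cstdio>

int main() {
  const int outer_size = 1;        // product of dims before axis 1 (batch of 1)
  const int inner_size = 38 * 38;  // product of dims after axis 1
  const int all_concat_axis = 3 + 4 + 5 + 6;  // output extent along axis 1
  const int input_axis[4] = {3, 4, 5, 6};     // each input's extent along axis 1
  int offset_concat_axis = 0;
  for (int i = 0; i < 4; ++i) {
    const int input_concat_size = input_axis[i] * inner_size;
    const int num = input_concat_size * outer_size;  // elements in input i
    const int threads = 1024;
    const int blocks = (num + threads - 1) / threads;
    printf("input %d: num=%d blocks=%d offset=%d (of %d)\n",
           i, num, blocks, offset_concat_axis, all_concat_axis);
    offset_concat_axis += input_axis[i];
  }
  return 0;
}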
@@ -270,7 +96,6 @@ REGISTER_LITE_KERNEL(concat,
                      kNCHW,
                      paddle::lite::kernels::cuda::ConcatCompute,
                      def)
-    .BindInput("x", {LiteType::GetTensorTy(TARGET(kCUDA))})
-    .BindInput("axis", {LiteType::GetTensorTy(TARGET(kCUDA))})
-    .BindOutput("output", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
     .Finalize();
@@ -126,10 +126,10 @@ TEST(concat, compute_input_multi) {
   lite::Tensor tensorC_ref;
   lite::Tensor tensorD_ref;
 
-  DDimLite ddimA({1, 3, 1, 2});
-  DDimLite ddimB({1, 4, 1, 2});
-  DDimLite ddimC({1, 5, 1, 2});
-  DDimLite ddimD({1, 6, 1, 2});
+  DDimLite ddimA({1, 3, 38, 38});
+  DDimLite ddimB({1, 4, 38, 38});
+  DDimLite ddimC({1, 5, 38, 38});
+  DDimLite ddimD({1, 6, 38, 38});
 
   tensorA.Resize(ddimA);
   tensorB.Resize(ddimB);
@@ -144,6 +144,9 @@ TEST(concat, compute_input_multi) {
   tensorC_ref.Resize(ddimC);
   tensorD_ref.Resize(ddimD);
 
+  out.Resize({1, 18, 38, 38});
+  out_cpu.Resize({1, 18, 38, 38});
+  out_ref.Resize({1, 18, 38, 38});
+
   auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
   auto* out_cpu_data = out_cpu.mutable_data<float>();
   auto* out_ref_data = out_ref.mutable_data<float>();
@@ -215,7 +218,7 @@ TEST(concat, compute_input_multi) {
   concat_compute_ref(param_ref);
   LOG(INFO) << "concat_compute_ref end";
 
-  for (int i = 0; i < out.numel(); i++) {
+  for (int i = 0; i < out_ref.numel(); i++) {
     EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
   }
 }
@@ -27,7 +27,6 @@ using Tensor = lite::Tensor;
 static void ElementwiseAddRef(float* x, float* y, float* out, int num) {
   for (int i = 0; i < num; ++i) {
     out[i] = x[i] + y[i];
-    // LOG(INFO) << x[i] << " + " << y[i] << " = " << out[i];
   }
 }
@@ -120,9 +120,9 @@ void NearestInterpCompute::Run() {
   int in_chw = c * in_hw;
   int out_chw = c * out_hw;
-  int pixelNum = n * out_chw;
+  int pixel_num = n * out_chw;
   int threads = 512;
-  int blocks = (pixelNum + threads - 1) / threads;
+  int blocks = (pixel_num + threads - 1) / threads;
   blocks = blocks > 8 ? 8 : blocks;
 
   KeNearestNeighborInterp<<<blocks, threads, 0, stream>>>(input_data,