From a2bd1692ae63d9bcbf68683811790940e88af16d Mon Sep 17 00:00:00 2001
From: yangfei
Date: Thu, 25 Oct 2018 17:18:18 +0800
Subject: [PATCH] depthwise_conv op kernel for gpu

---
 src/framework/cl/cl_image.h                   |  8 ++
 src/framework/cl/cl_image_converter.cpp       | 92 +++++++++++++++++++
 src/framework/cl/cl_image_converter.h         |  6 ++
 .../kernel/cl/conv_add_bn_relu_kernel.cpp     | 15 +--
 4 files changed, 109 insertions(+), 12 deletions(-)

diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index 90c55aab66..35f60d3b77 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -106,6 +106,14 @@ class CLImage {
     InitCLImage(context, command_queue, folder_converter);
     PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
   }
+  void InitDWImage(cl_context context, cl_command_queue command_queue) {
+    if (tensor_data_ == nullptr) {
+      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
+    }
+    CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock();
+    InitCLImage(context, command_queue, dw_converter);
+    PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
+  }
 
   void InitEmptyImage(cl_context context, cl_command_queue command_queue,
                       const DDim &dim) {
diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp
index ebcfd0d675..13094a8d05 100644
--- a/src/framework/cl/cl_image_converter.cpp
+++ b/src/framework/cl/cl_image_converter.cpp
@@ -297,5 +297,97 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
   DLOG << " init done";
 }
 
+const DDim &CLImageConverterDWBlock::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  size_t N, C, H, W;
+  N = tensor_dim[0];
+  C = tensor_dim[1];
+  H = tensor_dim[2];
+  W = tensor_dim[3];
+  size_t width = W * ((N + 3) / 4);
+  size_t height = C * H;
+  return make_ddim({width, height});
+}
+
+void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
+                                          const DDim &tensor_dim) {
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+
+  size_t N, C, H, W;
+  N = new_dims[1];
+  C = new_dims[0];
+  H = new_dims[2];
+  W = new_dims[3];
+
+  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
+
+  DLOG << " tensor dim " << tensor_dim;
+  DLOG << " image dim " << in_image_dim;
+
+  size_t width = in_image_dim[0];
+  size_t height = in_image_dim[1];
+
+  int w_block = width / W;
+
+  float *p = tensor;
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < w_block * 4; c++) {
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          if (c < C) {
+            // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
+            // (c % 4);
+            image[i2] = Float2Half(*p);
+            i2 += 4;
+            p++;
+          } else {
+            image[i2] = 0.0;
+            i2 += 4;
+          }
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+
+void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
+                                          const DDim &image_dim,
+                                          const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  float *p = tensor;
+  int N = tensor_dim[1];
+  int C = tensor_dim[0];
+  int H = tensor_dim[2];
+  int W = tensor_dim[3];
+  int width = image_dim[0];
+  int height = image_dim[1];
+
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < C; c++) {
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          *p = Half2Float(image[i2]);
+          i2 += 4;
+          p++;
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle_mobile
diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h
index 6b7318e105..02887b0cd4 100644
--- a/src/framework/cl/cl_image_converter.h
+++ b/src/framework/cl/cl_image_converter.h
@@ -69,6 +69,12 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
   void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };
+class CLImageConverterDWBlock : public CLImageConverterBase {
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
+  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim);
+};
 
 }  // namespace framework
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index ecd0c0beef..0bc348e170 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -29,15 +29,6 @@ bool ConvAddBNReluKernel::Init(
                             param->Paddings()[0] == param->Paddings()[1],
                         "need equal");
 
-  auto filter_ddim = param->Filter()->dims();
-
-  std::vector filter_shape(
-      {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]});
-  framework::DDim ddim = framework::make_ddim(filter_shape);
-  if (filter_ddim[1] == 1) {
-    param->Filter()->Resize(ddim);
-  }
-
   param->Bias()->InitCLImage(cl_helper_.CLContext(),
                              cl_helper_.CLCommandQueue());
 
@@ -140,10 +131,10 @@ bool ConvAddBNReluKernel::Init(
     this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
     DLOG << " conv add bn relu conv 1x1";
 
-  } else if (param->Filter()->dims()[0] == 1 &&
+  } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
     this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
     DLOG << " conv add bn relu depth_conv_3x3";
@@ -151,7 +142,7 @@
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                  cl_helper_.CLCommandQueue());
+                                 cl_helper_.CLCommandQueue());
     this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
     DLOG << " conv add bn relu conv_3x3";
 
-- 
GitLab
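
Editor's note (not part of the patch): for a depthwise filter with dims [C, 1, H, W], which is what
the new dims()[1] == 1 branch selects, CLImageConverterDWBlock::NCHWToImage writes an RGBA half
image of width W * ceil(C / 4) and height H, packing four consecutive output channels into the four
components of one texel. The standalone sketch below mirrors that indexing under those assumptions;
DWImagePos and MapDWFilterElement are hypothetical names used only for illustration and do not
exist in paddle-mobile.

#include <cstdio>

// Hypothetical helper: maps element (c, h, w) of a [C, 1, H, W] depthwise
// filter to its position in the RGBA half image laid out by
// CLImageConverterDWBlock::NCHWToImage (image width = W * ceil(C / 4),
// height = H, four consecutive channels per texel).
struct DWImagePos {
  int x;        // image column
  int y;        // image row
  int channel;  // RGBA component, c % 4
};

DWImagePos MapDWFilterElement(int c, int h, int w, int W) {
  DWImagePos pos;
  pos.x = (c / 4) * W + w;
  pos.y = h;
  pos.channel = c % 4;
  return pos;
}

int main() {
  // Example: a 3x3 depthwise filter over 32 channels, i.e. tensor dims
  // [32, 1, 3, 3]. The converter allocates an image of
  // width = 3 * ((32 + 3) / 4) = 24 and height = 3.
  const int C = 32, H = 3, W = 3;
  const int width = W * ((C + 3) / 4);
  DWImagePos pos = MapDWFilterElement(/*c=*/5, /*h=*/1, /*w=*/2, W);
  // Flat index into the half buffer, matching the commented-out formula in
  // the patch: (h * width + (c / 4) * W + w) * 4 + (c % 4).
  int flat = (pos.y * width + pos.x) * 4 + pos.channel;
  std::printf("x=%d y=%d channel=%d flat=%d\n", pos.x, pos.y, pos.channel, flat);
  (void)H;
  return 0;
}

Packing four filter channels into one texel this way appears to let the depth_conv_3x3 kernel fetch
four channels' worth of filter weights with a single image read.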