diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index 75ac9a81ee20afcb4272c8abacd163ccb1e3fee0..0f8046f8f151d53480cf8054763c4a4ec9209ec1 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "CL/cl.h" #include "common/enforce.h" #include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_tool.h" namespace paddle_mobile { namespace framework { diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index baedef23c194e370c9e5f303789a8f6358f644c2..58c8ea6bc4e76dddf15a263878e5270563d2ed57 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -43,17 +43,28 @@ class CLImage { C = tensor_dims_[1]; H = tensor_dims_[2]; W = tensor_dims_[3]; + + width_of_one_block_ = W; + height_of_one_block_ = H; + } else if (tensor_dims_.size() == 1) { N = 1; C = tensor_dims_[0]; H = 1; W = 1; + + width_of_one_block_ = W; + height_of_one_block_ = H; } DLOG << "-------InitMemory-------"; size_t width = W * ((C + 3) / 4); size_t height = H * N; + + image_width_ = width; + image_height_ = height; + std::unique_ptr imageData{}; int count = 0; if (tensorInput != nullptr) { @@ -95,9 +106,13 @@ class CLImage { 0, // size_t image_row_pitch reinterpret_cast(imageData.get()), // void *host_ptr &err); + if (err != CL_SUCCESS) { // TODO(HaiPeng): error handling + PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); } + + initialized_ = true; } void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); } @@ -109,8 +124,6 @@ class CLImage { const DDim &dims() const { return tensor_dims_; } - std::vector DefaultWorkSize() { return {}; } - cl_mem GetCLImage() const { return cl_image_; } template @@ -120,24 +133,24 @@ class CLImage { inline int64_t numel() const { return product(tensor_dims_); } - int ImageWidth() const { return image_width_; } + inline size_t ImageWidth() const { return image_width_; } - int ImageHeight() const { return image_height_; } + inline size_t ImageHeight() const { return image_height_; } - int CBlock() const { return c_block_; } + inline size_t CBlock() const { return c_block_; } - int WidthOfOneBlock() const { return width_of_one_block_; } + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - int HeightOfOneBlock() const { return height_of_one_block_; } + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } private: bool initialized_ = false; cl_mem cl_image_; - int image_width_; - int width_of_one_block_; - int height_of_one_block_; - int image_height_; - int c_block_; + size_t image_width_; + size_t width_of_one_block_; + size_t height_of_one_block_; + size_t image_height_; + size_t c_block_; DDim tensor_dims_; float *tensor_input_; cl_context context_; diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp index 311f4f1db1b9ed1278190122c20e03f8ef8df9b7..0122f8916f2aa454df7ffe2ebaa8bbc9fe686b7b 100644 --- a/src/framework/loader.cpp +++ b/src/framework/loader.cpp @@ -95,15 +95,15 @@ void Loader::InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool optimize, bool can_add_split, const Program &program, + bool optimize, bool can_add_split, Program *program, const std::shared_ptr &originProgramDesc) { if (optimize) { ProgramOptimize program_optimize; - program.optimizeProgram = + program->optimizeProgram = program_optimize.FusionOptimize(originProgramDesc, can_add_split); } if (optimize) { - program.optimizeProgram->Description("optimize: "); + program->optimizeProgram->Description("optimize: "); } else { originProgramDesc->Description("program: "); } @@ -186,7 +186,7 @@ const Program Loader::LoadProgram( // use originProgramDesc and scope to init tensors InitMemoryFromProgram(originProgramDesc, scope); // perform fusion and print infos - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); return program; @@ -195,7 +195,7 @@ const Program Loader::LoadProgram( template const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - const uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -221,7 +221,7 @@ const Program Loader::LoadCombinedMemory( auto scope = std::make_shared(); program.scope = scope; InitMemoryFromProgram(originProgramDesc, scope); - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, nullptr); return program; diff --git a/src/framework/loader.h b/src/framework/loader.h index ce2ffdb4c27ca8f305c4dfd9b2c0ea987dadb33f..3200f0b25368fa123b80c51000cfd6c6a6d084b6 100644 --- a/src/framework/loader.h +++ b/src/framework/loader.h @@ -46,7 +46,7 @@ class Loader { const Program LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf, + uint8_t *combined_params_buf, bool optimize = false, bool quantification = false); diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 192328a567e6d3bfad7a8a3b35e3bc64131a2cd2..ae3e7b0abea2b2e1fc41962dc1d926fed252e096 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -32,7 +32,7 @@ class Program { bool combined = false; bool quantification = false; size_t combined_params_len; - const uint8_t *combined_params_buf; + uint8_t *combined_params_buf; private: }; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 54a2a4e912266f12c2dd4c232cb3061a7a487bb1..8bea4412ac0b371e029f5aa8914bcb2d6eeb547b 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -68,9 +68,10 @@ bool PaddleMobile::Load(const std::string &model_path, } template -bool PaddleMobile::LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf) { +bool PaddleMobile::LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf) { int batch_size = 1; bool optimise = true; bool quantification = false; diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 2259fee1a53393407f2b8e0c06d0f94cd39da0fe..b53dc5ac9eb1255a51992c3dd4dbbba3f306c467 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -83,7 +83,7 @@ class PaddleMobile { */ bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf); + uint8_t *combined_params_buf); void Clear(); diff --git a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl index d441932dbf0dc5f079c0bdf13a30c8e5b3215cf7..f731a61a82f9d1e7d44e760037512157c4ffef19 100644 --- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + __kernel void elementwise_add(__global float* in, __global float* out) { int num = get_global_id(0); out[num] = in[num] * 0.1 + 102; diff --git a/src/operators/kernel/cl/conv_add_bn_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_kernel.cpp deleted file mode 100644 index f5141015dbc9b5a4d61a71baf42469c95cb0f609..0000000000000000000000000000000000000000 --- a/src/operators/kernel/cl/conv_add_bn_kernel.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBN_OP - -#include "operators/kernel/conv_add_bn_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - return true; -} - -template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) {} - -template class ConvAddBNReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 9fb7b94927e34d11d0a6f54ee3a755bd88e0b292..e62041d3f47aae8dbc9078d49beb84d45c2d9423 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -37,6 +37,7 @@ bool ConvAddBNReluKernel::Init( auto bias_ptr = bias->data(); const int C = mean->numel(); + float inv_std_ptr[C]; for (int i = 0; i < C; i++) { inv_std_ptr[i] = @@ -55,8 +56,13 @@ bool ConvAddBNReluKernel::Init( framework::CLImage *new_scale = new framework::CLImage(); + new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr, + variance->dims()); + framework::CLImage *new_bias = new framework::CLImage(); + new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims()); + param->SetNewScale(new_scale); param->SetNewBias(new_bias); @@ -65,10 +71,23 @@ bool ConvAddBNReluKernel::Init( param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->SetOffset(param->Filter()->dims()[2] / 2 - - static_cast(param->Paddings()[1])); - this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + + param->SetOffset(offset); + + if (param->Filter()->WidthOfOneBlock() == 1 && + param->Filter()->HeightOfOneBlock() == 1) { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1) { + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->WidthOfOneBlock() == 3 && + param->Filter()->HeightOfOneBlock() == 3) { + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } return true; } diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 0eb4fd61afe78654c1988ab0a0c5144e22531b5f..74de92e4c28709a5fdffa99402b1214982475511 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -21,12 +21,63 @@ namespace operators { template <> bool ConvAddKernel::Init(FusionConvAddParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->WidthOfOneBlock() == 1 && + param->Filter()->HeightOfOneBlock() == 1) { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1) { + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->WidthOfOneBlock() == 3 && + param->Filter()->HeightOfOneBlock() == 3) { + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + return true; } template <> void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) {} + const FusionConvAddParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto biase = param.Bias()->GetCLImage(); + auto output = param.Output(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = param.Input()->CBlock(); + int input_width = param.Input()->WidthOfOneBlock(); + int input_height = param.Input()->HeightOfOneBlock(); + + clSetKernelArg(kernel, 0, sizeof(int), &c_block); + clSetKernelArg(kernel, 1, sizeof(int), &w); + clSetKernelArg(kernel, 2, sizeof(int), &nh); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 9, sizeof(int), &stride); + clSetKernelArg(kernel, 10, sizeof(int), &offset); + clSetKernelArg(kernel, 11, sizeof(int), &input_c); + clSetKernelArg(kernel, 12, sizeof(int), &input_width); + clSetKernelArg(kernel, 13, sizeof(int), &input_height); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} template class ConvAddKernel; diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index 1e56b29ef9e9aa5bcc1f40ca3c045f92b6d0c3be..ec265b7992cd62fd4f77399698c377570c2b7a61 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -21,12 +21,62 @@ namespace operators { template <> bool ConvKernel::Init(ConvParam *param) { - // this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->WidthOfOneBlock() == 1 && + param->Filter()->HeightOfOneBlock() == 1) { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1) { + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->WidthOfOneBlock() == 3 && + param->Filter()->HeightOfOneBlock() == 3) { + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + return true; } template <> void ConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->WidthOfOneBlock(); + int input_height = param.Input()->HeightOfOneBlock(); + + clSetKernelArg(kernel, 0, sizeof(int), &c_block); + clSetKernelArg(kernel, 1, sizeof(int), &w); + clSetKernelArg(kernel, 2, sizeof(int), &nh); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 6, sizeof(int), &stride); + clSetKernelArg(kernel, 7, sizeof(int), &offset); + clSetKernelArg(kernel, 8, sizeof(int), &input_c); + clSetKernelArg(kernel, 9, sizeof(int), &dilation); + clSetKernelArg(kernel, 10, sizeof(int), &input_width); + clSetKernelArg(kernel, 11, sizeof(int), &input_height); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + // auto kernel = this->cl_helper_.KernelAt(0); // size_t global_work_size[3] = {1, 2, 3}; // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,