diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h
index 15ca27fd7c7929aa216e7bbef73e2555028285b8..61082808b53b9edc821daca5c310c8bb4c3b4f98 100644
--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -54,7 +54,7 @@ class CLScope {
     }
 
     auto program =
-        CLEngine::Instance()->CreateProgramWith(context_.get(), file_name);
+        CLEngine::Instance()->CreateProgramWith(context_.get(), "./cl_kernel/" + file_name);
 
     programs_[file_name] = std::move(program);
     status_ = clBuildProgram(program.get(), 0, 0, 0, 0, 0);
diff --git a/src/operators/kernel/cl/cl_kernel/common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h
similarity index 94%
rename from src/operators/kernel/cl/cl_kernel/common.h
rename to src/operators/kernel/cl/cl_kernel/cl_common.h
index 027255d9dc01ad8f6da6e23842a80c029f2698ee..c71967ccd66e26e2afb796e7e6a2cb550f80bb40 100644
--- a/src/operators/kernel/cl/cl_kernel/common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once;
 
-/*
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 inline hafl4 activation(half4 in
 #ifdef PRELU
@@ -32,4 +32,3 @@ inline hafl4 activation(half4 in
 
   return output;
 }
-*/
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index 04ceed8a6e16378f87966e53dff2bc880d3141d6..85a3d0b4093d1803bf5afb25be1bbaee264bd9e3 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -23,9 +23,7 @@ conv_add_bn_relu
 */
 
-/*
-
-#include "common.h"
+#include "cl_common.h"
 
 __kernel void conv_1x1(__private const int global_size_dim0,
                        __private const int global_size_dim1,
@@ -43,8 +41,11 @@ __kernel void conv_1x1(__private const int global_size_dim0,
                        __private const int stride,
                        __private const int offset,
                        __private const int input_c,
+                       __private const int dilation,
                        __private const int input_width,/* of one block */
-                       __private const int input_height/* of one block */) {
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_nh = get_global_id(2);
@@ -114,7 +115,9 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                        __private const int input_c,
                        __private const int dilation,
                        __private const int input_width,/* of one block */
-                       __private const int input_height/* of one block */) {
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
   int2 stride_xy = int2(stride, stride);
   int2 ouput_pos_in_one_block = int2(out_w, out_nh);
   int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset);
@@ -305,5 +308,3 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
   int2 output_pos(out_c * global_size_dim1 + out_w, out_nh);
   write_imageh(output_image, output_pos, output);
 }
-
-*/
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index e62041d3f47aae8dbc9078d49beb84d45c2d9423..fd846be8024bd2742f6825f08993f17dfcd3509a 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -109,8 +109,11 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   int stride = param.Strides()[0];
   int offset = param.Offset();
   int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
   int input_width = param.Input()->WidthOfOneBlock();
   int input_height = param.Input()->HeightOfOneBlock();
+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
 
   clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   clSetKernelArg(kernel, 1, sizeof(int), &w);
@@ -124,8 +127,11 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   clSetKernelArg(kernel, 9, sizeof(int), &stride);
   clSetKernelArg(kernel, 10, sizeof(int), &offset);
   clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 12, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+  clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 15, sizeof(int), &output_width);
+  clSetKernelArg(kernel, 16, sizeof(int), &output_height);
 
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index 74de92e4c28709a5fdffa99402b1214982475511..32466dc17d1a4720071f796d17ed2a08790aea8e 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -59,8 +59,11 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   int stride = param.Strides()[0];
   int offset = param.Offset();
   int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
   int input_width = param.Input()->WidthOfOneBlock();
   int input_height = param.Input()->HeightOfOneBlock();
+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
 
   clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   clSetKernelArg(kernel, 1, sizeof(int), &w);
@@ -68,12 +71,15 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
   clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
   clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-  clSetKernelArg(kernel, 9, sizeof(int), &stride);
-  clSetKernelArg(kernel, 10, sizeof(int), &offset);
-  clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 12, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  clSetKernelArg(kernel, 14, sizeof(int), &output_height);
 
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp
index ec265b7992cd62fd4f77399698c377570c2b7a61..6d62515b8ac227eefa4c2a7f9fa8f86880079e05 100644
--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -60,6 +60,8 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam &param) {
   int dilation = param.Dilations()[0];
   int input_width = param.Input()->WidthOfOneBlock();
   int input_height = param.Input()->HeightOfOneBlock();
+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
 
   clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   clSetKernelArg(kernel, 1, sizeof(int), &w);
@@ -73,14 +75,11 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam &param) {
   clSetKernelArg(kernel, 9, sizeof(int), &dilation);
   clSetKernelArg(kernel, 10, sizeof(int), &input_width);
   clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  clSetKernelArg(kernel, 13, sizeof(int), &output_height);
 
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
-
-  // auto kernel = this->cl_helper_.KernelAt(0);
-  // size_t global_work_size[3] = {1, 2, 3};
-  // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-  //                        global_work_size, NULL, 0, NULL, NULL);
 }
 
 template class ConvKernel<GPU_CL, float>;
diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..73ab8d7e1e328f7577a32199d7a56fc1216d0d83
--- /dev/null
+++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef DEPTHWISECONV_OP
+
+#include "operators/kernel/depthwise_conv_kernel.h"
+#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam *param) {
+  DLOG << " depthwise conv kernel init begin ";
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+  this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  DLOG << " depthwise conv kernel init end ";
+  return true;
+}
+
+template <>
+void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
+
+  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  clSetKernelArg(kernel, 1, sizeof(int), &w);
+  clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+}
+
+template class DepthwiseConvKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
\ No newline at end of file
diff --git a/tools/build.sh b/tools/build.sh
index baa9fe1097b774418899cb20f2f1e63520fa7792..42ea9497a6443d148db58f821d4c30bffdca075a 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -90,6 +90,8 @@ build_for_android() {
    fi
    cd "../build/release/${PLATFORM}"
    make -j 8
+   mkdir ./build/cl_kernel
+   cp ../../../src/operators/kernel/cl/cl_kernel/* ./build/cl_kernel/
 }
 
 build_for_ios() {
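
Reviewer note, not part of the patch: the substance of this change is keeping the host-side clSetKernelArg indices aligned with the updated kernel signatures in conv_kernel.inc.cl after dilation, output_width and output_height were appended. The sketch below is illustrative only and restates the argument order now used by conv_kernel.cpp and depthwise_conv_kernel.cpp; the enum itself is a hypothetical helper, not code in this patch. conv_add_kernel.cpp additionally passes the bias image at index 5, so its integer arguments shift by one, and conv_add_bn_relu_kernel.cpp starts its integer arguments at index 9.

// Illustrative sketch only, assuming the argument order shown in this diff.
// The enum name and its use are hypothetical and not part of the patch.
enum ConvClArgIndex : int {
  kCBlock = 0,    // int: default_work_size[0]
  kW,             // int: default_work_size[1]
  kNH,            // int: default_work_size[2]
  kInput,         // cl_mem: input image (param.Input()->GetCLImage())
  kFilter,        // cl_mem: filter image (param.Filter()->GetCLImage())
  kOutput,        // cl_mem: output image
  kStride,        // int
  kOffset,        // int
  kInputC,        // int: input c block count
  kDilation,      // int: newly passed by conv_add / conv_add_bn_relu here
  kInputWidth,    // int: width of one block
  kInputHeight,   // int: height of one block
  kOutputWidth,   // int: new in this patch
  kOutputHeight   // int: new in this patch
};

// Example use (hypothetical), mirroring depthwise_conv_kernel.cpp above:
//   clSetKernelArg(kernel, kDilation, sizeof(int), &dilation);
//   clSetKernelArg(kernel, kOutputWidth, sizeof(int), &output_width);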