From e8d74b3b23f6f4de0646491248da8144a932d786 Mon Sep 17 00:00:00 2001 From: liuruilong Date: Thu, 18 Oct 2018 16:55:22 +0800 Subject: [PATCH] update conv kernel --- src/framework/cl/cl_image.h | 17 +++-- src/framework/executor.cpp | 2 +- src/operators/feed_op.cpp | 4 +- src/operators/kernel/arm/feed_kernel.cpp | 26 +++---- .../cl/cl_kernel/conv_add_bn_relu_kernel.cl | 18 +++-- .../kernel/cl/cl_kernel/conv_add_kernel.cl | 19 +++-- .../kernel/cl/cl_kernel/conv_kernel.cl | 19 +++-- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 21 ++++-- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 12 +++ src/operators/kernel/fpga/feed-kernel.cpp | 74 +++++++++---------- src/operators/kernel/mali/feed_kernel.cpp | 22 +++--- 11 files changed, 137 insertions(+), 97 deletions(-) diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index b7625ca65d..8fc9f49ffa 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -257,16 +257,21 @@ class CLImage { float *p = tensor_data; size_t i0 = 0; for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { + for (int c = 0; c < c_block_ * 4; c++) { size_t i1 = i0 + (c / 4) * W; for (int h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (int w = 0; w < W; w++) { - // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c - // % 4); - imageData[i2] = Float2Half(*p); - i2 += 4; - p++; + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + imageData[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + imageData[i2] = 0.0; + i2 += 4; + } } i1 += width; } diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 7980a2d1f6..22aa7cc98a 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -37,7 +37,7 @@ limitations under the License. */ #include "framework/cl/cl_image.h" #endif -int debug_to = 3; +int debug_to = 2; namespace paddle_mobile { namespace framework { diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index dc6d0e9227..9299d7cc19 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "feed_op.h" +#include "operators/feed_op.h" + namespace paddle_mobile { namespace operators { @@ -22,6 +23,7 @@ void FeedOp::InferShape() const { out_dims[0] = this->param_.BatchSize(); this->param_.Out()->Resize(out_dims); } + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/feed_kernel.cpp b/src/operators/kernel/arm/feed_kernel.cpp index 03b0ec311e..598f6df01b 100644 --- a/src/operators/kernel/arm/feed_kernel.cpp +++ b/src/operators/kernel/arm/feed_kernel.cpp @@ -12,25 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "operators/kernel/feed_kernel.h" namespace paddle_mobile { - namespace operators { +namespace operators { - template <> - bool FeedKernel::Init(FeedParam *param) { - return true; - } +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} - template <> - void FeedKernel::Compute(const FeedParam ¶m) { - param.Out()->ShareDataWith(*(param.InputX())); - param.Out()->set_lod(param.InputX()->lod()); - } +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} - template class FeedKernel; +template class FeedKernel; - } // namespace operators +} // namespace operators } // namespace paddle_mobile - diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl index eb73248d74..ae04c64aa9 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl index 69fc177e9e..708321faf8 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 02717a0853..5e27f49556 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 0ff05d4968..333aa79043 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); } - - - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 1c75b8be9f..0d5dc25157 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -165,6 +165,18 @@ void ConvAddBNReluKernel::Compute( int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; + cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); diff --git a/src/operators/kernel/fpga/feed-kernel.cpp b/src/operators/kernel/fpga/feed-kernel.cpp index c42f3345ea..161d8c9f0c 100644 --- a/src/operators/kernel/fpga/feed-kernel.cpp +++ b/src/operators/kernel/fpga/feed-kernel.cpp @@ -15,41 +15,41 @@ limitations under the License. */ #include "operators/kernel/feed_kernel.h" namespace paddle_mobile { - namespace operators { - - template <> - bool FeedKernel::Init(FeedParam *param) { - Tensor *output = param->Out(); - fpga::format_fp16_ofm(output); - return true; - } - - template <> - void FeedKernel::Compute(const FeedParam ¶m) { - auto input = reinterpret_cast(const_cast(param.InputX())); - auto input_ptr = input->data(); - fpga::format_image(input); - Tensor *output = param.Out(); - auto output_ptr = output->data(); - - fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; - - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - } - template class FeedKernel; - - } // namespace operators +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators } // namespace paddle_mobile - diff --git a/src/operators/kernel/mali/feed_kernel.cpp b/src/operators/kernel/mali/feed_kernel.cpp index 4ca76e91aa..34be184d40 100644 --- a/src/operators/kernel/mali/feed_kernel.cpp +++ b/src/operators/kernel/mali/feed_kernel.cpp @@ -15,23 +15,19 @@ limitations under the License. */ #include "operators/kernel/feed_kernel.h" namespace paddle_mobile { - namespace operators { +namespace operators { +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} - template <> - bool FeedKernel::Init( - FeedParam *param) { - return true; - } +template <> +void FeedKernel::Compute(const FeedParam ¶m) {} - template <> - void FeedKernel::Compute( - const FeedParam ¶m) { - } +template class FeedKernel; - template class FeedKernel; - - } // namespace operators +} // namespace operators } // namespace paddle_mobile #endif -- GitLab