fix out-of-range in conv 1x1 for nano yolo, test=develop (#2246)

61a89ef2 · xiebaiyuan · GitHub · e954a6e5 · 61a89ef2 · 61a89ef2
6 changed files
--- a/mobile/src/framework/cl/cl_image.h
+++ b/mobile/src/framework/cl/cl_image.h
@@ -126,6 +126,9 @@ class CLImage {
  void InitEmptyImage(cl_context context, cl_command_queue command_queue,
                      const DDim &dim) {
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                          " empty image tensor data shouldn't have value");
@@ -153,7 +156,9 @@ class CLImage {
                         const DDim &need_dims, const DDim &real_image_dims) {
    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                          " empty image tensor data shouldn't have value");
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
    // use real image dims to create mem
    real_image_dims_ = real_image_dims;
@@ -178,6 +183,9 @@ class CLImage {
   */
  void InitWithExistMem(cl_context context, cl_command_queue command_queue,
                        DDim need_dims, const CLImage &src) {
+    if (image_converter_ != nullptr) {
+      delete image_converter_;
+    }
    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
    real_image_dims_ = src.real_image_dims_;

--- a/mobile/src/operators/elementwise_mul_op.cpp
+++ b/mobile/src/operators/elementwise_mul_op.cpp
@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
 #endif
+#ifdef PADDLE_MOBILE_CL  // register the OpenCL op only in OpenCL builds
+REGISTER_OPERATOR_CL(elementwise_mul, ops::ElementwiseMulOp);
+#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
 #endif

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -212,6 +212,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
+  int input_c_origin = param.Input()->dims()[1];
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
@@ -284,6 +285,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c_origin);
+    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
    CL_CHECK_ERRORS(status);

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/**
+ * Element-wise product of two images, one work-item per texel:
+ *   outputImage(x, y) = input(x, y) * bias(x, y)
+ *
+ * @param input        first operand, read at the work-item's (x, y)
+ * @param bias         second operand, read at the same (x, y)
+ * @param outputImage  destination image, written at the same (x, y)
+ */
+__kernel void elementwise_mul(__read_only image2d_t input,
+                              __read_only image2d_t bias,
+                              __write_only image2d_t outputImage) {
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  // Integer texel coordinates require an unnormalized sampler; reading with
+  // CLK_NORMALIZED_COORDS_TRUE and int2 coordinates yields undefined values.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int2 coords = (int2)(x, y);
+  const half4 in = read_imageh(input, sampler, coords);
+  const half4 scale = read_imageh(bias, sampler, coords);
+  write_imageh(outputImage, coords, in * scale);
+}
+/**
+ * Multiplies every texel of `input` by a texel taken from row 0 of `bias`:
+ *   outputImage(x, y) = input(x, y) * bias(x / w, 0)
+ *
+ * @param input        operand image, read at the work-item's (x, y)
+ * @param bias         multiplier image; only row 0 is read, at column x / w
+ * @param outputImage  destination image, written at (x, y)
+ * @param w            divisor that maps an output column to a bias column
+ *                     (the host passes the last dimension of the input tensor)
+ */
+__kernel void channel_mul(__read_only image2d_t input,
+                          __read_only image2d_t bias,
+                          __write_only image2d_t outputImage, int w) {
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  // Integer texel coordinates require an unnormalized sampler; reading with
+  // CLK_NORMALIZED_COORDS_TRUE and int2 coordinates yields undefined values.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int2 coords = (int2)(x, y);
+  const int2 coords_bias = (int2)(x / w, 0);
+  const half4 in = read_imageh(input, sampler, coords);
+  const half4 scale = read_imageh(bias, sampler, coords_bias);
+  write_imageh(outputImage, coords, in * scale);
+}
--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEMUL_OP
+#include "operators/kernel/elementwise_mul_kernel.h"
+#include "framework/cl/cl_image.h"
+namespace paddle_mobile {
+namespace operators {
+/**
+ * Picks the OpenCL kernel for this op from the shape of input Y.
+ *
+ *  - Y has the same dims as X  -> "elementwise_mul"
+ *  - otherwise Y is 4-D        -> "channel_mul"
+ *
+ * @param param operator parameters; only the dims of X and Y are read here.
+ * @return true when a kernel was registered; false when Y's dims match
+ *         neither case, so Compute() is never reached without a kernel.
+ */
+template <>
+bool ElementwiseMulKernel<GPU_CL, float>::Init(
+    ElementwiseMulParam<GPU_CL> *param) {
+  DLOG << "-----init mul-----";
+  // Only dims() is queried, so a const view of Y is sufficient.
+  const framework::CLImage *bias = param->InputY();
+  if (bias->dims() == param->InputX()->dims()) {
+    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
+    return true;
+  }
+  if (bias->dims().size() == 4) {
+    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+    return true;
+  }
+  DLOG << "error:bias dims is error";
+  return false;
+}
+/**
+ * Enqueues the kernel chosen in Init() over the whole input image.
+ *
+ * Kernel arguments 0..2 are the X, Y and output images. When Y's dims differ
+ * from X's and Y is 4-D (the "channel_mul" case), argument 3 is the last
+ * dimension of X. Any other shape of Y is logged and nothing is enqueued.
+ *
+ * @param param operator parameters providing X, Y and the output image.
+ */
+template <>
+void ElementwiseMulKernel<GPU_CL, float>::Compute(
+    const ElementwiseMulParam<GPU_CL> &param) {
+  auto input = param.InputX();
+  auto bias = param.InputY();
+  auto output = param.Out();
+
+  const bool same_shape = bias->dims() == input->dims();
+  const bool channel_wise = !same_shape && bias->dims().size() == 4;
+  if (!same_shape && !channel_wise) {
+    // Validate before touching the kernel list: no kernel exists for this
+    // shape.
+    DLOG << "error:bias dims is error";
+    return;
+  }
+
+  auto kernel = this->cl_helper_.KernelAt(0);
+  cl_mem input_image = input->GetCLImage();
+  cl_mem bias_image = bias->GetCLImage();
+  cl_mem output_image = output->GetCLImage();
+
+  cl_int status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+
+  if (channel_wise) {
+    // channel_mul divides the x coordinate by this to find the bias column.
+    cl_int tensor_w =
+        static_cast<cl_int>(input->dims()[input->dims().size() - 1]);
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int), &tensor_w);
+    CL_CHECK_ERRORS(status);
+  }
+
+  // One work-item per texel of the input image.
+  auto width = input->ImageWidth();
+  auto height = input->ImageHeight();
+  size_t global_work_size[2] = {width, height};
+  status =
+      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                             NULL, global_work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+// Explicit instantiation of the OpenCL float kernel in this translation unit.
+template class ElementwiseMulKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif