[mobile]Develop common deepwise & fix bug in element mul (#2687)

* [mobile][opencl]common deepwise conv,test=mobile * [mobile][opencl]revert deepwise 3x3 for stable ,test = mobile * [mobile][opencl]format convkernel.inc.cl with clang-format ,test = mobile * [mobile][opencl] suite 1*X Y element_y ,test=mobile * [mobile][opencl] add whole print method for cl_image ,test=mobile

[mobile]Develop common deepwise & fix bug in element mul (#2687)
* [mobile][opencl]common deepwise conv,test=mobile * [mobile][opencl]revert deepwise 3x3 for stable ,test = mobile * [mobile][opencl]format convkernel.inc.cl with clang-format ,test = mobile * [mobile][opencl] suite 1*X Y element_y ,test=mobile * [mobile][opencl] add whole print method for cl_image ,test=mobile
8d332397 · xiebaiyuan · GitHub · 53a5906c · 8d332397 · 8d332397
14 changed file
--- a/mobile/src/framework/cl/cl_image.cpp
+++ b/mobile/src/framework/cl/cl_image.cpp
@@ -18,6 +18,37 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {

+void CLImage::PrintTensor(const CLImage &cl_image) const {
+  size_t width = cl_image.ImageDims()[0];
+  size_t height = cl_image.ImageDims()[1];
+
+  half_t *image_data = new half_t[height * width * 4];
+  cl_int err;
+  cl_mem image = cl_image.GetCLImage();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
+                           region, 0, 0, image_data, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(err);
+
+  PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
+                        "cl_image numel should not be 0 ");
+  float *tensor_data = new float[cl_image.numel()];
+  auto converter = cl_image.Converter();
+  converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
+                         cl_image.dims());
+  int stride = cl_image.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+
+  for (int i = 0; i < cl_image.numel(); i++) {
+    printf("%f \n", tensor_data[i]);
+  }
+
+  delete[](tensor_data);
+  delete[](image_data);
+}
+
 void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
                     cl_command_queue commandQueue, cl_kernel kernel) {
  tensor->mutable_data<float>();

--- a/mobile/src/framework/cl/cl_image.h
+++ b/mobile/src/framework/cl/cl_image.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <iostream>
 #include <memory>
 #include <vector>

@@ -285,6 +286,7 @@ class CLImage {
  cl_event GetClEvent() const { return cl_event_.get(); }

  CLImageConverterBase *Converter() const { return image_converter_; }
+  void PrintTensor(const CLImage &cl_image) const;

 private:
  void InitCLImage(cl_context context, size_t width, size_t height,

--- a/mobile/src/framework/cl/cl_tool.h
+++ b/mobile/src/framework/cl/cl_tool.h
@@ -21,13 +21,14 @@ namespace framework {

 const char* opencl_error_to_str(cl_int error);

-#define CL_CHECK_ERRORS(ERR)                                          \
-  if (ERR != CL_SUCCESS) {                                            \
-    printf(                                                           \
-        "OpenCL error with code %s happened in file %s at line %d. "  \
-        "Exiting.\n",                                                 \
-        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
-        __LINE__);                                                    \
+#define CL_CHECK_ERRORS(ERR)                                                  \
+  if (ERR != CL_SUCCESS) {                                                    \
+    printf(                                                                   \
+        "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
+        "%d. "                                                                \
+        "Exiting.\033[0m\n",                                                  \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,         \
+        __LINE__);                                                            \
  }

 }  // namespace framework

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  cl_int status;
  int index = 0;

-  if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
+  const int filter_height = param.Filter()->dims()[2];
+  const int filter_width = param.Filter()->dims()[3];
+  if (filter_height == 1 && filter_width == 1) {
    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
    CL_CHECK_ERRORS(status);

@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
    CL_CHECK_ERRORS(status);

-    if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
+    if (filter_height == 3 && filter_width == 3) {
      // normal conv
      if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
          param.Filter()->dims()[1] == param.Input()->dims()[1]) {
@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
        status = clSetKernelArg(kernel, index++, sizeof(int), &group);
        CL_CHECK_ERRORS(status);
      }
+    } else if (filter_height != 3 && filter_width != 3) {
+      // not 3x3
+      if (param.Filter()->dims()[1] == 1 &&
+          param.Input()->dims()[1] == param.Output()->dims()[1]) {
+        // deepwise basic use in not 3x3
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
+        CL_CHECK_ERRORS(status);
+
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
+        CL_CHECK_ERRORS(status);
+      }
    }

    status = clEnqueueNDRangeKernel(

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
-     int x = get_global_id(0);
-     int y = get_global_id(1);
-     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-     int2 coords;
-     coords.x = x;
-     coords.y = y;
-     half4 in = read_imageh(input, sampler, coords);
-     half4 biase = read_imageh(bias, sampler, coords);
-     half4 output = in * biase;
-     write_imageh(outputImage,coords,output);
- }
-
-
-__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
-image2d_t outputImage, int w) {
+__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
+                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords);
+  half4 output = in * biase;
+  write_imageh(outputImage, coords, output);
+}
+
+__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
+                          __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  int2 coords_bias;
-  coords_bias.x = x/w;
+  coords_bias.x = x / w;
  coords_bias.y = 0;
  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords_bias);
  half4 output = in * biase;
-  write_imageh(outputImage,coords,output);
+  write_imageh(outputImage, coords, output);
 }
+
+// etc : 1 1 1 72
+// run time Y  [value,0,0,0] * 72
+__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
+                             __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  int2 coords_bias0;
+  int2 coords_bias1;
+  int2 coords_bias2;
+  int2 coords_bias3;
+
+  /*  if (x == 0 && y == 0) {
+      half4 b = (half4){0, 0, 0, 0};
+  #define PPI(j, k)                                                          \
+    b = read_imageh(bias, sampler, (int2){j, k});                            \
+    printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
+           convert_float(b.y), convert_float(b.z), convert_float(b.w));
+      for (int i = 0; i < 73; ++i) {
+        PPI(i, 0);
+      }
+  #undef PPI
+    }*/
+
+  coords_bias0.x = x / w * 4;
+  coords_bias0.y = 0;
+
+  coords_bias1.x = x / w * 4 + 1;
+  coords_bias1.y = 0;
+
+  coords_bias2.x = x / w * 4 + 2;
+  coords_bias2.y = 0;
+
+  coords_bias3.x = x / w * 4 + 3;
+  coords_bias3.y = 0;
+
+  half4 biase0 = read_imageh(bias, sampler, coords_bias0);
+  half4 biase1 = read_imageh(bias, sampler, coords_bias1);
+  half4 biase2 = read_imageh(bias, sampler, coords_bias2);
+  half4 biase3 = read_imageh(bias, sampler, coords_bias3);
+  /*  if (x == 0 && y == 0) {
+      printf("bias0={ %f , %f , %f , %f }\n ",
+             convert_float(biase0.x), convert_float(biase0.y),
+             convert_float(biase0.z), convert_float(biase0.w));
+
+      printf("bias1={ %f , %f , %f , %f }\n ",
+             convert_float(biase1.x), convert_float(biase1.y),
+             convert_float(biase1.z), convert_float(biase1.w));
+      printf("bias2={ %f , %f , %f , %f }\n ",
+             convert_float(biase2.x), convert_float(biase2.y),
+             convert_float(biase2.z), convert_float(biase2.w));
+      printf("bias3={ %f , %f , %f , %f }\n ",
+             convert_float(biase3.x), convert_float(biase3.y),
+             convert_float(biase3.z), convert_float(biase3.w));
+    }*/
+  half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
+  half4 in = read_imageh(input, sampler, coords);
+  half4 output = mad(in, biase, 0);
+  write_imageh(outputImage, coords, output);
+}
\ No newline at end of file
--- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    // other depthwise not with filter 3x3
+    DLOG << "depth_conv basic ";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
                    param.NewScale(), param.NewBias());
      break;

--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    DLOG << "init depwise conv basic";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
                    param.NewBias());
      break;

--- a/mobile/src/operators/kernel/cl/conv_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp
@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
    }
    DLOG << "depth_conv 3x3";

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param);
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {

    DLOG << "depth_conv 3x3";

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true);
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef ELEMENTWISEMUL_OP

 #include "operators/kernel/elementwise_mul_kernel.h"
+#include <framework/cl/cl_half.h>
+#include <iostream>
 #include "framework/cl/cl_image.h"

 namespace paddle_mobile {
@@ -23,19 +25,24 @@ namespace operators {
 template <>
 bool ElementwiseMulKernel<GPU_CL, float>::Init(
    ElementwiseMulParam<GPU_CL> *param) {
-  DLOG << "-----init add-----";
  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
      const_cast<framework::CLImage *>(param->InputY()));
  if (bias->dims() == param->InputX()->dims()) {
+    DLOG << "init element wise mul";
    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 4) {
+  } else if (bias->dims().size() == 1) {
+    DLOG << "init channel_mul";
    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+  } else if (bias->dims().size() == 2) {
+    // etc. input  1 72 28 28
+    // filter 1 72
+    DLOG << "init channel_mul_d2";
+    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
  } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
  }
  return true;
 }
-
 template <>
 void ElementwiseMulKernel<GPU_CL, float>::Compute(
    const ElementwiseMulParam<GPU_CL> &param) {
@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 4) {
-    DLOG << "zp7 444";
+  } else if (bias->dims().size() == 1) {
+    DLOG << "channel mul";
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
    CL_CHECK_ERRORS(status);
    auto width = input->ImageWidth();
    auto height = input->ImageHeight();
-    DLOG << "dede:" << width << "," << height;
    size_t global_work_size[2] = {width, height};
    status =
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
+  } else if (bias->dims().size() == 2) {
+    DLOG << "channel mul d2";
+
+    // etc. input  1 72 28 28
+    // filter 1 72   -->  1 1 1 72
+    DLOG << "input->ImageDims():  " << input->ImageDims();
+    DLOG << "bias->ImageDims():  " << bias->ImageDims();
+    DLOG << "out->ImageDims():  " << output->ImageDims();
+
+    DLOG << "channel mul d2";
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    int tensor_w = input->dims()[input->dims().size() - 1];
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&input_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&bias_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&output_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                            reinterpret_cast<void *>(&tensor_w));
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+
+    //    bias->PrintTensor(*bias);
  } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
  }
 }


--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
    EXEC_SLIDINGWINDOW5x5_FLOAT,
    EXEC_SLIDINGWINDOW7x7_FLOAT,
    EXEC_GEMM1x1s1_FLOAT,
+    EXEC_DEPTHWISEBASIC_FLOAT,
  };

  ExecMode &ExecMode() const { return exec_mode_; }