diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
index 4895c07d201283d2b82e52209baf2baa896bc329..b7f4d16c3bb54b7f28d379e38724c5de8cf9dd06 100644
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
@@ -110,4 +110,22 @@ __kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
   half4 in = read_imageh(input, sampler, coords);
   half4 output = mad(in, biase, 0);
   write_imageh(outputImage, coords, output);
+}
+
+__kernel void channel_mul_d4(__global image2d_t input, __global image2d_t bias,
+                             __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  int2 coords_bias;
+  coords_bias.x = x / w;
+  coords_bias.y = 0;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords_bias);
+  half4 output = in * biase;
+  write_imageh(outputImage, coords, output);
 }
\ No newline at end of file
diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
index fd5b9e6bc3ffcce5ddde03e575cec0d1649758fc..37034a01899d8246abfa5dcf419637e643eff924 100644
--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
@@ -30,16 +30,23 @@ bool ElementwiseMulKernel<GPU_CL, float>::Init(
   if (bias->dims() == param->InputX()->dims()) {
     DLOG << "init element wise mul";
     this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 1) {
-    DLOG << "init channel_mul";
-    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 2) {
-    // etc. input 1 72 28 28
-    // filter 1 72
-    DLOG << "init channel_mul_d2";
-    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
   } else {
-    PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
+    const int bias_dim_size = bias->dims().size();
+    if (bias_dim_size == 1) {
+      DLOG << "init channel_mul";
+      this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+    } else if (bias_dim_size == 2) {
+      // etc. input 1 72 28 28
+      // filter 1 72
+      DLOG << "init channel_mul_d2";
+      this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
+    } else if (bias_dim_size == 4) {
+      DLOG << "init channel_mul_d4";
+      this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl");
+    } else {
+      PADDLE_MOBILE_ENFORCE(false,
+                            "element mul not supported this situation yet");
+    }
   }
   return true;
 }
@@ -71,68 +78,103 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
     clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                            global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 1) {
-    DLOG << "channel mul";
-    cl_mem input_image = input->GetCLImage();
-    cl_mem bias_image = bias->GetCLImage();
-    cl_mem output_image = output->GetCLImage();
-    int tensor_w = input->dims()[input->dims().size() - 1];
-    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&input_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&bias_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&output_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
-                            reinterpret_cast<void *>(&tensor_w));
-    CL_CHECK_ERRORS(status);
-    auto width = input->ImageWidth();
-    auto height = input->ImageHeight();
-    size_t global_work_size[2] = {width, height};
-    status =
-        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                               NULL, global_work_size, NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 2) {
-    DLOG << "channel mul d2";
+  } else {
+    const int bias_dim_size = bias->dims().size();
+    if (bias_dim_size == 1) {
+      DLOG << "channel mul";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+    } else if (bias_dim_size == 2) {
+      DLOG << "channel mul d2";
 
-    // etc. input 1 72 28 28
-    // filter 1 72 --> 1 1 1 72
-    DLOG << "input->ImageDims(): " << input->ImageDims();
-    DLOG << "bias->ImageDims(): " << bias->ImageDims();
-    DLOG << "out->ImageDims(): " << output->ImageDims();
+      // etc. input 1 72 28 28
+      // filter 1 72 --> 1 1 1 72
+      DLOG << "input->ImageDims(): " << input->ImageDims();
+      DLOG << "bias->ImageDims(): " << bias->ImageDims();
+      DLOG << "out->ImageDims(): " << output->ImageDims();
 
-    DLOG << "channel mul d2";
-    cl_mem input_image = input->GetCLImage();
-    cl_mem bias_image = bias->GetCLImage();
-    cl_mem output_image = output->GetCLImage();
-    int tensor_w = input->dims()[input->dims().size() - 1];
-    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&input_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&bias_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
-                            reinterpret_cast<void *>(&output_image));
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
-                            reinterpret_cast<void *>(&tensor_w));
-    CL_CHECK_ERRORS(status);
-    auto width = input->ImageWidth();
-    auto height = input->ImageHeight();
-    size_t global_work_size[2] = {width, height};
-    status =
-        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                               NULL, global_work_size, NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
+      DLOG << "channel mul d2";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
 
-    // bias->PrintTensor(*bias);
-  } else {
-    PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
+      // bias->PrintTensor(*bias);
+    } else if (bias_dim_size == 4) {
+      DLOG << "channel_mul_d4";
+      // etc. input 1 72 28 28
+      // filter 1 72 --> 1 1 1 72
+      DLOG << "input->ImageDims(): " << input->ImageDims();
+      DLOG << "bias->ImageDims(): " << bias->ImageDims();
+      DLOG << "out->ImageDims(): " << output->ImageDims();
+
+      DLOG << "channel mul d4";
+      cl_mem input_image = input->GetCLImage();
+      cl_mem bias_image = bias->GetCLImage();
+      cl_mem output_image = output->GetCLImage();
+      int tensor_w = input->dims()[input->dims().size() - 1];
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&input_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&bias_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                              reinterpret_cast<void *>(&output_image));
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                              reinterpret_cast<void *>(&tensor_w));
+      CL_CHECK_ERRORS(status);
+      auto width = input->ImageWidth();
+      auto height = input->ImageHeight();
+      size_t global_work_size[2] = {width, height};
+      status =
+          clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                 NULL, global_work_size, NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+    } else {
+      PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
+    }
   }
 }
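
For reference, a minimal CPU sketch of the per-pixel arithmetic in the new channel_mul_d4 kernel: each work-item at image coordinate (x, y) multiplies the input pixel by the bias pixel at (x / w, 0), where w is the tensor_w value passed by the host (the last dimension of the input), so every horizontal block of w pixels appears to share one bias pixel. The Pixel and Image types below are illustrative stand-ins, not paddle-mobile APIs.

#include <array>
#include <cstdio>
#include <vector>

// Illustrative stand-ins for one half4 pixel and a 2-D image (not paddle-mobile types).
using Pixel = std::array<float, 4>;
struct Image {
  int width;
  int height;
  std::vector<Pixel> data;  // row-major, width * height pixels
  Pixel &at(int x, int y) { return data[y * width + x]; }
};

// CPU reference for the arithmetic of channel_mul_d4:
// out(x, y) = in(x, y) * bias(x / w, 0), component-wise over the 4 channels.
void ChannelMulD4Reference(Image &in, Image &bias, Image &out, int w) {
  for (int y = 0; y < in.height; ++y) {
    for (int x = 0; x < in.width; ++x) {
      const Pixel &a = in.at(x, y);
      const Pixel &b = bias.at(x / w, 0);  // one bias pixel per block of w columns
      for (int c = 0; c < 4; ++c) {
        out.at(x, y)[c] = a[c] * b[c];
      }
    }
  }
}

int main() {
  // Input image 8 pixels wide (two blocks of w = 4), bias image 2 pixels wide.
  Image in{8, 2, std::vector<Pixel>(16, Pixel{1, 2, 3, 4})};
  Image bias{2, 1, std::vector<Pixel>(2, Pixel{0.5f, 0.5f, 0.5f, 0.5f})};
  Image out{8, 2, std::vector<Pixel>(16)};
  ChannelMulD4Reference(in, bias, out, /*w=*/4);
  std::printf("out(0,0)[0] = %g\n", out.at(0, 0)[0]);  // 1 * 0.5 = 0.5
  return 0;
}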