Commit 9ef59f74 authored by zp7, committed by Yanzhan Yang

[test=develop] 1. Fix crash when GPU ops scale & elementwise_add input dim… (#1856)

* [test=develop] 1. Fix crash when the GPU scale and elementwise_add ops receive an input whose dim size equals 2.
  2. Add GPU op mul.

* Fix code style. test=develop
Parent 32065859
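
Background for the crash this commit fixes: several GPU_CL kernels read the tensor width as dims()[3], which only exists for 4-D shapes; feed them a 2-D input (for example the output of a fully connected layer) and the index is out of range. A minimal sketch of the old and new access patterns, with illustrative names (DimVec, WidthUnsafe, WidthOr1, LastDim are not repo code):

// Illustrative sketch only -- not part of the commit.
#include <cassert>
#include <cstdint>
#include <vector>

using DimVec = std::vector<int64_t>;  // stand-in for framework::DDim

// Old pattern: silently assumes a 4-D NCHW shape.
int64_t WidthUnsafe(const DimVec &dims) {
  return dims[3];  // out-of-range read for a 2-D tensor -> crash or garbage
}

// Patterns adopted in this commit:
int64_t WidthOr1(const DimVec &dims) {
  return dims.size() == 4 ? dims[3] : 1;  // scale_kernel fix below
}
int64_t LastDim(const DimVec &dims) {
  assert(!dims.empty());
  return dims[dims.size() - 1];           // elementwise_add fix below
}
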
@@ -41,10 +41,11 @@ bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
#endif
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf);
paddle_mobile_->LoadCombinedMemory(
config.memory_pack.model_size, config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf, config.optimize,
config.quantification, config.batch_size, config.lod_mode);
} else if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size,
......
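
The Init() change above forwards optimize, quantification, batch_size and lod_mode to LoadCombinedMemory, so the load-from-memory path now honours the same options as the load-from-directory path. A hedged caller sketch; the field names come from the diff, but the header path, namespace and buffer types are assumptions:

#include <cstdint>
#include "io/paddle_inference_api.h"  // assumed location of PaddleMobileConfig

// Builds a config for the from_memory path; the four flags at the bottom are
// the ones that now reach LoadCombinedMemory as well.
paddle_mobile::PaddleMobileConfig MakeMemoryConfig(uint8_t *model_buf,
                                                   size_t model_size,
                                                   uint8_t *params_buf,
                                                   size_t params_size) {
  paddle_mobile::PaddleMobileConfig config;
  config.memory_pack.from_memory = true;
  config.memory_pack.model_buf = model_buf;
  config.memory_pack.model_size = model_size;
  config.memory_pack.combined_params_buf = params_buf;
  config.memory_pack.combined_params_size = params_size;
  config.optimize = true;
  config.quantification = false;
  config.batch_size = 1;
  config.lod_mode = false;
  return config;
}
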
@@ -28,3 +28,24 @@ __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__wr
half4 output = in + biase;
write_imageh(outputImage,coords,output);
}
__kernel void width_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t
outputImage,int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords_bias);
half4 output;
output.x = in.x + biase.x;
output.y = in.y + biase.x;
output.z = in.z + biase.x;
output.w = in.w + biase.x;
write_imageh(outputImage,coords,output);
}
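
Read on the tensor level, the new width_add kernel broadcasts a length-W bias along the last dimension: the image x coordinate folds back onto the tensor width via x % w, and that single bias value is added to all four packed channel components. A CPU reference sketch of the same broadcast (illustrative names, NCHW layout assumed):

// out[n][c][h][j] = in[n][c][h][j] + bias[j], where bias has length w.
void WidthAddReference(const float *in, const float *bias, float *out,
                       int n, int c, int h, int w) {
  for (int row = 0; row < n * c * h; ++row) {
    for (int j = 0; j < w; ++j) {
      out[row * w + j] = in[row * w + j] + bias[j];
    }
  }
}
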
@@ -23,21 +23,27 @@ template <>
bool ElementwiseAddKernel<GPU_CL, float>::Init(
ElementwiseAddParam<GPU_CL> *param) {
DLOG << "-----init add-----";
CLImage *bias = (CLImage *)(param->InputY());
CLImage *bias = reinterpret_cast<CLImage *>(const_cast<CLImage *>
(param->InputY()));
if (!bias->isInit()) {
bias->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
bias->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
if (bias->dims().size() == 4) {
this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
} else if (param->InputY()->dims().size() == 1) {
this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
if (param->Axis() == param->InputX()->dims().size() - 1) {
this->cl_helper_.AddKernel("width_add", "channel_add_kernel.cl");
} else if (param->Axis() == param->InputX()->dims().size() - 3) {
this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
} else {
DLOG << "error:bias dims is error";
}
} else {
DLOG << "error:bias dims is error";
}
return true;
}
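
With a 1-D bias, Init() now picks the kernel from the broadcast axis instead of always assuming a per-channel add; the last-axis branch is what makes the previously crashing 2-D inputs work. The selection rule restated as a small sketch (the helper name is illustrative):

// axis = param->Axis(), rank = InputX()->dims().size(), bias is 1-D.
inline const char *PickAddKernel(int axis, int rank) {
  if (axis == rank - 1) return "width_add";    // bias of length W, broadcast over N, C, H
  if (axis == rank - 3) return "channel_add";  // bias of length C, broadcast over N, H, W
  return nullptr;                              // unsupported; the op only logs an error
}
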
@@ -70,31 +76,37 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 1) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[3];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
int width = input->ImageWidth();
int height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
cl_event out_event = param.Out()->GetClEvent();
cl_event wait_event = param.InputX()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
if (param.Axis() == param.InputX()->dims().size() - 1 ||
param.Axis() == param.InputX()->dims().size() - 3) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[input->dims().size() - 1];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
int width = input->ImageWidth();
int height = input->ImageHeight();
DLOG << "dede:" << width << "," << height;
size_t global_work_size[2] = {width, height};
cl_event out_event = param.Out()->GetClEvent();
cl_event wait_event = param.InputX()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
DLOG << "error:bias dims is error";
}
} else {
DLOG << "error:bias dims is error";
}
......
@@ -21,11 +21,64 @@ namespace operators {
template <>
bool MulKernel<GPU_CL, float>::Init(MulParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <typename Dtype>
void MulCompute(const MulParam<GPU_CL> &param, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel0,
cl_kernel kernel1) {
auto input_x = param.InputX();
Tensor *input_x_tensor = new Tensor();
input_x_tensor->Resize(input_x->dims());
input_x_tensor->mutable_data<float>();
framework::CLImageToTensor(input_x, input_x_tensor, context, commandQueue,
kernel0);
auto input_y = param.InputY();
Tensor input_y_tensor(input_y->data<float>(), input_y->dims());
const Tensor x_matrix =
input_x_tensor->dims().size() > 2
? framework::ReshapeToMatrix(*input_x_tensor, param.XNumColDims())
: *input_x_tensor;
const Tensor y_matrix =
input_y_tensor.dims().size() > 2
? framework::ReshapeToMatrix(input_y_tensor, param.YNumColDims())
: input_y_tensor;
auto out_dim = param.Out()->dims();
if (out_dim.size() != 2) {
param.Out()->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto output = param.Out();
Tensor *output_tensor = new Tensor();
output_tensor->Resize(output->dims());
output_tensor->mutable_data<float>();
math::MatMul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), output_tensor,
static_cast<float>(0));
output->InitEmptyImage(context, commandQueue, output_tensor->dims());
framework::TensorToCLImage(output_tensor, output, context, commandQueue,
kernel1);
delete (input_x_tensor);
delete (output_tensor);
}
template <>
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {}
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
MulCompute<float>(param, this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue(), kernel0, kernel1);
}
template class MulKernel<GPU_CL, float>;
......
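
The new GPU mul kernel is a correctness fallback rather than a GPU GEMM: it copies the X image back into a CPU tensor, flattens X and Y to matrices, runs math::MatMul on the CPU, and writes the result into a freshly initialised CL image. A sketch of the shape arithmetic, assuming ReshapeToMatrix flattens the leading num_col_dims dimensions into rows and the remaining ones into columns, as in Paddle's mul op (helper names are illustrative):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct MatShape { int64_t rows, cols; };

MatShape FlattenToMatrix(const std::vector<int64_t> &dims, int num_col_dims) {
  auto prod = [](auto first, auto last) {
    return std::accumulate(first, last, int64_t{1}, std::multiplies<int64_t>());
  };
  return {prod(dims.begin(), dims.begin() + num_col_dims),
          prod(dims.begin() + num_col_dims, dims.end())};
}

// Example: X = [2, 3, 4] with x_num_col_dims = 1 -> 2 x 12
//          Y = [12, 5]   with y_num_col_dims = 1 -> 12 x 5
//          Out is resized to {x.rows, y.cols} = [2, 5], matching the Resize above.

Routing through the CPU also explains why Init() only registers the fetch and feed kernels: they serve the image-to-tensor and tensor-to-image copies, not the multiplication itself.
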
@@ -35,7 +35,7 @@ void ScaleKernel<GPU_CL, float>::Compute(const ScaleParam<GPU_CL>& param) {
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
int out_width = output->dims()[3];
int out_width = (output->dims().size() == 4) ? output->dims()[3] : 1;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
clSetKernelArg(kernel, 2, sizeof(float), &scale);
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include <memory>
#include "common/log.h"
#include "common/type_define.h"
#include "common/types.h"
@@ -654,9 +655,9 @@ class MulParam : public OpParam {
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
}
const GType *InputX() const { return input_x_; }
GType *InputX() const { return input_x_; }
const GType *InputY() const { return input_y_; }
GType *InputY() const { return input_y_; }
GType *Out() const { return out_; }
......
@@ -174,6 +174,9 @@ void test(int argc, char *argv[]) {
#ifdef PADDLE_MOBILE_CL
for (auto var_name : var_names) {
auto cl_image = paddle_mobile.FetchImage(var_name);
if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) {
continue;
}
auto len = cl_image->numel();
if (len == 0) {
continue;
......