Merge pull request #1054 from codeWorm2015/opencl

resolve complier error

Merge pull request #1054 from codeWorm2015/opencl
resolve complier error
ae68ce7a · Ray Liu · GitHub · 4d6fdc95 · 3d498fc7 · ae68ce7a
12 changed file
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "CL/cl.h"
 #include "common/enforce.h"
 #include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_tool.h"

 namespace paddle_mobile {
 namespace framework {

--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -43,17 +43,28 @@ class CLImage {
      C = tensor_dims_[1];
      H = tensor_dims_[2];
      W = tensor_dims_[3];
+
+      width_of_one_block_ = W;
+      height_of_one_block_ = H;
+
    } else if (tensor_dims_.size() == 1) {
      N = 1;
      C = tensor_dims_[0];
      H = 1;
      W = 1;
+
+      width_of_one_block_ = W;
+      height_of_one_block_ = H;
    }

    DLOG << "-------InitMemory-------";

    size_t width = W * ((C + 3) / 4);
    size_t height = H * N;
+
+    image_width_ = width;
+    image_height_ = height;
+
    std::unique_ptr<half_t[]> imageData{};
    int count = 0;
    if (tensorInput != nullptr) {
@@ -95,9 +106,13 @@ class CLImage {
        0,       // size_t image_row_pitch
        reinterpret_cast<void *>(imageData.get()),  // void *host_ptr
        &err);
+
    if (err != CL_SUCCESS) {
      // TODO(HaiPeng): error handling
+      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
    }
+
+    initialized_ = true;
  }

  void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); }
@@ -109,8 +124,6 @@ class CLImage {

  const DDim &dims() const { return tensor_dims_; }

-  std::vector<size_t> DefaultWorkSize() { return {}; }
-
  cl_mem GetCLImage() const { return cl_image_; }

  template <typename T>
@@ -120,24 +133,24 @@ class CLImage {

  inline int64_t numel() const { return product(tensor_dims_); }

-  int ImageWidth() const { return image_width_; }
+  inline size_t ImageWidth() const { return image_width_; }

-  int ImageHeight() const { return image_height_; }
+  inline size_t ImageHeight() const { return image_height_; }

-  int CBlock() const { return c_block_; }
+  inline size_t CBlock() const { return c_block_; }

-  int WidthOfOneBlock() const { return width_of_one_block_; }
+  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }

-  int HeightOfOneBlock() const { return height_of_one_block_; }
+  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

 private:
  bool initialized_ = false;
  cl_mem cl_image_;
-  int image_width_;
-  int width_of_one_block_;
-  int height_of_one_block_;
-  int image_height_;
-  int c_block_;
+  size_t image_width_;
+  size_t width_of_one_block_;
+  size_t height_of_one_block_;
+  size_t image_height_;
+  size_t c_block_;
  DDim tensor_dims_;
  float *tensor_input_;
  cl_context context_;

--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -95,15 +95,15 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
 */
 template <typename Dtype, Precision P>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split, const Program<Dtype, P> &program,
+    bool optimize, bool can_add_split, Program<Dtype, P> *program,
    const std::shared_ptr<ProgramDesc> &originProgramDesc) {
  if (optimize) {
    ProgramOptimize program_optimize;
-    program.optimizeProgram =
+    program->optimizeProgram =
        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
  }
  if (optimize) {
-    program.optimizeProgram->Description("optimize: ");
+    program->optimizeProgram->Description("optimize: ");
  } else {
    originProgramDesc->Description("program: ");
  }
@@ -186,7 +186,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
  // use  originProgramDesc and scope to init tensors
  InitMemoryFromProgram(originProgramDesc, scope);
  // perform fusion and print infos
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);

  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
  return program;
@@ -195,7 +195,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
 template <typename Dtype, Precision P>
 const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
    size_t read_size, const uint8_t *buf, size_t combined_params_len,
-    const uint8_t *combined_params_buf, bool optimize, bool quantification) {
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
  bool can_add_split = false;

  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
@@ -221,7 +221,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
  auto scope = std::make_shared<Scope>();
  program.scope = scope;
  InitMemoryFromProgram(originProgramDesc, scope);
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
                                                               nullptr);
  return program;

--- a/src/framework/loader.h
+++ b/src/framework/loader.h
@@ -46,7 +46,7 @@ class Loader {
  const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
                                             const uint8_t *model_buf,
                                             size_t combined_params_len,
-                                             const uint8_t *combined_params_buf,
+                                             uint8_t *combined_params_buf,
                                             bool optimize = false,
                                             bool quantification = false);


--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -32,7 +32,7 @@ class Program {
  bool combined = false;
  bool quantification = false;
  size_t combined_params_len;
-  const uint8_t *combined_params_buf;
+  uint8_t *combined_params_buf;

 private:
 };

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -68,9 +68,10 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
 }

 template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(
-    size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
-    const uint8_t *combined_params_buf) {
+bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
+                                                const uint8_t *model_buf,
+                                                size_t combined_params_len,
+                                                uint8_t *combined_params_buf) {
  int batch_size = 1;
  bool optimise = true;
  bool quantification = false;

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -83,7 +83,7 @@ class PaddleMobile {
   */
  bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                          size_t combined_params_len,
-                          const uint8_t *combined_params_buf);
+                          uint8_t *combined_params_buf);

  void Clear();


--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 __kernel void elementwise_add(__global float* in, __global float* out) {
     int num = get_global_id(0);
     out[num] = in[num] * 0.1  + 102;

--- a/src/operators/kernel/cl/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDBN_OP
-
-#include "operators/kernel/conv_add_bn_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool ConvAddBNReluKernel<GPU_CL, float>::Init(
-    FusionConvAddBNReluParam<GPU_CL> *param) {
-  return true;
-}
-
-template <>
-void ConvAddBNReluKernel<GPU_CL, float>::Compute(
-    const FusionConvAddBNReluParam<GPU_CL> &param) {}
-
-template class ConvAddBNReluKernel<GPU_CL, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -37,6 +37,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  auto bias_ptr = bias->data<float>();

  const int C = mean->numel();
+
  float inv_std_ptr[C];
  for (int i = 0; i < C; i++) {
    inv_std_ptr[i] =
@@ -55,8 +56,13 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(

  framework::CLImage *new_scale = new framework::CLImage();

+  new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr,
+                  variance->dims());
+
  framework::CLImage *new_bias = new framework::CLImage();

+  new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims());
+
  param->SetNewScale(new_scale);

  param->SetNewBias(new_bias);
@@ -65,10 +71,23 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
          param->Paddings()[0] == param->Paddings()[1],
      "need equal");
-  param->SetOffset(param->Filter()->dims()[2] / 2 -
-                   static_cast<int>(param->Paddings()[1]));

+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }

  return true;
 }

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -21,12 +21,63 @@ namespace operators {

 template <>
 bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
  return true;
 }

 template <>
 void ConvAddKernel<GPU_CL, float>::Compute(
-    const FusionConvAddParam<GPU_CL> &param) {}
+    const FusionConvAddParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto biase = param.Bias()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+
+  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  clSetKernelArg(kernel, 1, sizeof(int), &w);
+  clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 9, sizeof(int), &stride);
+  clSetKernelArg(kernel, 10, sizeof(int), &offset);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 12, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+}

 template class ConvAddKernel<GPU_CL, float>;


--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -21,12 +21,62 @@ namespace operators {

 template <>
 bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
-  //  this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
  return true;
 }

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+
+  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  clSetKernelArg(kernel, 1, sizeof(int), &w);
+  clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+
  //  auto kernel = this->cl_helper_.KernelAt(0);
  //  size_t global_work_size[3] = {1, 2, 3};
  //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,