Merge pull request #1160 from codeWorm2015/opencl

commit event

Merge pull request #1160 from codeWorm2015/opencl
commit event
9ae8d0a1 · Ray Liu · GitHub · 826f68d9 · e6364103 · 9ae8d0a1
9 changed file
--- a/src/framework/cl/cl_deleter.h
+++ b/src/framework/cl/cl_deleter.h
@@ -30,6 +30,13 @@ struct CLMemDeleter {
  }
 };

+struct CLEventDeleter {
+  template <class T>
+  void operator()(T *clEventObj) {
+    clReleaseEvent(clEventObj);
+  }
+};
+
 struct CLCommQueueDeleter {
  template <class T>
  void operator()(T *clQueueObj) {

--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -81,6 +81,12 @@ class CLEngine {
    return std::move(program_ptr);
  }

+  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
+    cl_event event = clCreateUserEvent(context, status_);
+    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
+    return std::move(event_ptr);
+  }
+
  bool BuildProgram(cl_program program) {
    cl_int status;
    status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math", 0, 0);

--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "framework/cl/cl_half.h"
 #include "framework/cl/cl_tool.h"
 #include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"

@@ -97,6 +98,8 @@ class CLImage {
      InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
    }

+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+

 //    InitCLImage(context, command_queue, nullptr, dim);
    initialized_ = true;
@@ -157,6 +160,8 @@ class CLImage {

  const ImageType GetImageType() const { return image_type_; }

+  cl_event GetClEvent() const { return cl_event_.get(); }
+
 private:
  ImageType image_type_ = Invalid;
  void InitCLImage2C(cl_context context, cl_command_queue command_queue,
@@ -295,6 +300,7 @@ class CLImage {

  bool initialized_ = false;
  std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
+  std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
  size_t image_width_;
  size_t width_of_one_block_;
  size_t height_of_one_block_;

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,8 +37,6 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif

-int debug_to = 32;
-
 namespace paddle_mobile {
 namespace framework {

@@ -87,7 +85,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < debug_to; ++j) {
+    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
      DLOG << "create op: " << j << "  " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
@@ -416,7 +414,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    }
  }
 #else
-  for (int i = 0; i < debug_to; i++) {
+  for (int i = 0; i < ops.size(); i++) {
 #ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -433,8 +431,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(

  DLOG << " predict return nullptr";

-  return nullptr;
-
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();

--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -168,20 +168,20 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

-  DLOG << " c block " << c_block;
-  DLOG << " w " << w;
-  DLOG << " nh " << nh;
-  DLOG << " stride " << stride;
-  DLOG << " offset " << offset;
-  DLOG << " input_c " << input_c;
-  DLOG << " dilation " << dilation;
-  DLOG << " input width " << input_width;
-  DLOG << " input height " << input_height;
-  DLOG << " output width " << output_width;
-  DLOG << " output height " << output_height;
-  DLOG << " input dim " << param.Input()->dims();
-  DLOG << " output dim " << param.Output()->dims();
-  DLOG << " filter dim " << param.Filter()->dims();
+//  DLOG << " c block " << c_block;
+//  DLOG << " w " << w;
+//  DLOG << " nh " << nh;
+//  DLOG << " stride " << stride;
+//  DLOG << " offset " << offset;
+//  DLOG << " input_c " << input_c;
+//  DLOG << " dilation " << dilation;
+//  DLOG << " input width " << input_width;
+//  DLOG << " input height " << input_height;
+//  DLOG << " output width " << output_width;
+//  DLOG << " output height " << output_height;
+//  DLOG << " input dim " << param.Input()->dims();
+//  DLOG << " output dim " << param.Output()->dims();
+//  DLOG << " filter dim " << param.Filter()->dims();

  cl_int status;

@@ -236,9 +236,12 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
  CL_CHECK_ERRORS(status);
 }


--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -117,9 +117,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
  CL_CHECK_ERRORS(status);
 }


--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -62,27 +62,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  DLOG << " Compute helper: " << &cl_helper_;
-  DLOG << " begin compute ";
  auto kernel = this->cl_helper_.KernelAt(0);
-  DLOG << " get work size ";
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  DLOG << " end work size ";
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  auto input = param.Input()->GetCLImage();
-
-  DLOG << " get Input ";
-
  auto filter = param.Filter()->GetCLImage();
-
-  DLOG << " get Filter ";
-
  auto output = param.Output()->GetCLImage();

-  DLOG << " get Output ";
-
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
@@ -109,56 +97,27 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  DLOG << " output height " << output_height;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  DLOG << " end set kernel arg ";

-  DLOG << " begin enqueue ";
+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);
  CL_CHECK_ERRORS(status);
-  DLOG << " end enqueue ";
 }

 template class ConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
@@ -76,9 +76,12 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(

  CL_CHECK_ERRORS(status);

+  cl_event out_event = param.Output()->GetClEvent();
+  cl_event wait_event = param.Input()->GetClEvent();
+
  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-                             default_work_size.data(), NULL, 0, NULL, NULL);
+                             default_work_size.data(), NULL, 1, &wait_event, &out_event);

  CL_CHECK_ERRORS(status);
 }

--- a/src/operators/kernel/cl/softmax_kernel.cpp
+++ b/src/operators/kernel/cl/softmax_kernel.cpp
@@ -34,21 +34,13 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();

-  DLOG << " softmax - output image dim " << output->ImageDims();
-  DLOG << " softmax - output image tensor dim " << output->dims();
-
  int group = output->ImageWidth();

  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-  CL_CHECK_ERRORS(status);
-
  status = clSetKernelArg(kernel, 2, sizeof(int), &group);
-  CL_CHECK_ERRORS(status);

 //  const auto &inputDim = input->dims();
 //
@@ -62,7 +54,6 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
 //  clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
 //  clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
 //  clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
-  DLOG << "default_work_size:  " << default_work_size;

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);