Commit 5945175f authored by Jiaying Zhao, committed by xiebaiyuan

fix memcpy size in opencl fetch kernel (#1630)

Parent b05774ca
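The essence of the change, as a minimal sketch with illustrative names (HostTensor and CopyFromClBuffer are not paddle-mobile APIs): the destination tensor's memory_size() reflects the size of its backing allocation, which can be larger than the bytes that actually hold valid elements, so the host-side copy now uses sizeof(float) * numel() instead.

```cpp
// Hypothetical sketch of the memcpy-size fix; names are illustrative only.
#include <cstddef>
#include <cstring>
#include <vector>

struct HostTensor {
  std::vector<float> storage;  // backing allocation, may be padded
  size_t numel = 0;            // number of valid elements
  size_t memory_size() const { return storage.size() * sizeof(float); }
  float *data() { return storage.data(); }
};

void CopyFromClBuffer(HostTensor *out, const float *mapped_cl_buffer) {
  // Before: memcpy(out->data(), mapped_cl_buffer, out->memory_size());
  //         which can copy more bytes than the source buffer holds.
  // After:  copy exactly the bytes that hold valid elements.
  std::memcpy(out->data(), mapped_cl_buffer, sizeof(float) * out->numel);
}
```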
......@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
class CLTensor : TensorBase {
class CLTensor : public TensorBase {
public:
CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
......
......@@ -31,8 +31,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
cl_int status;
param.Out()->InitEmptyImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue(), param.Out()->dims());
auto output = param.Out();
const Tensor *input = &param.InputX()->at(col);
// DLOG << *input;
......
......@@ -14,19 +14,13 @@ limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
#include "framework/cl/cl_tensor.h"
// #include "common/common.h"
// #include <iostream>
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
// if (param->InputX()->dims().size() <= 2) {
// this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
// } else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// }
return true;
}
......@@ -40,24 +34,27 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto *out = &param.Out()->at(col);
out->Resize(param.InputX()->dims());
out->mutable_data<float>();
const auto &dim = param.InputX()->dims();
DLOG << "fetch kernel out dims = " << out->dims();
DLOG << "fetch kernel out memory size = " << out->memory_size();
auto dim = param.InputX()->dims();
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
size_t C, in_height, in_width;
size_t in_ch, in_height, in_width;
C = new_dims[1];
in_ch = new_dims[1];
in_height = new_dims[2];
// if (dim.size() <= 2) {
// in_width = param.InputX()->ImageWidth();
// } else {
in_width = new_dims[3];
// }
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * in_ch;
CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
framework::CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
out_cl_tensor.Resize(out->dims());
cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
......@@ -66,35 +63,28 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
// if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
// }
clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto time1 = paddle_mobile::time();
// printf(" before finish \n");
// clFlush(this->cl_helper_.CLCommandQueue());
clFinish(this->cl_helper_.CLCommandQueue());
// printf(" after finish \n");
// auto time2 = paddle_mobile::time();
//
//
// std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
// << "ms" << std::endl;
DLOG << "fetch kernel out dims = " << out->dims();
DLOG << "fetch kernel out memory size = " << out->memory_size();
memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
DLOG << "fetch kernel out_cl_tensor dims = " << out_cl_tensor.dims();
DLOG << "fetch kernel out_cl_tensor memery size = "
<< out_cl_tensor.memory_size();
memcpy(out->data<float>(), out_cl_tensor.Data<float>(),
sizeof(float) * out->numel());
}
template class FetchKernel<GPU_CL, float>;
......
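For reference, the new_dims loop in Compute above right-aligns a lower-rank shape into an NCHW quadruple, padding the leading dimensions with 1. A standalone sketch of that mapping (the function name is illustrative, not part of the project):

```cpp
#include <cstddef>
#include <vector>

// Right-align an up-to-4D shape into {N, C, H, W}, padding leading dims with 1,
// mirroring the new_dims loop in FetchKernel<GPU_CL, float>::Compute.
std::vector<size_t> PadToNCHW(const std::vector<size_t> &dim) {
  std::vector<size_t> new_dims = {1, 1, 1, 1};
  for (size_t j = 0; j < dim.size(); ++j) {
    new_dims[4 - dim.size() + j] = dim[j];
  }
  return new_dims;  // e.g. {3, 224, 224} -> {1, 3, 224, 224}
}
```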
......@@ -21,12 +21,14 @@ int main() {
paddle_mobile::PaddleMobileConfigInternal config;
config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin");
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
#endif
// paddle_mobile.SetThreadNum(4);
auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
std::string(g_super) + "/params", true, false,
......@@ -131,12 +133,12 @@ int main() {
auto time5 = paddle_mobile::time();
vec_result4 = paddle_mobile.Predict(input4, dims4);
auto time6 = paddle_mobile::time();
std::cout << "224*224 predict cost :第" << i << ": "
std::cout << "300*300 predict cost :第" << i << ": "
<< paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
}
auto time4 = paddle_mobile::time();
std::cout << "224*224 predict cost :"
std::cout << "300*300 predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
// biggest = std::max_element(std::begin(vec_result4),
......