add cl_caller.

a36284ca · ZhenWang · 34a290c8 · a36284ca · a36284ca · a36284ca
9 changed file
--- a/paddle/fluid/lite/opencl/CMakeLists.txt
+++ b/paddle/fluid/lite/opencl/CMakeLists.txt
@@ -7,7 +7,8 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    set_target_properties(opencl-lib
        PROPERTIES
        IMPORTED_LOCATION
-        ${CMAKE_SOURCE_DIR}/opencl-lib/armeabi-v7a/libOpenCL.so)
+        #${CMAKE_SOURCE_DIR}/opencl-lib/armeabi-v7a/libOpenCL.so)
+        ${CMAKE_SOURCE_DIR}/opencl-lib/armeabi-v7a/libGLES_mali.so)

    cc_library(cl_tool SRCS cl_tool.cc)
    target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
@@ -18,7 +19,8 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
    cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
    cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
-    lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_engine cl_context)
+    cc_library(cl_caller SRCS cl_caller.cc  DEPS cl_helper cl_image)
+    lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller)
    target_link_libraries(test_cl_runtime opencl-lib)
    add_dependencies(cl_tool opencl_clhpp)
 endif()
--- a/paddle/fluid/lite/opencl/cl_caller.cc
+++ b/paddle/fluid/lite/opencl/cl_caller.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_caller.h"
+#include <array>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+// Reads the pixels of `cl_image` back from the device and converts them into
+// `out` through the image's own converter.
+//
+// The read is a blocking enqueueReadImage on CLEngine::Global()'s command
+// queue. `out` is filled by ImageToNCHW using cl_image.tensor_dims(); the
+// caller is expected to supply a buffer large enough for that tensor.
+static void CopyImageData(const CLImage& cl_image, float* out) {
+  CHECK(out != nullptr) << "The output buffer cannot be null.";
+  // Do the size arithmetic in size_t so a large image cannot overflow int.
+  const auto width = static_cast<size_t>(cl_image.image_dims()[0]);
+  const auto height = static_cast<size_t>(cl_image.image_dims()[1]);
+
+  // Four half_t values per pixel. A vector releases the staging buffer on
+  // every exit path and is value-initialised, so a failed read never hands
+  // uninitialised memory to the converter.
+  std::vector<half_t> image_data(height * width * 4);
+  cl::Image* image = cl_image.cl_image();
+  CHECK(image != nullptr) << "The OpenCL image has not been initialized.";
+  const std::array<size_t, 3> origin{0, 0, 0};
+  const std::array<size_t, 3> region{width, height, 1};
+  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+      *image, CL_TRUE, origin, region, 0, 0, image_data.data(), nullptr,
+      nullptr);
+  CL_CHECK_ERRORS(err);
+
+  auto* converter = cl_image.image_converter();
+  converter->ImageToNCHW(image_data.data(), out, cl_image.image_dims(),
+                         cl_image.tensor_dims());
+}
+
+// Hands `cl_path` to the global CLEngine as its OpenCL kernels path and
+// reports whether the engine initialised successfully.
+bool InitOpenCLEngine(std::string cl_path) {
+  auto* const cl_engine = CLEngine::Global();
+  cl_engine->set_cl_path(cl_path);
+  const bool init_ok = cl_engine->IsInitSuccess();
+  return init_ok;
+}
+
+// Runs the "elementwise_add" OpenCL kernel on two host tensors.
+//
+// `in` and `bias` are uploaded as CLImages via InitNormalCLImage, the kernel
+// from elementwise_add_kernel.cl is launched over the input image's
+// width x height, and the result image is converted back into `out`.
+//
+// context:  CLContext used to build the kernel and obtain the command queue.
+// in/bias:  host data described by in_dim / bias_dim.
+// out:      host buffer receiving the tensor described by out_dim.
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim) {
+  CHECK(in != nullptr && bias != nullptr && out != nullptr)
+      << "The input, bias and output buffers cannot be null.";
+
+  CLHelper helper(context);
+  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  // KernelAt() returns a reference to the helper-owned kernel; bind to it
+  // instead of copying the cl::Kernel handle.
+  auto& kernel = helper.KernelAt(0);
+
+  CLImage in_image;
+  in_image.set_tensor_data(in, in_dim);
+  in_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Input image: " << in_image << " --- ";
+
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias, bias_dim);
+  bias_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
+
+  CLImage out_image;
+  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
+
+  cl_int status;
+  status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+
+  // The launch range is taken from the input image.
+  // NOTE(review): out_image is sized from out_dim; this presumes out_dim
+  // yields the same image size as in_dim - confirm with callers.
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
+  CL_CHECK_ERRORS(status);
+
+  // Wait for the kernel before reading the result: the read-back below goes
+  // through CLEngine::Global()'s queue, which this function cannot assume is
+  // the queue the kernel was enqueued on.
+  status = helper.OpenCLCommandQueue().finish();
+  CL_CHECK_ERRORS(status);
+
+  VLOG(3) << " --- Out image: " << out_image << " --- ";
+
+  CopyImageData(out_image, out);
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/opencl/cl_caller.h
+++ b/paddle/fluid/lite/opencl/cl_caller.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+
+namespace paddle {
+namespace lite {
+
+// Sets `cl_path` as the OpenCL kernels path on the global CLEngine and
+// returns whether the engine initialised successfully.
+bool InitOpenCLEngine(std::string cl_path);
+// Runs the "elementwise_add" OpenCL kernel: uploads `in` (in_dim) and `bias`
+// (bias_dim) as images, launches the kernel through `context`, and writes the
+// result tensor (out_dim) into the caller-provided `out` buffer.
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim);
+
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/opencl/cl_engine.cc
+++ b/paddle/fluid/lite/opencl/cl_engine.cc
@@ -133,10 +133,10 @@ bool CLEngine::InitializePlatform() {

 bool CLEngine::InitializeDevice() {
  std::vector<cl::Device> all_devices;
-  status_ = platform_->getDevices(CL_DEVICE_TYPE_DEFAULT, &all_devices);
+  status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
  CL_CHECK_ERRORS(status_);
  if (all_devices.empty()) {
-    LOG(ERROR) << "No OpenCL device found!";
+    LOG(ERROR) << "No OpenCL GPU device found!";
    return false;
  }
  device_ = std::make_shared<cl::Device>();

--- a/paddle/fluid/lite/opencl/cl_helper.cc
+++ b/paddle/fluid/lite/opencl/cl_helper.cc
@@ -21,9 +21,12 @@ limitations under the License. */
 namespace paddle {
 namespace lite {

+// Stores `context` as the CLContext used by this helper's kernel, queue and
+// context accessors.
+void CLHelper::set_context(CLContext *context) {
+  this->context_ = context;
+}
+
 void CLHelper::AddKernel(const std::string &kernel_name,
                         const std::string &file_name,
                         const std::string &options) {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
  VLOG(3) << " --- begin to add kernel ---";
  auto kernel = context_->GetKernel(kernel_name, file_name, options);
  kernels.emplace_back(std::move(kernel));
@@ -32,16 +35,24 @@ void CLHelper::AddKernel(const std::string &kernel_name,

 cl::Kernel &CLHelper::KernelAt(const int index) {
  VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
+  CHECK(static_cast<size_t>(index) < kernels.size())
+      << "The index must be less than the size of kernels.";
+  CHECK(kernels[index] != nullptr)
+      << "The target kernel pointer cannot be null.";
  return *(kernels[index]);
 }

 cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
  return context_->GetCommandQueue();
 }

-cl::Context &CLHelper::OpenCLContext() { return context_->GetContext(); }
+cl::Context &CLHelper::OpenCLContext() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
+  return context_->GetContext();
+}

-std::vector<size_t> CLHelper::DefaultWorkSize(const CLImage &image) {
+cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) {
  // n c h w
  auto image_dim = image.tensor_dims();
  if (image_dim.size() == 4) {
@@ -52,23 +63,26 @@ std::vector<size_t> CLHelper::DefaultWorkSize(const CLImage &image) {
    auto work_size_0 = image_width / w;
    auto work_size_1 = w;
    auto work_size_2 = n * h;
-    return {static_cast<size_t>(work_size_0), static_cast<size_t>(work_size_1),
+    return cl::NDRange{static_cast<size_t>(work_size_0),
+                       static_cast<size_t>(work_size_1),
                       static_cast<size_t>(work_size_2)};
  } else if (image_dim.size() == 2) {
-    return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
                       static_cast<size_t>(image.ImageHeight())};
  } else if (image_dim.size() == 1) {
-    return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
                       static_cast<size_t>(1)};
  } else if (image_dim.size() == 3) {
    auto c = image_dim[0];
    auto h = image_dim[1];
    auto w = image_dim[2];
-    return {static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
+    return cl::NDRange{static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
                       static_cast<size_t>(h)};
  } else {
    LOG(FATAL) << "Not support this dimension, need to be implemented!";
-    return {};
+    return cl::NDRange{};
  }
 }


--- a/paddle/fluid/lite/opencl/cl_helper.h
+++ b/paddle/fluid/lite/opencl/cl_helper.h
@@ -30,6 +30,8 @@ class CLHelper {

  explicit CLHelper(CLContext *context) : context_(context) {}

+  void set_context(CLContext *context);
+
  void AddKernel(const std::string &kernel_name, const std::string &file_name,
                 const std::string &options = "");

@@ -39,10 +41,10 @@ class CLHelper {

  cl::Context &OpenCLContext();

-  std::vector<size_t> DefaultWorkSize(const CLImage &image);
+  cl::NDRange DefaultWorkSize(const CLImage &image);

 private:
-  CLContext *context_;
+  CLContext *context_{nullptr};
  std::vector<std::unique_ptr<cl::Kernel>> kernels;
 };


--- a/paddle/fluid/lite/opencl/cl_image.cc
+++ b/paddle/fluid/lite/opencl/cl_image.cc
@@ -27,12 +27,12 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
  int height = cl_image.image_dims_[1];

  half_t* image_data = new half_t[height * width * 4];
-  cl::Image2D& image = cl_image.cl_image();
+  cl::Image* image = cl_image.cl_image();
  const std::array<size_t, 3> origin{0, 0, 0};
  const std::array<size_t, 3> region{static_cast<size_t>(width),
                                     static_cast<size_t>(height), 1};
  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
-      image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
+      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
  CL_CHECK_ERRORS(err);

  float* tensor_data = new float[cl_image.numel()];
@@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
  return os;
 }

-void CLImage::SetTensorData(float* tensor_data, const DDim& dim) {
+void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) {
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  auto numel = dim.product();
 #else
@@ -65,28 +65,30 @@ void CLImage::SetTensorData(float* tensor_data, const DDim& dim) {
 }

 void CLImage::InitCLImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  // Builds the device image with the folder converter. Tensor data must be
+  // set first; the message names the setter like the sibling Init* methods.
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  image_converter_.reset(new CLImageConverterFolder);
  InitCLImage(context, image_converter_.get());
 }

 void CLImage::InitNormalCLImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  image_converter_.reset(new CLImageConverterNormal);
  InitCLImage(context, image_converter_.get());
 }

 void CLImage::InitNImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
-  image_converter_.reset(new CLImageConverterNWBlock());
+  image_converter_.reset(new CLImageConverterNWBlock);
  InitCLImage(context, image_converter_.get());
 }

 void CLImage::InitDWImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
-  image_converter_.reset(new CLImageConverterDWBlock());
+  image_converter_.reset(new CLImageConverterDWBlock);
  InitCLImage(context, image_converter_.get());
 }

@@ -95,7 +97,7 @@ void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
      << " Empty image tensor data shouldn't have value";

  tensor_dims_ = dim;
-  image_converter_.reset(new CLImageConverterNormal());
+  image_converter_.reset(new CLImageConverterNormal);

  VLOG(3) << " to get image dims ";
  image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
@@ -123,7 +125,7 @@ void CLImage::InitEmptyWithImageDim(const cl::Context& context,

 void CLImage::InitCLImage(const cl::Context& context,
                          CLImageConverterBase* converter) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";

  VLOG(3) << " begin init cl image ";
  image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);

--- a/paddle/fluid/lite/opencl/cl_image.h
+++ b/paddle/fluid/lite/opencl/cl_image.h
@@ -33,11 +33,11 @@ class CLImage {
  /*
   * Will not hold input tensor data, memcpy in this method.
   * */
-  void SetTensorData(float* tensor_data, const DDim& dim);
+  void set_tensor_data(float* tensor_data, const DDim& dim);

  bool IsInit() { return initialized_; }
  /*
-   * Need call SetTensorData first.
+   * Need call set_tensor_data first.
   * Folder when one dim or two dim.
   * */
  void InitCLImage(const cl::Context& context);
@@ -53,7 +53,7 @@ class CLImage {
  void InitEmptyWithImageDim(const cl::Context& context,
                             const DDim& image_dims);

-  cl::Image2D& cl_image() const { return *cl_image_; }
+  cl::Image* cl_image() const { return cl_image_.get(); }

  const DDim& image_dims() const { return image_dims_; }

@@ -63,7 +63,7 @@ class CLImage {

  const DDim& tensor_dims() const { return tensor_dims_; }

  /*
   * Resize original tensor dim.
   * */
  inline CLImage& Resize(const DDim& dims) {

--- a/paddle/fluid/lite/opencl/cl_test.cc
+++ b/paddle/fluid/lite/opencl/cl_test.cc
@@ -12,10 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <memory>
+#include <random>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_caller.h"
 #include "paddle/fluid/lite/opencl/cl_context.h"
 #include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+
+DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");

 namespace paddle {
 namespace lite {
@@ -23,7 +33,7 @@ namespace lite {
 TEST(cl_test, engine_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
-  engine->set_cl_path("/data/local/tmp/opencl");
+  engine->set_cl_path(FLAGS_cl_path);
  engine->platform();
  engine->device();
  engine->command_queue();
@@ -37,11 +47,108 @@ TEST(cl_test, engine_test) {
 TEST(cl_test, context_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
-  engine->set_cl_path("/data/local/tmp/opencl");
+  engine->set_cl_path(FLAGS_cl_path);
  CLContext context;
  context.GetKernel("pool_max", "pool_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
 }
+
+// Drives the elementwise_add kernel by hand through CLHelper on two constant
+// 1024x512 tensors and logs the profiled kernel run time.
+TEST(cl_test, kernel_test) {
+  auto* engine = CLEngine::Global();
+  CHECK(engine->IsInitSuccess());
+  engine->set_cl_path(FLAGS_cl_path);
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper);
+  helper->set_context(context.get());
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("pool_max", "pool_kernel.cl");
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  // KernelAt() returns a reference to the helper-owned kernel; bind to it
+  // instead of copying the cl::Kernel handle.
+  auto& kernel = helper->KernelAt(2);
+
+  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    in_data[i] = 1.f;
+  }
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  CLImage in_image;
+  in_image.set_tensor_data(in_data.get(), in_dim);
+  in_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << in_image;
+
+  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    bias_data[i] = 2.f;
+  }
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias_data.get(), bias_dim);
+  bias_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << bias_image;
+
+  CLImage out_image;
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
+  LOG(INFO) << out_image;
+
+  cl_int status;
+  status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+
+  // The launch range is the input image's width x height.
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  cl::Event event;
+  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
+  CL_CHECK_ERRORS(status);
+
+  // Profiling counters are only available once the command has completed, so
+  // block on the event before querying them.
+  // NOTE(review): profiling also needs a queue created with
+  // CL_QUEUE_PROFILING_ENABLE - confirm in CLEngine.
+  status = event.wait();
+  CL_CHECK_ERRORS(status);
+
+  // Subtract the raw nanosecond counters as integers first; converting each
+  // timestamp to double before subtracting can drop the low bits.
+  const cl_ulong start_nanos =
+      event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+  const cl_ulong stop_nanos =
+      event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+  const double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
+  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
+  LOG(INFO) << out_image;
+}
+
+// End-to-end check of the elementwise_add() caller: feeds two random
+// 1024x512 tensors through the OpenCL kernel and compares the result with a
+// host-side sum.
+TEST(cl_test, elementwise_add_test) {
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-5, 5);
+
+  constexpr int kNumel = 1024 * 512;
+
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  std::unique_ptr<float[]> in_data(new float[kNumel]);
+  for (int i = 0; i < kNumel; i++) {
+    in_data[i] = dist(engine);
+  }
+
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  std::unique_ptr<float[]> bias_data(new float[kNumel]);
+  for (int i = 0; i < kNumel; i++) {
+    bias_data[i] = dist(engine);
+  }
+
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  // Zero-initialise so a kernel that writes nothing is caught by the check
+  // below instead of comparing against garbage.
+  std::unique_ptr<float[]> out(new float[kNumel]());
+
+  bool status = InitOpenCLEngine(FLAGS_cl_path);
+  CHECK(status) << "Fail to initialize OpenCL engine.";
+  CLContext context;
+
+  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
+                  out.get(), out_dim);
+
+  // Print a sparse sample of the output for manual inspection.
+  int stride = kNumel / 20;
+  for (int i = 0; i < kNumel; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  std::cout << std::endl;
+
+  // Verify every element against the host reference. The negated comparison
+  // lets a NaN result propagate into max_abs_err and fail the expectation.
+  float max_abs_err = 0.f;
+  for (int i = 0; i < kNumel; i++) {
+    const float diff = out[i] - (in_data[i] + bias_data[i]);
+    const float abs_diff = diff < 0.f ? -diff : diff;
+    if (!(abs_diff <= max_abs_err)) {
+      max_abs_err = abs_diff;
+    }
+  }
+  // NOTE(review): the tolerance assumes the image path stores values as
+  // half-precision (half_t) - confirm on a device and tighten if possible.
+  EXPECT_LT(max_abs_err, 5e-2f)
+      << "elementwise_add output differs from in + bias.";
+}
+
 }  // namespace lite
 }  // namespace paddle