提交 bdf5e1fd 编写于 作者: Z Zhen Wang 提交者: ZhenWang

add opencl support for paddle-lite

上级 7cf536f0
...@@ -150,6 +150,7 @@ option(WITH_LITE "Enable lite framework" OFF)
option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
option(LITE_WITH_CL "Enable OpenCL support in lite" OFF)
option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
...@@ -166,6 +167,12 @@ endif()
include_directories("${PADDLE_SOURCE_DIR}")
# for opencl
if (LITE_WITH_CL)
include(external/opencl-headers)
include(external/opencl-clhpp)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    message(STATUS "Building the mobile framework")
......
...@@ -176,6 +176,10 @@ if (LITE_WITH_ARM)
    add_definitions("-DLITE_WITH_ARM")
endif()

if (LITE_WITH_CL)
    add_definitions("-DLITE_WITH_CL")
endif()

if (LITE_WITH_PROFILE)
    add_definitions("-DLITE_WITH_PROFILE")
endif()
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)

set(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp)
set(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
set(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)

include_directories(${OPENCL_CLHPP_INCLUDE_DIR})

# Fetch and build the Khronos OpenCL C++ bindings (cl2.hpp) as an external
# project. Headers are installed into OPENCL_CLHPP_INSTALL_DIR, which is what
# OPENCL_CLHPP_INCLUDE_DIR (added to the include path above) points at.
ExternalProject_Add(
    opencl_clhpp
    GIT_REPOSITORY   "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
    GIT_TAG          "v2.0.10"
    PREFIX           "${OPENCL_CLHPP_SRCS_DIR}"
    CMAKE_ARGS       -DBUILD_DOCS=OFF
                     -DBUILD_EXAMPLES=OFF
                     -DBUILD_TESTS=OFF
                     -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
                     ${OPTIONAL_ARGS}
                     ${EXTERNAL_OPTIONAL_ARGS}
    # FIX: this previously pointed at ${GTEST_INSTALL_DIR} (copy-paste from
    # the gtest external project), installing the headers into the wrong
    # location. It must agree with OPENCL_CLHPP_INSTALL_DIR above.
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)

# The CLHPP headers wrap the C headers fetched by the opencl_headers project,
# so make sure those are downloaded first.
add_dependencies(opencl_clhpp opencl_headers)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)

set(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers)
set(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers/opencl20" CACHE PATH "opencl-headers include directory." FORCE)

include_directories(${OPENCL_HEADERS_INCLUDE_DIR})

# Header-only dependency: clone the Khronos OpenCL C headers at a pinned
# revision. No configure/build/install steps are needed, so every command is
# explicitly disabled.
ExternalProject_Add(
    opencl_headers
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/KhronosGroup/OpenCL-Headers.git"
    GIT_TAG           "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
    PREFIX            ${OPENCL_HEADERS_SRCS_DIR}
    DOWNLOAD_NAME     "OpenCL-Headers"
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     ""
    INSTALL_COMMAND   ""
    TEST_COMMAND      ""
)
...@@ -182,6 +182,7 @@ add_subdirectory(x86)
add_subdirectory(arm)
add_subdirectory(host)
add_subdirectory(cuda)
add_subdirectory(opencl)
add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
......
# Skip this directory entirely unless OpenCL support was requested.
if (NOT LITE_WITH_CL)
    return()
endif()

# Locate the system OpenCL runtime (libOpenCL); the result may be NOTFOUND,
# which is only surfaced via the status message below — TODO(review): confirm
# whether a hard failure is preferable here.
find_library(opencl-lib
        NAMES OpenCL)
message(STATUS "The OpenCL library path : ${opencl-lib}")

# NOTE(review): presumably needed because the OpenCL C++ bindings violate
# strict-aliasing rules — applies to every target in this directory.
add_compile_options(-fno-strict-aliasing)

# Runtime wrapper libraries, ordered by dependency (tool -> engine ->
# context -> helper; image depends on the converter and lite_tensor).
cc_library(cl_tool SRCS cl_tool.cc)
cc_library(cl_half SRCS cl_half.cc)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter)

cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_engine cl_context)
target_link_libraries(test_cl_runtime ${opencl-lib})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

// Target OpenCL 2.0 for both the C API and the C++ bindings, while allowing
// execution on devices that implement only OpenCL 1.1. These macros must be
// defined before cl2.hpp is included, which is why all OpenCL users include
// this header instead of <CL/cl2.hpp> directly.
#define CL_TARGET_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_HPP_MINIMUM_OPENCL_VERSION 110

#include <CL/cl2.hpp>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <string>
#include <utility>
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Returns the process-wide command queue owned by the global CLEngine.
cl::CommandQueue &CLContext::GetCommandQueue() {
  return CLEngine::Global()->command_queue();
}

// Returns the cl::Context owned by the global CLEngine.
cl::Context &CLContext::GetContext() { return CLEngine::Global()->context(); }
// Returns a built cl::Program for the given kernel source file, building and
// caching it on first use. The cache key is file_name + options, so the same
// file built with different options yields distinct cached programs.
cl::Program &CLContext::GetProgram(const std::string &file_name,
                                   const std::string &options) {
  std::string program_key = file_name;
  if (!options.empty()) {
    program_key += options;
  }
  auto it = programs_.find(program_key);
  if (it != programs_.end()) {
    VLOG(3) << " --- program -> " << program_key << " has been built --- ";
    return *(it->second);
  }
  // Kernel sources are loaded from <cl_path>/cl_kernel/<file_name>.
  auto program = CLEngine::Global()->CreateProgram(
      GetContext(), CLEngine::Global()->cl_path() + "/cl_kernel/" + file_name);
  VLOG(3) << " --- begin build program -> " << program_key << " --- ";
  // NOTE(review): BuildProgram's return value is ignored here — a failed
  // build still gets cached. Confirm whether that is intended.
  CLEngine::Global()->BuildProgram(program.get(), options);
  VLOG(3) << " --- end build program -> " << program_key << " --- ";
  programs_[program_key] = std::move(program);
  return *(programs_[program_key]);
}
// Creates a kernel named `kernel_name` from the (cached) program built from
// `file_name` with `options`. Ownership of the cl::Kernel transfers to the
// caller; the underlying program stays cached in this context.
std::unique_ptr<cl::Kernel> CLContext::GetKernel(const std::string &kernel_name,
                                                 const std::string &file_name,
                                                 const std::string &options) {
  cl_int status{CL_SUCCESS};
  VLOG(3) << " --- to get program " << file_name << " --- ";
  auto program = GetProgram(file_name, options);
  VLOG(3) << " --- end get program --- ";
  VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
  std::unique_ptr<cl::Kernel> kernel(
      new cl::Kernel(program, kernel_name.c_str(), &status));
  CL_CHECK_ERRORS(status);
  VLOG(3) << " --- end create kernel --- ";
  // Return the local directly: `return std::move(kernel)` inhibited copy
  // elision (a pessimizing move); the local is implicitly moved anyway.
  return kernel;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/lite/opencl/cl2_header.h"
namespace paddle {
namespace lite {
// Caches built OpenCL programs and hands out kernels created from them.
// The actual platform/device/queue state lives in the global CLEngine.
class CLContext {
 public:
  // Command queue of the global CLEngine.
  cl::CommandQueue &GetCommandQueue();

  // cl::Context of the global CLEngine.
  cl::Context &GetContext();

  // Returns a built program for file_name (+ build options), building and
  // caching it on first use.
  cl::Program &GetProgram(const std::string &file_name,
                          const std::string &options);

  // Creates a new kernel from the cached program; the caller owns it.
  std::unique_ptr<cl::Kernel> GetKernel(const std::string &kernel_name,
                                        const std::string &file_name,
                                        const std::string &options);

 private:
  // Key: file_name + options -> built program.
  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
// Returns the lazily-created, process-wide engine. Init() is invoked on every
// call but is a no-op after the first (guarded by initialized_).
// NOTE(review): the function-local static's construction is thread-safe in
// C++11, but the Init() call itself is not synchronized — confirm the first
// use is single-threaded.
CLEngine* CLEngine::Global() {
  static CLEngine cl_engine_;
  cl_engine_.Init();
  return &cl_engine_;
}
CLEngine::~CLEngine() {
  // Drain all queued work before tearing anything down.
  if (command_queue_ != nullptr) {
    command_queue_->finish();
  }
  // For controlling the destruction order:
  // queue -> context -> device -> platform.
  command_queue_.reset();
  context_.reset();
  device_.reset();
  platform_.reset();
}
// One-time initialization of platform and device; idempotent.
// NOTE(review): the return value is `initialized_` ("Init has run"), NOT
// whether initialization succeeded — callers must use IsInitSuccess() for
// that. Confirm this asymmetry is intentional.
bool CLEngine::Init() {
  if (initialized_) {
    return true;
  }
  bool is_platform_init = InitializePlatform();
  bool is_device_init = InitializeDevice();
  is_init_success_ = is_platform_init && is_device_init;
  initialized_ = true;
  return initialized_;
}
// Accessors: platform/device CHECK-fail when Init() never populated them;
// context and command queue are created lazily on first access.

cl::Platform& CLEngine::platform() {
  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
  return *platform_;
}

cl::Context& CLEngine::context() {
  if (context_ == nullptr) {
    context_ = CreateContext();
  }
  return *context_;
}

cl::Device& CLEngine::device() {
  CHECK(device_ != nullptr) << "device_ is not initialized!";
  return *device_;
}

cl::CommandQueue& CLEngine::command_queue() {
  if (command_queue_ == nullptr) {
    command_queue_ = CreateCommandQueue(context());
  }
  return *command_queue_;
}
// Reads the kernel source at `file_name` and wraps it in a cl::Program bound
// to `context`. The program is NOT built yet; call BuildProgram() afterwards.
// CHECK-fails if the file cannot be opened or is empty; the OpenCL status of
// program creation is recorded in status_.
std::unique_ptr<cl::Program> CLEngine::CreateProgram(const cl::Context& context,
                                                     std::string file_name) {
  // Open at the end (std::ios::ate) so tellg() yields the file size.
  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
  CHECK(file.is_open()) << "Can't open file from " << file_name;
  auto size = file.tellg();
  CHECK(size > 0) << "size is too small.";
  std::string content(size, '\0');
  file.seekg(0);
  file.read(&content[0], size);
  cl::Program::Sources sources;
  sources.push_back(content);
  auto prog =
      std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
  LOG(INFO) << "OpenCL kernel file name: " << file_name;
  LOG(INFO) << "Program source size: " << content.size();
  CL_CHECK_ERRORS(status_);
  // Return the local directly: `return std::move(prog)` was a pessimizing
  // move that inhibited copy elision.
  return prog;
}
// Creates a user event in `context`; the OpenCL error code is recorded in
// status_ and checked via CL_CHECK_ERRORS.
std::unique_ptr<cl::UserEvent> CLEngine::CreateEvent(
    const cl::Context& context) {
  auto event =
      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
  CL_CHECK_ERRORS(status_);
  // Return the local directly: `return std::move(event)` was a pessimizing
  // move that inhibited copy elision.
  return event;
}
// Builds `program` for the selected device with `options` plus default flags.
// Returns false on failure after logging the device's build log.
bool CLEngine::BuildProgram(cl::Program* program, const std::string& options) {
  // "-I <cl_path>/cl_kernel" lets kernel sources #include shared headers.
  std::string build_option = options + " -cl-fast-relaxed-math -I " +
                             CLEngine::Global()->cl_path() + "/cl_kernel";
  status_ = program->build({*device_}, build_option.c_str());
  CL_CHECK_ERRORS(status_);
  if (status_ != CL_SUCCESS) {
    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
        CL_BUILD_ERROR) {
      std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
      LOG(INFO) << "Program build error: " << log;
    }
    return false;
  }
  return true;
}
bool CLEngine::InitializePlatform() {
std::vector<cl::Platform> all_platforms;
status_ = cl::Platform::get(&all_platforms);
CL_CHECK_ERRORS(status_);
if (all_platforms.empty()) {
LOG(ERROR) << "No OpenCL platform found!";
return false;
}
platform_ = std::make_shared<cl::Platform>();
*platform_ = all_platforms[0];
return true;
}
// Picks the platform's default device and logs its capabilities. Returns
// false when no device exists or when 2-D image support is missing; a
// missing cl_khr_fp16 extension is only logged, not treated as fatal.
bool CLEngine::InitializeDevice() {
  std::vector<cl::Device> all_devices;
  status_ = platform_->getDevices(CL_DEVICE_TYPE_DEFAULT, &all_devices);
  CL_CHECK_ERRORS(status_);
  if (all_devices.empty()) {
    LOG(ERROR) << "No OpenCL device found!";
    return false;
  }
  device_ = std::make_shared<cl::Device>();
  *device_ = all_devices[0];
  auto device_name = device_->getInfo<CL_DEVICE_NAME>();
  LOG(INFO) << "Using device: " << device_name;
  // Image2D support is mandatory: CLImage stores tensors as 2-D images.
  auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
  if (image_support) {
    LOG(INFO) << "The chosen device supports image processing.";
  } else {
    LOG(ERROR) << "The chosen device doesn't support image processing!";
    return false;
  }
  // Half precision (cl_khr_fp16) is preferred but optional.
  auto ext_data = device_->getInfo<CL_DEVICE_EXTENSIONS>();
  if (ext_data.find("cl_khr_fp16") != std::string::npos) {
    LOG(INFO) << "The chosen device supports the half data type.";
  } else {
    LOG(INFO) << "The chosen device doesn't support the half data type!";
    LOG(INFO) << "The extensions supported by this device: " << ext_data;
    // LOG(ERROR) << "The chosen platform doesn't support the half data type!";
    // return false;
  }
  auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
  LOG(INFO) << "The chosen device has " << max_units << " compute units.";
  auto local_mem = device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
  LOG(INFO) << "The local memory size of the chosen device is "
            << static_cast<float>(local_mem) / 1024 << " KB.";
  return true;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Process-wide owner of all OpenCL state: platform, device, context and
// command queue. Access via Global(); members are created lazily and torn
// down in reverse dependency order by the destructor.
class CLEngine {
 public:
  // Singleton accessor; runs Init() on every call (no-op after the first).
  static CLEngine* Global();

  // One-time platform/device initialization. Returns whether Init has run,
  // not whether it succeeded — see IsInitSuccess().
  bool Init();

  cl::Platform& platform();
  cl::Context& context();
  cl::Device& device();
  cl::CommandQueue& command_queue();

  // Loads a kernel source file into an (unbuilt) program.
  std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
                                             std::string file_name);

  // Creates a user event bound to `context`.
  std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);

  // Builds `program` with `options` plus default flags; false on failure.
  bool BuildProgram(cl::Program* program, const std::string& options = "");

  bool IsInitSuccess() { return is_init_success_; }

  // Root directory containing the cl_kernel/ source folder.
  std::string cl_path() { return cl_path_; }
  void set_cl_path(std::string cl_path) { cl_path_ = cl_path; }

 private:
  CLEngine() = default;
  ~CLEngine();
  bool InitializePlatform();
  bool InitializeDevice();

  // Lazily builds a context on the selected device.
  std::shared_ptr<cl::Context> CreateContext() {
    auto context = std::make_shared<cl::Context>(
        std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr, &status_);
    CL_CHECK_ERRORS(status_);
    return context;
  }

  // Lazily builds an in-order command queue on `context`.
  std::shared_ptr<cl::CommandQueue> CreateCommandQueue(
      const cl::Context& context) {
    auto queue =
        std::make_shared<cl::CommandQueue>(context, device(), 0, &status_);
    CL_CHECK_ERRORS(status_);
    return queue;
  }

  std::string cl_path_;
  std::shared_ptr<cl::Platform> platform_{nullptr};
  std::shared_ptr<cl::Context> context_{nullptr};
  std::shared_ptr<cl::Device> device_{nullptr};
  std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
  // Last OpenCL status code produced by any engine operation.
  cl_int status_{CL_SUCCESS};
  bool initialized_{false};
  bool is_init_success_{false};
};
} // namespace lite
} // namespace paddle
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <cstdint>

namespace paddle {
namespace lite {

// Half-precision value stored as a plain 16-bit integer.
typedef uint16_t half_t;

// Scalar float <-> half conversions.
half_t Float2Half(float f);
float Half2Float(half_t h);

// Bulk conversions over `count` elements.
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);

}  // namespace lite
}  // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
// Builds (or fetches the cached) program for `file_name`, creates kernel
// `kernel_name` from it, and appends it to this helper's kernel list.
void CLHelper::AddKernel(const std::string &kernel_name,
                         const std::string &file_name,
                         const std::string &options) {
  VLOG(3) << " --- begin to add kernel ---";
  auto kernel = context_->GetKernel(kernel_name, file_name, options);
  kernels.emplace_back(std::move(kernel));
  VLOG(3) << " --- end to add kernel --- ";
}
// Returns the kernel added by the index-th AddKernel call.
// NOTE(review): no bounds check — an out-of-range index is undefined.
cl::Kernel &CLHelper::KernelAt(const int index) {
  VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
  return *(kernels[index]);
}

// Command queue of the underlying CLContext.
cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
  return context_->GetCommandQueue();
}

// cl::Context of the underlying CLContext.
cl::Context &CLHelper::OpenCLContext() { return context_->GetContext(); }
// Derives a default 3-D global work size from the image's original tensor
// rank (tensor layout is NCHW). Dies with LOG(FATAL) on unsupported ranks.
std::vector<size_t> CLHelper::DefaultWorkSize(const CLImage &image) {
  // n c h w
  auto dims = image.tensor_dims();
  switch (dims.size()) {
    case 4: {
      auto n = dims[0];
      auto h = dims[2];
      auto w = dims[3];
      auto image_width = image.ImageWidth();
      return {static_cast<size_t>(image_width / w),
              static_cast<size_t>(w),
              static_cast<size_t>(n * h)};
    }
    case 3: {
      auto c = dims[0];
      auto h = dims[1];
      auto w = dims[2];
      return {static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
              static_cast<size_t>(h)};
    }
    case 2:
      return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
              static_cast<size_t>(image.ImageHeight())};
    case 1:
      return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
              static_cast<size_t>(1)};
    default:
      LOG(FATAL) << "Not support this dimension, need to be implemented!";
      return {};
  }
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
namespace paddle {
namespace lite {
// Thin convenience wrapper around a CLContext: registers kernels once and
// hands out references plus a default NDRange for a given CLImage.
class CLHelper {
 public:
  CLHelper() = default;

  // Does not take ownership of `context`; it must outlive this helper.
  explicit CLHelper(CLContext *context) : context_(context) {}

  // Builds/fetches the program for `file_name` and appends kernel
  // `kernel_name` to the internal list.
  void AddKernel(const std::string &kernel_name, const std::string &file_name,
                 const std::string &options = "");

  // Kernel added by the index-th AddKernel call (no bounds check).
  cl::Kernel &KernelAt(const int index);

  cl::CommandQueue &OpenCLCommandQueue();
  cl::Context &OpenCLContext();

  // Default 3-D global work size derived from the image's tensor rank.
  std::vector<size_t> DefaultWorkSize(const CLImage &image);

 private:
  CLContext *context_;  // not owned
  std::vector<std::unique_ptr<cl::Kernel>> kernels;
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image.h"
#include <glog/logging.h>
#include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1];
half_t* image_data = new half_t[height * width * 4];
cl::Image2D& image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
float* tensor_data = new float[cl_image.numel()];
auto* converter = cl_image.image_converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.image_dims_,
cl_image.tensor_dims_);
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
os << " dims: " << cl_image.tensor_dims_ << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
os << tensor_data[i] << " ";
}
delete[] tensor_data;
delete[] image_data;
return os;
}
// Copies numel(dim) floats from `tensor_data` into an internally owned
// buffer; the caller keeps ownership of its own pointer.
void CLImage::SetTensorData(float* tensor_data, const DDim& dim) {
// DDim spells "number of elements" differently on the two build flavors.
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  auto numel = dim.product();
#else
  auto numel = dim.production();
#endif
  tensor_data_.reset(new float[numel]);
  memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float));
  tensor_dims_ = dim;
}
// Uploads the tensor using the "folder" layout (compact form for 1-D/2-D
// tensors). Requires SetTensorData() to have been called first.
void CLImage::InitCLImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
  image_converter_.reset(new CLImageConverterFolder);
  InitCLImage(context, image_converter_.get());
}

// Uploads the tensor using the default (normal) NCHW image layout.
void CLImage::InitNormalCLImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
  image_converter_.reset(new CLImageConverterNormal);
  InitCLImage(context, image_converter_.get());
}

// Uploads a 4-D tensor using the NW-block converter layout.
void CLImage::InitNImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
  image_converter_.reset(new CLImageConverterNWBlock());
  InitCLImage(context, image_converter_.get());
}

// Uploads a 4-D tensor using the depthwise-block converter layout.
void CLImage::InitDWImage(const cl::Context& context) {
  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
  CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
  image_converter_.reset(new CLImageConverterDWBlock());
  InitCLImage(context, image_converter_.get());
}
// Allocates an uninitialized device image sized for tensor `dim` (normal
// layout). No host data is uploaded; SetTensorData must NOT have been called.
void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
  CHECK(tensor_data_ == nullptr)
      << " Empty image tensor data shouldn't have value";
  tensor_dims_ = dim;
  image_converter_.reset(new CLImageConverterNormal());
  VLOG(3) << " to get image dims ";
  image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
  VLOG(3) << " end get image dims " << image_dims_;
  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image ";
}

// Allocates an uninitialized device image from explicit image (not tensor)
// dimensions. NOTE(review): tensor_dims_ and image_converter_ are left
// unset here — confirm callers never rely on them afterwards.
void CLImage::InitEmptyWithImageDim(const cl::Context& context,
                                    const DDim& image_dims) {
  VLOG(3) << " to get image dims ";
  image_dims_ = image_dims;
  VLOG(3) << " end get image dims " << image_dims_;
  InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image";
}
// Converts the host tensor (set via SetTensorData) into the image layout
// defined by `converter` and uploads it as a new cl::Image2D. The host-side
// tensor copy is released afterwards.
void CLImage::InitCLImage(const cl::Context& context,
                          CLImageConverterBase* converter) {
  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
  VLOG(3) << " begin init cl image ";
  image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
// RGBA image: 4 half components per texel. unique_ptr replaces the original
// raw new[]/delete[], which leaked if NCHWToImage or image creation threw.
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
  std::unique_ptr<half_t[]> image_data(new half_t[image_dims_.product() * 4]);
#else
  std::unique_ptr<half_t[]> image_data(
      new half_t[image_dims_.production() * 4]);
#endif
  VLOG(3) << " convert to image ";
  converter->NCHWToImage(tensor_data_.get(), image_data.get(), tensor_dims_);
  VLOG(3) << " end convert to image ";
  InitCLImage(context, image_dims_[0], image_dims_[1], image_data.get());
  // The device now holds the data; drop the host-side tensor copy.
  tensor_data_ = nullptr;
  cl_event_ = CLEngine::Global()->CreateEvent(context);
  initialized_ = true;
  VLOG(3) << " end init cl image ";
}
// Creates the backing cl::Image2D (RGBA, half float). When `data` is
// non-null it is copied to the device via CL_MEM_COPY_HOST_PTR.
void CLImage::InitCLImage(const cl::Context& context, int width, int height,
                          void* data) {
  cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
  cl_int err;
  cl_image_.reset(new cl::Image2D(
      context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
      img_format, width, height, 0, data, &err));
  CL_CHECK_ERRORS(err);
  CHECK(err == CL_SUCCESS) << " Create image 2d error.";
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
namespace paddle {
namespace lite {
// Device-side tensor stored as a 2-D RGBA half-float OpenCL image.
// Typical flow: SetTensorData() -> one of the Init*Image() methods (which
// upload the data and free the host copy) -> use cl_image() in kernels.
class CLImage {
  // For debug
  friend std::ostream& operator<<(std::ostream& os, const CLImage& image);

 public:
  CLImage() = default;

  /*
   * Will not hold input tensor data, memcpy in this method.
   * */
  void SetTensorData(float* tensor_data, const DDim& dim);

  bool IsInit() { return initialized_; }

  /*
   * Need call SetTensorData first.
   * Folder when one dim or two dim.
   * */
  void InitCLImage(const cl::Context& context);
  void InitNormalCLImage(const cl::Context& context);
  void InitNImage(const cl::Context& context);
  void InitDWImage(const cl::Context& context);
  void InitEmptyImage(const cl::Context& context, const DDim& dim);
  void InitEmptyWithImageDim(const cl::Context& context,
                             const DDim& image_dims);

  cl::Image2D& cl_image() const { return *cl_image_; }

  const DDim& image_dims() const { return image_dims_; }

  inline size_t ImageWidth() const { return image_dims_[0]; }

  inline size_t ImageHeight() const { return image_dims_[1]; }

  const DDim& tensor_dims() const { return tensor_dims_; }

  /*
   * Resize original tensor dim.
   * */
  inline CLImage& Resize(const DDim& dims) {
    tensor_dims_ = dims;
    return *this;
  }

  // Host-side tensor data; only valid before Init*Image() ran.
  template <typename T>
  T* data() const {
    CHECK(!initialized_) << "CL image has initialized, tensor data has been "
                            "deleted, can't use tensor data!";
    // FIX: cast the owned pointer, not the unique_ptr object itself — the
    // original reinterpret_cast of the smart pointer was ill-formed once
    // instantiated.
    return reinterpret_cast<T*>(tensor_data_.get());
  }

  /*
   * Numel of tensor dim
   * */
  inline int64_t numel() const {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
    return tensor_dims_.product();
#else
    return tensor_dims_.production();
#endif
  }

  /*
   * Original tensor dim
   * */
  cl::UserEvent& cl_event() const { return *cl_event_; }

  CLImageConverterBase* image_converter() const {
    return image_converter_.get();
  }

 private:
  void InitCLImage(const cl::Context& context, CLImageConverterBase* converter);
  void InitCLImage(const cl::Context& context, int width, int height,
                   void* data);

  bool initialized_ = false;
  std::unique_ptr<cl::Image2D> cl_image_{nullptr};
  std::unique_ptr<cl::UserEvent> cl_event_{nullptr};
  DDim tensor_dims_;
  DDim image_dims_;
  // FIX: array form, so destruction calls delete[] on the `new float[]`
  // buffer allocated in SetTensorData. The original unique_ptr<float>
  // invoked the scalar deleter on an array — undefined behavior.
  std::unique_ptr<float[]> tensor_data_{nullptr};
  std::unique_ptr<CLImageConverterBase> image_converter_{nullptr};
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
#include <glog/logging.h>
#include <vector>
namespace paddle {
namespace lite {
// One RGBA texel packs 4 consecutive channels, so an NCHW tensor (padded on
// the left to rank 4) maps to an image of (W * ceil(C/4)) x (H * N).
DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
  size_t padded[] = {1, 1, 1, 1};
  const size_t rank = tensor_dim.size();
  for (size_t i = 0; i < rank; ++i) {
    padded[4 - rank + i] = tensor_dim[i];
  }
  const size_t n = padded[0];
  const size_t c = padded[1];
  const size_t h = padded[2];
  const size_t w = padded[3];
  const size_t image_width = w * ((c + 3) / 4);
  const size_t image_height = h * n;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Packs an NCHW float tensor into the RGBA half image layout produced by
// InitImageDimInfoWith: 4 consecutive channels share one texel; channel
// slots beyond C are zero-filled.
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
                                          const DDim &tensor_dim) {
  // Right-align the (up to 4) tensor dims into N, C, H, W.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
  VLOG(3) << " tensor dim: " << tensor_dim;
  VLOG(3) << " image dim: " << in_image_dim;
  size_t width = in_image_dim[0];
  size_t w_block = width / W;  // number of 4-channel blocks per image row
  float *p = nchw;
  size_t i0 = 0;  // texel-row base index for batch n
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < w_block * 4; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        // i2 indexes half components: 4 per texel, (c % 4) picks the channel.
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          if (c < C) {
            // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
            // (c % 4);
            image[i2] = Float2Half(*p);
            i2 += 4;
            p++;
          } else {
            // Zero-pad channel slots that round C up to a multiple of 4.
            image[i2] = 0.0;
            i2 += 4;
          }
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Inverse of NCHWToImage: unpacks the 4-channels-per-pixel half image back
// into a contiguous host NCHW float buffer. Zero-pad lanes (c >= C) are
// simply skipped because the channel loop stops at C.
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  // Right-align tensor_dim into {N, C, H, W}, padding leading dims with 1.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  size_t width = image_dim[0];
  float *p = tensor;
  // i0 / i1 / i2 mirror the cursors in NCHWToImage: batch offset, row
  // offset, and half-element offset (stride 4 keeps the c % 4 lane).
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          *p = Half2Float(image[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Image shape for the "folder" layout. 1-D/2-D tensors get a compact
// (ceil(cols / 4), rows) image; 3-D/4-D tensors fall back to the default
// layout formula. Also records per-block geometry in the members used by
// WidthOfOneBlock() / HeightOfOneBlock() / GetCBlock().
DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
  if (tensor_dim.size() <= 2) {
    // Treat a 1-D tensor as a single row: tdim = {1, len}.
    size_t tdim[2] = {1, 1};
    if (tensor_dim.size() == 1) {
      tdim[1] = tensor_dim[0];
    } else {
      tdim[0] = tensor_dim[0];
      tdim[1] = tensor_dim[1];
    }
    // Four column entries per pixel.
    size_t width = (tdim[1] + 3) / 4;
    size_t height = tdim[0];
    width_of_one_block_ = width;
    height_of_one_block_ = height;
    c_block_ = 1;
    return DDim(
        std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
                                       static_cast<DDim::value_type>(height)}));
  } else {
    // >2-D: same formula as CLImageConverterDefault.
    size_t new_dims[] = {1, 1, 1, 1};
    for (size_t j = 0; j < tensor_dim.size(); ++j) {
      new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
    }
    size_t N, C, H, W;
    N = new_dims[0];
    C = new_dims[1];
    H = new_dims[2];
    W = new_dims[3];
    size_t width = W * ((C + 3) / 4);
    size_t height = H * N;
    width_of_one_block_ = W;
    height_of_one_block_ = H;
    // Number of 4-channel blocks along the width (== ceil(C / 4)).
    c_block_ = width / W;
    return DDim(
        std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
                                       static_cast<DDim::value_type>(height)}));
  }
}
// Packs a tensor into the folder layout: >2-D tensors delegate to the
// default converter; 1-D/2-D tensors pack four columns per pixel into the
// compact image computed by InitImageDimInfoWith.
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
                                         const DDim &tensor_dim) {
  CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
      << " Tensor dim is not support!";
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.NCHWToImage(tensor, image, tensor_dim);
  } else {
    // Treat a 1-D tensor as a single row: tdim = {1, len}.
    size_t tdim[2] = {1, 1};
    if (tensor_dim.size() == 1) {
      tdim[1] = tensor_dim[0];
    } else {
      tdim[0] = tensor_dim[0];
      tdim[1] = tensor_dim[1];
    }
    DDim image_dim = InitImageDimInfoWith(tensor_dim);
    size_t width = image_dim[0];
    for (size_t h = 0; h < tdim[0]; h++) {
      for (size_t w = 0; w < tdim[1]; w++) {
        // Pixel (w / 4, h), lane w % 4; 4 halves per pixel.
        image[(h * width + w / 4) * 4 + (w % 4)] =
            Float2Half(tensor[h * tdim[1] + w]);
      }
    }
  }
}
// Inverse of CLImageConverterFolder::NCHWToImage: >2-D tensors delegate to
// the default converter; 1-D/2-D tensors read four columns per pixel back
// into a row-major float buffer.
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
  } else {
    size_t width = image_dim[0];
    // A 1-D tensor is one row of W elements; a 2-D tensor is H rows.
    size_t H = 1, W = 1;
    if (tensor_dim.size() == 2) {
      H = tensor_dim[0];
      W = tensor_dim[1];
    } else if (tensor_dim.size() == 1) {
      W = tensor_dim[0];
    }
    float *p = tensor;
    for (size_t h = 0; h < H; h++) {
      for (size_t w = 0; w < W; w++) {
        // Pixel (w / 4, h), lane w % 4; 4 halves per pixel.
        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
      }
    }
  }
}
// Image shape for the N/W-blocked layout: four batch entries are packed
// into each pixel, so the image is (W * ceil(N / 4)) wide and (C * H) tall.
DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  const size_t batch_n = tensor_dim[0];
  const size_t chan_c = tensor_dim[1];
  const size_t dim_h = tensor_dim[2];
  const size_t dim_w = tensor_dim[3];
  const size_t image_width = dim_w * ((batch_n + 3) / 4);
  const size_t image_height = chan_c * dim_h;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Packs an NCHW float tensor into the N/W-blocked image layout: each pixel
// holds four batch entries (lane n % 4); batch slots beyond N are
// zero-filled up to the next multiple of 4.
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  auto image_dim = InitImageDimInfoWith(tensor_dim);
  float *p = tensor;
  size_t N = tensor_dim[0];
  size_t C = tensor_dim[1];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  size_t height = image_dim[1];
  // Number of 4-batch blocks along the width (== ceil(N / 4)).
  size_t block = image_dim[0] / tensor_dim[3];
  for (size_t n = 0; n < block * 4; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; ++h) {
        for (size_t w = 0; w < W; ++w) {
          size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                         w * 4 + n % 4;
          // Bug fix: validate BEFORE touching the buffer. The original code
          // performed the write first and only logged afterwards, so an
          // out-of-range index had already corrupted memory (undefined
          // behavior) by the time it was reported.
          if (index >= (width * height * 4)) {
            LOG(INFO) << " index out of range ";
            if (n < N) {
              p++;  // keep the source cursor consistent even when skipping
            }
            continue;
          }
          if (n < N) {
            image[index] = Float2Half(*p);
            p++;
          } else {
            // Zero-pad batch lanes beyond N.
            image[index] = 0.0;
          }
        }
      }
    }
  }
  VLOG(3) << " init done";
}
// Inverse of CLImageConverterNWBlock::NCHWToImage: unpacks the 4-batches-
// per-pixel half image back into a contiguous NCHW float buffer. Zero-pad
// lanes (n >= N) are skipped because the batch loop stops at N.
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  float *p = tensor;
  size_t N = tensor_dim[0];
  size_t C = tensor_dim[1];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  size_t height = image_dim[1];
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      for (size_t h = 0; h < H; ++h) {
        for (size_t w = 0; w < W; ++w) {
          size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                         w * 4 + n % 4;
          // Bug fix: validate BEFORE reading. The original code read
          // image[index] first and only logged afterwards, so an
          // out-of-range index was already an out-of-bounds read
          // (undefined behavior) by the time it was reported.
          if (index >= (width * height * 4)) {
            LOG(INFO) << " index out of range ";
            p++;  // leave the slot untouched but keep the cursor aligned
            continue;
          }
          *p = Half2Float(image[index]);
          p++;
        }
      }
    }
  }
  VLOG(3) << " init done";
}
// Image shape for the depthwise-blocked layout; same formula as the
// NW-block converter: width = W * ceil(tensor_dim[0] / 4), height =
// tensor_dim[1] * H. NOTE(review): NCHWToImage below reads dim 0 and dim 1
// with N and C swapped relative to NCHW, so the axis names here do not
// match their NCHW meaning — presumably deliberate for depthwise filter
// weights; confirm against the kernels that consume this layout.
DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  size_t N, C, H, W;
  N = tensor_dim[0];
  C = tensor_dim[1];
  H = tensor_dim[2];
  W = tensor_dim[3];
  // Four entries of dim 0 are packed into each pixel along the width.
  size_t width = W * ((N + 3) / 4);
  size_t height = C * H;
  return DDim(
      std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
                                     static_cast<DDim::value_type>(height)}));
}
// Packs a tensor into the depthwise-blocked layout. The traversal mirrors
// CLImageConverterDefault::NCHWToImage except that N and C are read SWAPPED
// from new_dims (N = new_dims[1], C = new_dims[0]) — presumably to match
// the depthwise weight layout; confirm against consuming kernels.
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
                                          const DDim &tensor_dim) {
  // Right-align tensor_dim into a quadruple, padding leading dims with 1.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  // NOTE: deliberate swap vs. NCHW — see function comment.
  N = new_dims[1];
  C = new_dims[0];
  H = new_dims[2];
  W = new_dims[3];
  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
  VLOG(3) << " tensor dim: " << tensor_dim;
  VLOG(3) << " image dim: " << in_image_dim;
  size_t width = in_image_dim[0];
  // Number of 4-entry blocks along the image width.
  size_t w_block = width / W;
  float *p = tensor;
  // i0 / i1 / i2: batch offset, row offset, half-element offset
  // (stride 4 keeps the c % 4 lane) — same cursors as the default
  // converter's NCHWToImage.
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    // Iterate all padded lanes (w_block * 4 >= C) so pad lanes get zeros.
    for (size_t c = 0; c < w_block * 4; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          if (c < C) {
            // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
            // (c % 4);
            image[i2] = Float2Half(*p);
            i2 += 4;
            p++;
          } else {
            // Zero-pad lanes beyond C.
            image[i2] = 0.0;
            i2 += 4;
          }
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Inverse of CLImageConverterDWBlock::NCHWToImage. N and C are again read
// SWAPPED from tensor_dim (N = tensor_dim[1], C = tensor_dim[0]),
// consistent with the packing side.
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  float *p = tensor;
  // NOTE: deliberate swap vs. NCHW — see function comment.
  size_t N = tensor_dim[1];
  size_t C = tensor_dim[0];
  size_t H = tensor_dim[2];
  size_t W = tensor_dim[3];
  size_t width = image_dim[0];
  // i0 / i1 / i2: batch offset, row offset, half-element offset
  // (stride 4 keeps the c % 4 lane).
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          *p = Half2Float(image[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}
// Image shape for the "normal" layout — identical formula to the default
// converter ((W * ceil(C/4)) x (H * N)), but additionally records the
// per-block geometry exposed via WidthOfOneBlock() / HeightOfOneBlock() /
// GetCBlock().
DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
  // Right-align the tensor shape into an NCHW quadruple, padding the
  // leading dimensions with 1.
  size_t dims[4] = {1, 1, 1, 1};
  const size_t offset = 4 - tensor_dim.size();
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    dims[offset + j] = tensor_dim[j];
  }
  const size_t batch_n = dims[0];
  const size_t chan_c = dims[1];
  const size_t dim_h = dims[2];
  const size_t dim_w = dims[3];
  const size_t image_width = dim_w * ((chan_c + 3) / 4);
  const size_t image_height = dim_h * batch_n;
  width_of_one_block_ = dim_w;
  height_of_one_block_ = dim_h;
  // Number of 4-channel blocks along the width (== ceil(C / 4)).
  c_block_ = image_width / dim_w;
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Validates the rank (1..4) and delegates the packing to the default
// converter — the "normal" layout uses the identical element order.
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
                                         const DDim &tensor_dim) {
  CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
      << " Tensor dim is not support!";
  CLImageConverterDefault default_converter;
  default_converter.NCHWToImage(tensor, image, tensor_dim);
}
// Delegates unpacking to the default converter (same element order).
// Note: unlike NCHWToImage above, no rank check is performed here.
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  CLImageConverterDefault default_converter;
  default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
// Image shape for Winograd-transformed weights: four dim-1 entries are
// packed per pixel, and each dim-0 entry occupies 16 rows
// (= (wino_blk_size + 2) * (wino_blk_size + 2) with wino_blk_size == 2).
DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
  const size_t dim_n = tensor_dim[0];
  const size_t dim_c = tensor_dim[1];
  const size_t image_width = (dim_c + 3) / 4;
  const size_t image_height = dim_n * 16;  // N * (wino_blk_size + 2)^2
  return DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(image_width),
       static_cast<DDim::value_type>(image_height)}));
}
// Intentionally a no-op: the Winograd weight-transform packing is not
// implemented yet (only the image-dimension calculation above is).
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
                                                  const DDim &tensor_dim) {}
// Intentionally a no-op: the Winograd weight-transform unpacking is not
// implemented yet.
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
                                                  const DDim &image_dim,
                                                  const DDim &tensor_dim) {}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
// Abstract strategy for converting between a host NCHW float tensor and a
// half-precision OpenCL image2d layout. Concrete subclasses define the
// pixel packing scheme (default, folder, NW-block, DW-block, Winograd).
class CLImageConverterBase {
 public:
  virtual ~CLImageConverterBase() {}

  // Packs the host buffer `nchw` (shape `tensor_dim`) into `image`.
  virtual void NCHWToImage(float *nchw, half_t *image,
                           const DDim &tensor_dim) = 0;
  // Unpacks `image` (2-D shape `image_dim`) back into the `nchw` buffer.
  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
                           const DDim &tensor_dim) = 0;
  // Returns the 2-D image (width, height) required to hold `tensor_dim`.
  virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Per-element batch-norm on a channel-packed image: out = in * scale + bias,
// where scale/bias are looked up per channel block (column out_c).
__kernel void batchnorm(__private const int out_width,
                        __read_only image2d_t input,
                        __read_only image2d_t new_scale_image,
                        __read_only image2d_t new_bias_image,
                        __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  // Fix: all reads below use integer (int2) coordinates, and the OpenCL
  // spec requires a sampler with CLK_NORMALIZED_COORDS_FALSE for integer
  // coordinates — CLK_NORMALIZED_COORDS_TRUE here was undefined behavior.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  float4 new_scale = read_imagef(new_scale_image, sampler, (int2)(out_c, 0));
  float4 new_bias = read_imagef(new_bias_image, sampler, (int2)(out_c, 0));
  // Column of this work-item inside the packed image: out_c * out_width + out_w.
  int pos_x = mad24(out_c, out_width, out_w);
  float4 in = read_imagef(input, sampler, (int2)(pos_x, out_nh));
  float4 out = mad(in, new_scale, new_bias);
  write_imagef(output, (int2)(pos_x, out_nh), out);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Applies the activation selected at compile time (RELU or PRELU macro);
// with no macro defined it is the identity.
inline half4 activation(half4 in
#ifdef PRELU
                        ,
                        half4 prelu_alpha
#endif
                        ) {
  // Fix: default to the input so the result is well-defined even when
  // neither PRELU nor RELU is enabled — the original returned an
  // uninitialized value in that configuration.
  half4 output = in;
#ifdef PRELU
  // Negative lanes scaled by alpha, non-negative lanes passed through.
  output = select(prelu_alpha * in, in, in >= (half4)0.0);
#endif
#ifdef RELU
  output = fmax(in, (half4)(0.0f));
#endif
  return output;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Element-wise addition of two images: outputImage = input + bias.
__kernel void elementwise_add(__read_only image2d_t input,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
  // Fix: the reads below use integer (int2) coordinates, and the OpenCL
  // spec requires a sampler with CLK_NORMALIZED_COORDS_FALSE for integer
  // coordinates — CLK_NORMALIZED_COORDS_TRUE here was undefined behavior.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  float4 in = read_imagef(input, sampler, coords);
  float4 b = read_imagef(bias, sampler, coords);
  float4 output = in + b;
  write_imagef(outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
namespace paddle {
namespace lite {
// Smoke test: bring up the global CLEngine singleton, touch the basic
// platform/device/queue handles, then compile one kernel file from disk.
TEST(cl_test, engine_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
  // NOTE(review): hard-coded developer path — this test only works on a
  // machine with this exact checkout layout; consider an env var or flag.
  engine->set_cl_path("/work/Develop/Paddle/paddle/fluid/lite/opencl");
  // Exercise the lazy accessors (each CHECK-fails internally on error).
  engine->platform();
  engine->device();
  engine->command_queue();
  auto& context = engine->context();
  auto program = engine->CreateProgram(
      context, engine->cl_path() + "/cl_kernel/" + "elementwise_add_kernel.cl");
  auto event = engine->CreateEvent(context);
  CHECK(engine->BuildProgram(program.get()));
}
// Smoke test for CLContext::GetKernel: loads two kernels and requests one
// of them twice (the repeat exercises whatever caching/lookup GetKernel
// performs for an already-loaded kernel).
TEST(cl_test, context_test) {
  auto* engine = CLEngine::Global();
  CHECK(engine->IsInitSuccess());
  // NOTE(review): hard-coded developer path — see engine_test above.
  engine->set_cl_path("/work/Develop/Paddle/paddle/fluid/lite/opencl");
  CLContext context;
  context.GetKernel("batchnorm", "batchnorm_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
  context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
// Maps an OpenCL status code to its symbolic constant name (e.g.
// CL_INVALID_VALUE -> "CL_INVALID_VALUE") for diagnostics. Unrecognized
// codes yield "UNKNOWN ERROR CODE".
const char *opencl_error_to_str(cl_int error) {
// Expands to a `case` that stringizes the constant's own name, keeping the
// table below free of copy/paste string mismatches.
#define CASE_CL_CONSTANT(NAME) \
  case NAME:                   \
    return #NAME;
  // Suppose that no combinations are possible.
  switch (error) {
    CASE_CL_CONSTANT(CL_SUCCESS)
    CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
    CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
    CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
    CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
    CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
    CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
    CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
    CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
    CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
    CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
    CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
    CASE_CL_CONSTANT(CL_MAP_FAILURE)
    CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
    CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
    CASE_CL_CONSTANT(CL_INVALID_VALUE)
    CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
    CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
    CASE_CL_CONSTANT(CL_INVALID_DEVICE)
    CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
    CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
    CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
    CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
    CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
    CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
    CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
    CASE_CL_CONSTANT(CL_INVALID_BINARY)
    CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
    CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
    CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
    CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
    CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
    CASE_CL_CONSTANT(CL_INVALID_KERNEL)
    CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
    CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
    CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
    CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
    CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
    CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
    CASE_CL_CONSTANT(CL_INVALID_EVENT)
    CASE_CL_CONSTANT(CL_INVALID_OPERATION)
    CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
    CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
    CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
    CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
    default:
      return "UNKNOWN ERROR CODE";
  }
#undef CASE_CL_CONSTANT
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <CL/cl.h>
namespace paddle {
namespace lite {
const char* opencl_error_to_str(cl_int error);
// Prints a readable OpenCL error (with file and line) when ERR is not
// CL_SUCCESS. Fix: wrapped in do { } while (0) so the macro expands to a
// single statement — the original bare `if` would capture a following
// `else` in caller code (dangling-else hazard).
// NOTE(review): despite the "Exiting." text this only prints and does NOT
// terminate the program — confirm whether an abort/exit was intended.
#define CL_CHECK_ERRORS(ERR)                                           \
  do {                                                                 \
    if (ERR != CL_SUCCESS) {                                           \
      printf(                                                          \
          "OpenCL error with code %s happened in file %s at line %d. " \
          "Exiting.\n",                                                \
          opencl_error_to_str(ERR), __FILE__, __LINE__);               \
    }                                                                  \
  } while (0)
} // namespace lite
} // namespace paddle
...@@ -25,6 +25,12 @@ function cmake_x86 { ...@@ -25,6 +25,12 @@ function cmake_x86 {
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
} }
function cmake_cl {
prepare_for_codegen
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON -DLITE_WITH_CL=ON ${common_flags}
}
# This method is only called in CI. # This method is only called in CI.
function cmake_x86_for_CI { function cmake_x86_for_CI {
prepare_for_codegen # fake an empty __generated_code__.cc to pass cmake. prepare_for_codegen # fake an empty __generated_code__.cc to pass cmake.
...@@ -422,6 +428,10 @@ function main { ...@@ -422,6 +428,10 @@ function main {
cmake_x86 cmake_x86
shift shift
;; ;;
cmake_cl)
cmake_cl
shift
;;
cmake_cuda) cmake_cuda)
cmake_cuda cmake_cuda
shift shift
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册