Commit a95dcea5 authored by hong19860320

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into hongming/fix_pooling

@@ -76,8 +76,8 @@ build:mobile_android:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc/third_party
- build.lite.android.armv7.gcc/third_party
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
@@ -96,8 +96,9 @@ build:mobile_armlinux:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.armlinux.armv8.gcc
- build.lite.armlinux.armv7.gcc
- build.lite.armlinux.armv7hf.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script:
@@ -107,24 +108,13 @@ build:mobile_armlinux:
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv1:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
@@ -135,8 +125,7 @@ build:mobile_model_mobilenetv1:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv1
@@ -145,12 +134,7 @@ build:mobile_model_mobilenetv2:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
@@ -161,8 +145,7 @@ build:mobile_model_mobilenetv2:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv2
@@ -171,12 +154,7 @@ build:mobile_model_resnet50:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
@@ -187,8 +165,7 @@ build:mobile_model_resnet50:
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- build.lite.android.armv8.gcc
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_resnet50
......
@@ -150,6 +150,7 @@ option(WITH_LITE "Enable lite framework" OFF)
option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
@@ -181,6 +182,12 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(external/eigen) # download eigen3
include(ccache) # set ccache for compilation
# for opencl
if (LITE_WITH_OPENCL)
include(external/opencl-headers)
include(external/opencl-clhpp)
endif()
include(generic) # simplify cmake module
include(configure) # add paddle env configuration
......
@@ -176,6 +176,10 @@ if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
endif()
if (LITE_WITH_OPENCL)
add_definitions("-DLITE_WITH_OPENCL")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp)
SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)
INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR})
ExternalProject_Add(
opencl_clhpp
GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
GIT_TAG "v2.0.10"
PREFIX "${OPENCL_CLHPP_SRCS_DIR}"
CMAKE_ARGS -DBUILD_DOCS=OFF
-DBUILD_EXAMPLES=OFF
-DBUILD_TESTS=OFF
-DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(opencl_clhpp opencl_headers)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers)
SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE)
INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR})
ExternalProject_Add(
opencl_headers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git"
GIT_TAG "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
PREFIX ${OPENCL_HEADERS_SRCS_DIR}
DOWNLOAD_NAME "OpenCL-Headers"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
@@ -24,8 +24,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
@@ -143,6 +142,8 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
# collect the targets that need to be compiled for lite
add_dependencies(lite_compile_deps ${TARGET})
endfunction()
# Add a unit-test name to the file for later offline manual testing.
@@ -181,6 +182,7 @@ add_subdirectory(x86)
add_subdirectory(arm)
add_subdirectory(host)
add_subdirectory(cuda)
add_subdirectory(opencl)
add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
......
@@ -12,7 +12,6 @@ lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
CUDA_DEPS kernels_cuda
X86_DEPS ${x86_kernels}
)
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)
set(light_api_deps
scope_lite target_wrapper_host model_parser_lite program_lite)
@@ -21,27 +20,34 @@ if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
)
message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite)
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS scope_lite target_wrapper_host model_parser_lite
${light_api_deps} ${ops_lite} ${host_kernels} program_lite
CUDA_DEPS target_wrapper_cuda
X86_DEPS ${x86_kernels} operator
ARM_DEPS ${arm_kernels}
)
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS cxx_api_lite mir_passes
DEPS cxx_api_lite mir_passes lite_api_test_helper
${ops_lite} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
@@ -68,25 +74,18 @@ endif()
# These tests need CLI arguments and are not supported in ARM CI.
# TODO(Superjomn) support them later.
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite mir_passes
X86_DEPS ${x86_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
X86_DEPS ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
lite_cc_test(test_light_api_lite SRCS light_api_test.cc
DEPS light_api_lite program_lite mir_passes
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite}
X86_DEPS ${x86_kernels} operator
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
DEPS
cxx_api_lite
model_parser_lite
target_wrapper_host
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
#X86_DEPS operator
#DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes
#ARM_DEPS ${arm_kernels})
@@ -39,23 +39,41 @@ void SetConstInput(lite::Tensor* x) {
}
}
bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
bool CompareTensors(const std::string& name, const Predictor& cxx_api,
const LightPredictor& light_api) {
const auto* a = cxx_api.GetTensor(name);
const auto* b = light_api.GetTensor(name);
return TensorCompareWith(*a, *b);
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi_LightApi, optim_model) {
lite::Predictor cxx_api;
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM
});
// On ARM devices the preferred X86 target does not work, but the predictor
// can still select ARM kernels.
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
valid_places);
cxx_api.SaveModel(FLAGS_optimized_model);
}
TEST(CXXApi_LightApi, save_and_load_model) {
lite::ExecutorLite cxx_api;
lite::LightPredictor light_api;
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
// CXXAPi
{
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM
});
// On ARM devices the preferred X86 target does not work, but the predictor
// can still select ARM kernels.
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
valid_places);
auto* x = cxx_api.GetInput(0);
@@ -69,8 +87,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
// LightApi
{
light_api.Build(FLAGS_optimized_model);
auto* x = light_api.GetInput(0);
SetConstInput(x);
@@ -89,7 +105,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
}
}
#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
} // namespace lite
} // namespace paddle
@@ -17,19 +17,66 @@
#include <string>
#include <utility>
#include <vector>
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#include "paddle/fluid/platform/port.h"
#endif
#include "paddle/fluid/lite/utils/io.h"
namespace paddle {
namespace lite {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void ExecutorLite::SaveModel(const std::string &dir) {
MkDirRecursively(dir.c_str());
void Predictor::SaveModel(const std::string &dir) {
#ifndef LITE_WITH_ARM
MkDirRecur(dir);
#else
#endif
program_->PersistModel(dir, program_desc_);
LOG(INFO) << "Save model to " << dir;
}
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
const lite::Tensor *Predictor::GetOutput(size_t offset) {
auto *_fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
void Predictor::Build(const std::string &model_path, const Place &prefer_place,
const std::vector<Place> &valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
const framework::proto::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
void Predictor::Build(const framework::proto::ProgramDesc &desc,
const Place &prefer_place,
const std::vector<Place> &valid_places) {
program_desc_ = desc;
Program program(desc, scope_, valid_places);
optimizer_.KernelPickPreferPlace(prefer_place);
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
optimizer_.Run(std::move(program), valid_places, factor);
program_ = optimizer_.GenRuntimeProgram();
}
const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
#endif
} // namespace lite
} // namespace paddle
@@ -26,68 +26,39 @@
namespace paddle {
namespace lite {
struct Config {};
class ExecutorLite {
/*
* Predictor for inference: it takes a model, optimizes it, and executes it.
*/
class Predictor {
public:
ExecutorLite() { scope_ = std::make_shared<Scope>(); }
explicit ExecutorLite(const std::shared_ptr<lite::Scope>& root_scope) {
scope_ = root_scope;
}
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
// Build from a model, with places set for hardware config.
void Build(const std::string& model_path, const Place& prefer_place,
const std::vector<Place>& valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
const std::vector<Place>& valid_places);
void Build(const framework::proto::ProgramDesc& desc,
const Place& prefer_place,
const std::vector<Place>& valid_places) {
program_desc_ = desc;
Program program(desc, scope_, valid_places);
optimizer_.KernelPickPreferPlace(prefer_place);
core::KernelPickFactor factor;
factor.ConsiderTarget();
optimizer_.Run(std::move(program), valid_places, factor);
program_ = optimizer_.GenRuntimeProgram();
}
// This method is disabled on mobile, as it would require unnecessary dependencies.
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void SaveModel(const std::string& dir);
#endif
const Place& prefer_place, const std::vector<Place>& valid_places);
// Get offset-th col of feed.
lite::Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Run the predictor for a single batch of data.
void Run() { program_->Run(); }
const lite::Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of feed inputs.
lite::Tensor* GetInput(size_t offset);
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
// Get offset-th col of fetch results.
const lite::Tensor* GetOutput(size_t offset);
void Run() { program_->Run(); }
// Return the program desc for debug.
const framework::proto::ProgramDesc& program_desc() const;
const lite::Tensor* GetTensor(const std::string& name) const;
const framework::proto::ProgramDesc& program_desc() const {
return program_desc_;
}
// This method is disabled on mobile, as it would require unnecessary dependencies.
void SaveModel(const std::string& dir);
private:
Optimizer optimizer_;
@@ -96,6 +67,7 @@ class ExecutorLite {
std::unique_ptr<RuntimeProgram> program_;
};
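// A minimal usage sketch (illustrative only; it mirrors the tests in this
// patch, and FLAGS_model_dir stands for whatever model path is used):
//
//   lite::Predictor predictor;
//   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
//                                    Place{TARGET(kX86), PRECISION(kFloat)}});
//   predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
//                   valid_places);
//   lite::Tensor* x = predictor.GetInput(0);
//   // ... resize x and fill its data ...
//   predictor.Run();
//   const lite::Tensor* out = predictor.GetOutput(0);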
#ifdef LITE_WITH_X86
/*
* An executor for training.
*
@@ -119,13 +91,13 @@ class CXXTrainer {
: scope_(root_scope),
preferred_place_(preferred_place),
valid_places_(valid_places),
main_program_executor_(ExecutorLite(scope_)) {}
main_program_executor_(Predictor(scope_)) {}
// Build the RuntimeProgram cache for the main program. The cache will run
// multiple times across the epochs.
// NOTE Currently only the 0-th block is supported.
ExecutorLite& BuildMainProgramExecutor(
const framework::proto::ProgramDesc& desc, int block_id = 0) {
Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
main_program_executor_.Build(desc, preferred_place_, valid_places_);
return main_program_executor_;
}
@@ -133,7 +105,7 @@ class CXXTrainer {
// Run the startup program. It just executes once, no cache needed.
void RunStartupProgram(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
ExecutorLite exe(scope_);
Predictor exe(scope_);
exe.Build(desc, preferred_place_, valid_places_);
exe.Run();
}
@@ -145,8 +117,9 @@ class CXXTrainer {
std::vector<Place> valid_places_;
// The training program.
ExecutorLite main_program_executor_;
Predictor main_program_executor_;
};
#endif
} // namespace lite
} // namespace paddle
@@ -34,7 +34,7 @@ void Run(const char* model_dir, int repeat, int thread_num) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
@@ -42,7 +42,7 @@ TEST(CXXApi, test) {
}
TEST(CXXApi, save_model) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
......
@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(InceptionV4, test) {
data[i] = 1;
}
predictor.Run();
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00078033, 0.00083865, 0.00060029, 0.00057083,
......
@@ -13,3 +13,67 @@
// limitations under the License.
#include "paddle/fluid/lite/api/light_api.h"
namespace paddle {
namespace lite {
void LightPredictor::Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
Tensor* LightPredictor::GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
const Tensor* LightPredictor::GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
void LightPredictor::BuildRuntimeProgram(
const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create the Instructions.
// Create the kernels for the target places, and pick out the specific
// kernel that matches the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// pick out the kernel whose alias matches
auto it = std::find_if(
kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) { return it->alias() == alias; });
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
LightPredictor::LightPredictor(const std::string& model_dir) {
scope_ = std::make_shared<Scope>();
Build(model_dir);
}
} // namespace lite
} // namespace paddle
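// Typical end-to-end use, as exercised by light_api_test.cc (a sketch; the
// model directory comes from the --optimized_model flag there):
//
//   lite::LightPredictor predictor(FLAGS_optimized_model);
//   lite::Tensor* input = predictor.GetInput(0);
//   input->Resize(DDim(std::vector<int64_t>({100, 100})));
//   // ... fill input->mutable_data<float>() ...
//   predictor.Run();
//   const lite::Tensor* out = predictor.GetOutput(0);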
@@ -32,36 +32,21 @@
namespace paddle {
namespace lite {
/*
* The lightweight predictor, mainly for mobile. It loads an optimized model
* and does not depend on MIR or perform any further optimization.
*/
class LightPredictor {
public:
LightPredictor() { scope_ = std::make_shared<Scope>(); }
void Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
explicit LightPredictor(const std::string& model_dir);
void Run() { program_->Run(); }
// Get offset-th col of feed.
Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Get offset-th col of feed inputs.
Tensor* GetInput(size_t offset);
const Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of fetch outputs.
const Tensor* GetOutput(size_t offset);
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
@@ -69,34 +54,8 @@ class LightPredictor {
}
private:
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
void Build(const std::string& model_dir);
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog);
private:
std::shared_ptr<Scope> scope_;
......
@@ -25,8 +25,10 @@ namespace paddle {
namespace lite {
TEST(LightAPI, load) {
LightPredictor predictor;
predictor.Build(FLAGS_optimized_model);
if (FLAGS_optimized_model.empty()) {
FLAGS_optimized_model = "lite_naive_model";
}
LightPredictor predictor(FLAGS_optimized_model);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
......
@@ -22,7 +22,7 @@ namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel() {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV1, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(MobileNetV1, test) {
data[i] = 1;
}
predictor.Run();
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({1.91308980e-04, 5.92055148e-04, 1.12303176e-04,
......
@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV2, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(MobileNetV2, test) {
data[i] = 1;
}
predictor.Run();
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
......
@@ -16,21 +16,20 @@
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/test_helper.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// for eval
DEFINE_string(model_dir, "", "");
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet50, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
@@ -44,7 +43,20 @@ TEST(ResNet50, test) {
data[i] = 1;
}
predictor.Run();
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
std::vector<float> results({2.41399175e-04, 4.13724629e-04, 2.64324830e-04,
......
@@ -13,13 +13,24 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
#include <gflags/gflags.h>
#include <sys/time.h>  // gettimeofday
// for eval
DEFINE_string(model_dir, "", "model dir");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(threads, 1, "threads num");
namespace paddle {
namespace lite {
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
} // namespace lite
} // namespace paddle
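// The model tests below use these helpers as follows (this is exactly the
// warmup/repeat pattern added in this patch):
//
//   for (int i = 0; i < FLAGS_warmup; ++i) predictor.Run();
//   auto start = GetCurrentUS();
//   for (int i = 0; i < FLAGS_repeats; ++i) predictor.Run();
//   double avg_ms = (GetCurrentUS() - start) / FLAGS_repeats / 1000.0;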
@@ -35,6 +35,8 @@ cc_library(math_arm SRCS
split.cc
activation.cc
dropout.cc
gemm_prepacked_int8.cc
gemv_arm_int8.cc
DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
# TODO(TJ): fix me do not deps proto
......
@@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
cc_library(cpu_info_lite SRCS cpu_info.cc)
lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite
cpp_op_desc_lite ${tensor_lite})
cc_library(types_lite SRCS types.cc)
cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
......
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/kernel.h"
#include <cstdlib>
namespace paddle {
namespace lite {
@@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const {
return ss.str();
}
void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *op_type, std::string *alias,
Place *place) {
std::stringstream ss(kernel_type);
std::getline(ss, *op_type, '/');
std::getline(ss, *alias, '/');
std::string target, precision, layout;
std::getline(ss, target, '/');
std::getline(ss, precision, '/');
std::getline(ss, layout, '/');
place->target = static_cast<TargetType>(std::atoi(target.c_str()));
place->precision = static_cast<PrecisionType>(std::atoi(precision.c_str()));
place->layout = static_cast<DataLayoutType>(std::atoi(layout.c_str()));
}
std::string KernelBase::SerializeKernelType(const std::string &op_type,
const std::string &alias,
const Place &place) {
std::stringstream ss;
ss << op_type << "/";
ss << alias << "/";
// We serialize the place value not the string representation here for
// easier deserialization.
ss << static_cast<int>(place.target) << "/";
ss << static_cast<int>(place.precision) << "/";
ss << static_cast<int>(place.layout);
return ss.str();
}
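// Illustrative round trip (a sketch; the concrete integers are placeholders
// whose values depend on the TargetType/PrecisionType/DataLayoutType enums):
//
//   // SerializeKernelType("conv2d", "def", place) might yield "conv2d/def/4/1/1".
//   std::string op_type, alias;
//   Place place;
//   KernelBase::ParseKernelType("conv2d/def/4/1/1", &op_type, &alias, &place);
//   // Now op_type == "conv2d", alias == "def", and place carries the decoded
//   // target/precision/layout values.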
bool ParamTypeRegistry::KeyCmp::operator()(
const ParamTypeRegistry::key_t &a,
const ParamTypeRegistry::key_t &b) const {
......
@@ -118,33 +118,11 @@ class KernelBase {
static std::string SerializeKernelType(const std::string& op_type,
const std::string& alias,
const Place& place) {
std::stringstream ss;
ss << op_type << "/";
ss << alias << "/";
// We serialize the place value not the string representation here for
// easier deserialization.
ss << static_cast<int>(place.target) << "/";
ss << static_cast<int>(place.precision) << "/";
ss << static_cast<int>(place.layout);
return ss.str();
}
const Place& place);
static void ParseKernelType(const std::string& kernel_type,
std::string* op_type, std::string* alias,
Place* place) {
std::stringstream ss(kernel_type);
std::getline(ss, *op_type, '/');
std::getline(ss, *alias, '/');
std::string target, precision, layout;
std::getline(ss, target, '/');
std::getline(ss, precision, '/');
std::getline(ss, layout, '/');
place->target = static_cast<TargetType>(std::stoi(target));
place->precision = static_cast<PrecisionType>(std::stoi(precision));
place->layout = static_cast<DataLayoutType>(std::stoi(layout));
}
Place* place);
virtual ~KernelBase() = default;
void Torch() {}
......
@@ -28,7 +28,7 @@ namespace lite {
namespace mir {
TEST(fc_fuse_pass, fuse_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
@@ -69,7 +69,7 @@ TEST(fc_fuse_pass, fuse_test) {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(fc_fuse_pass, save_model_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
......
@@ -25,7 +25,7 @@ namespace fusion {
/* The model trained by fluid quantization is a simulation of real int8.
* The quantized ops (conv2d, mul, depthwise conv2d, etc.) have a fake_quant op
* in front and a fake_dequant op behind.
*
* When in int8 mode, the pattern like "fake_quant + quantized_op +
* fake_dequant"
......
@@ -41,7 +41,7 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) {
}
}
LOG(INFO) << "keys: " << key2nodes_.size();
VLOG(4) << "keys: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
......
@@ -80,6 +80,8 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8),
DATALAYOUT(kNCHW)> * //
>;
......
@@ -58,7 +58,6 @@ class Optimizer {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
"lite_elementwise_add_activation_fuse_pass", //
#endif
"lite_fc_fuse_pass", //
"static_kernel_pick_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
......
@@ -38,6 +38,7 @@ enum class PrecisionType : int {
kUnk = 0,
kFloat,
kInt8,
kInt32,
kAny, // any precision
NUM, // number of fields.
};
@@ -48,6 +49,19 @@ enum class DataLayoutType : int {
NUM, // number of fields.
};
static size_t PrecisionTypeLength(PrecisionType type) {
switch (type) {
case PrecisionType::kFloat:
return 4;
case PrecisionType::kInt8:
return 1;
case PrecisionType::kInt32:
return 4;
default:
return 4;
}
}
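// Example use (a sketch, not part of this patch): sizing a raw buffer for
// `numel` elements of a given precision.
//
//   size_t bytes = numel * PrecisionTypeLength(PrecisionType::kInt8);  // 1 byte/elem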
// Some helper macro to get a specific TargetType.
#define TARGET(item__) paddle::lite::TargetType::item__
// Some helper macro to get a specific PrecisionType.
@@ -87,7 +101,7 @@ static const std::string& TargetRepr(TargetType target) {
static const std::string& PrecisionRepr(PrecisionType precision) {
static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8",
"kAny"};
"kInt32", "kAny"};
auto x = static_cast<int>(precision);
CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
return precision2string[x];
......
@@ -51,5 +51,3 @@ set(arm_kernels
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
@@ -92,6 +92,9 @@ void ConvCompute::Run() {
// }
}
void ConvComputeInt8::PrepareForRun() {}
void ConvComputeInt8::Run() {}
} // namespace arm
} // namespace kernels
} // namespace lite
@@ -112,3 +115,23 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW,
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW,
paddle::lite::kernels::arm::ConvComputeInt8, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW,
paddle::lite::kernels::arm::ConvComputeInt8, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.Finalize();
@@ -41,6 +41,25 @@ class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
nullptr};
};
class ConvComputeInt8 : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
public:
using param_t = operators::ConvParam;
void PrepareForRun() override;
void Run() override;
~ConvComputeInt8() {
if (impl_ != nullptr) {
delete impl_;
}
}
private:
lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kInt8), param_t>* impl_{
nullptr};
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
@@ -12,14 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*
* ATTENTION: this header file can only be included in .cc files.
*/
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_X86
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
@@ -36,21 +55,6 @@ USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
......
@@ -44,10 +44,9 @@ set(x86_kernels
softmax_compute_x86
dropout_compute_x86
concat_compute_x86
conv_compute_x86
pool_compute_x86
batch_norm_compute_x86
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
if (NOT LITE_WITH_OPENCL)
return()
endif()
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_tool SRCS cl_tool.cc)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
cc_library(cl_half SRCS cl_half.cc)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
add_dependencies(cl_tool opencl_clhpp)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define CL_TARGET_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#include <CL/cl2.hpp>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
static void CopyImageData(const CLImage& cl_image, float* out) {
int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1];
half_t* image_data = new half_t[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
auto* converter = cl_image.image_converter();
converter->ImageToNCHW(image_data, out, cl_image.image_dims(),
cl_image.tensor_dims());
delete[] image_data;
}
bool InitOpenCLEngine(std::string cl_path) {
auto* engine = CLEngine::Global();
engine->set_cl_path(cl_path);
return engine->IsInitSuccess();
}
void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim) {
CLHelper helper(context);
helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper.KernelAt(0);
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(helper.OpenCLContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitNormalCLImage(helper.OpenCLContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_ERRORS(status);
status = kernel.setArg(1, *bias_image.cl_image());
CL_CHECK_ERRORS(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_ERRORS(status);
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_ERRORS(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(out_image, out);
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
namespace paddle {
namespace lite {
bool InitOpenCLEngine(std::string cl_path);
void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim);
} // namespace lite
} // namespace paddle
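// A hypothetical call sequence (`cl_kernel_path` is an assumption; it must
// point at the directory holding the .cl kernels, e.g.
// elementwise_add_kernel.cl):
//
//   CHECK(InitOpenCLEngine(cl_kernel_path));
//   CLContext context;
//   // in/bias/out are float buffers with shapes in_dim/bias_dim/out_dim.
//   elementwise_add(&context, in, in_dim, bias, bias_dim, out, out_dim);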
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <string>
#include <utility>
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
cl::CommandQueue &CLContext::GetCommandQueue() {
return CLEngine::Global()->command_queue();
}
cl::Context &CLContext::GetContext() { return CLEngine::Global()->context(); }
cl::Program &CLContext::GetProgram(const std::string &file_name,
const std::string &options) {
std::string program_key = file_name;
if (!options.empty()) {
program_key += options;
}
auto it = programs_.find(program_key);
if (it != programs_.end()) {
VLOG(3) << " --- program -> " << program_key << " has been built --- ";
return *(it->second);
}
auto program = CLEngine::Global()->CreateProgram(
GetContext(), CLEngine::Global()->cl_path() + "/cl_kernel/" + file_name);
VLOG(3) << " --- begin build program -> " << program_key << " --- ";
CLEngine::Global()->BuildProgram(program.get(), options);
VLOG(3) << " --- end build program -> " << program_key << " --- ";
programs_[program_key] = std::move(program);
return *(programs_[program_key]);
}
std::unique_ptr<cl::Kernel> CLContext::GetKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options) {
cl_int status{CL_SUCCESS};
VLOG(3) << " --- to get program " << file_name << " --- ";
auto program = GetProgram(file_name, options);
VLOG(3) << " --- end get program --- ";
VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
std::unique_ptr<cl::Kernel> kernel(
new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_ERRORS(status);
VLOG(3) << " --- end create kernel --- ";
return std::move(kernel);
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/lite/opencl/cl2_header.h"
namespace paddle {
namespace lite {
class CLContext {
public:
cl::CommandQueue &GetCommandQueue();
cl::Context &GetContext();
cl::Program &GetProgram(const std::string &file_name,
const std::string &options);
std::unique_ptr<cl::Kernel> GetKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options);
private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
};
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
CLEngine* CLEngine::Global() {
static CLEngine cl_engine_;
cl_engine_.Init();
return &cl_engine_;
}
CLEngine::~CLEngine() {
if (command_queue_ != nullptr) {
command_queue_->finish();
}
// For controlling the destruction order:
command_queue_.reset();
context_.reset();
device_.reset();
platform_.reset();
}
bool CLEngine::Init() {
if (initialized_) {
return true;
}
bool is_platform_init = InitializePlatform();
bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init;
initialized_ = true;
return initialized_;
}
cl::Platform& CLEngine::platform() {
CHECK(platform_ != nullptr) << "platform_ is not initialized!";
return *platform_;
}
cl::Context& CLEngine::context() {
if (context_ == nullptr) {
context_ = CreateContext();
}
return *context_;
}
cl::Device& CLEngine::device() {
CHECK(device_ != nullptr) << "device_ is not initialized!";
return *device_;
}
cl::CommandQueue& CLEngine::command_queue() {
if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context());
}
return *command_queue_;
}
std::unique_ptr<cl::Program> CLEngine::CreateProgram(const cl::Context& context,
std::string file_name) {
std::ifstream file{file_name, std::ios::binary | std::ios::ate};
CHECK(file.is_open()) << "Can't open file from " << file_name;
auto size = file.tellg();
CHECK(size > 0) << "size is too small.";
std::string content(size, '\0');
file.seekg(0);
file.read(&content[0], size);
cl::Program::Sources sources;
sources.push_back(content);
auto prog =
std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
LOG(INFO) << "OpenCL kernel file name: " << file_name;
LOG(INFO) << "Program source size: " << content.size();
CL_CHECK_ERRORS(status_);
return std::move(prog);
}
std::unique_ptr<cl::UserEvent> CLEngine::CreateEvent(
const cl::Context& context) {
auto event =
std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
CL_CHECK_ERRORS(status_);
return std::move(event);
}
bool CLEngine::BuildProgram(cl::Program* program, const std::string& options) {
std::string build_option = options + " -cl-fast-relaxed-math -I " +
CLEngine::Global()->cl_path() + "/cl_kernel";
status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERRORS(status_);
if (status_ != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
CL_BUILD_ERROR) {
std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
LOG(INFO) << "Program build error: " << log;
}
return false;
}
return true;
}
bool CLEngine::InitializePlatform() {
std::vector<cl::Platform> all_platforms;
status_ = cl::Platform::get(&all_platforms);
CL_CHECK_ERRORS(status_);
if (all_platforms.empty()) {
LOG(ERROR) << "No OpenCL platform found!";
return false;
}
platform_ = std::make_shared<cl::Platform>();
*platform_ = all_platforms[0];
return true;
}
bool CLEngine::InitializeDevice() {
std::vector<cl::Device> all_devices;
status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
CL_CHECK_ERRORS(status_);
if (all_devices.empty()) {
LOG(ERROR) << "No OpenCL GPU device found!";
return false;
}
device_ = std::make_shared<cl::Device>();
*device_ = all_devices[0];
auto device_name = device_->getInfo<CL_DEVICE_NAME>();
LOG(INFO) << "Using device: " << device_name;
auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
if (image_support) {
LOG(INFO) << "The chosen device supports image processing.";
} else {
LOG(ERROR) << "The chosen device doesn't support image processing!";
return false;
}
auto ext_data = device_->getInfo<CL_DEVICE_EXTENSIONS>();
LOG(INFO) << "The extensions supported by this device: " << ext_data;
if (ext_data.find("cl_khr_fp16") != std::string::npos) {
LOG(INFO) << "The chosen device supports the half data type.";
} else {
LOG(ERROR) << "The chosen device doesn't support the half data type!";
return false;
}
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units.";
auto local_mem = device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
LOG(INFO) << "The local memory size of the chosen device is "
<< static_cast<float>(local_mem) / 1024 << " KB.";
return true;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
class CLEngine {
public:
static CLEngine* Global();
bool Init();
cl::Platform& platform();
cl::Context& context();
cl::Device& device();
cl::CommandQueue& command_queue();
std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::string file_name);
std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
bool BuildProgram(cl::Program* program, const std::string& options = "");
bool IsInitSuccess() { return is_init_success_; }
std::string cl_path() { return cl_path_; }
void set_cl_path(std::string cl_path) { cl_path_ = cl_path; }
private:
CLEngine() = default;
~CLEngine();
bool InitializePlatform();
bool InitializeDevice();
std::shared_ptr<cl::Context> CreateContext() {
auto context = std::make_shared<cl::Context>(
std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr, &status_);
CL_CHECK_ERRORS(status_);
return context;
}
std::shared_ptr<cl::CommandQueue> CreateCommandQueue(
const cl::Context& context) {
auto queue =
std::make_shared<cl::CommandQueue>(context, device(), 0, &status_);
CL_CHECK_ERRORS(status_);
return queue;
}
std::string cl_path_;
std::shared_ptr<cl::Platform> platform_{nullptr};
std::shared_ptr<cl::Context> context_{nullptr};
std::shared_ptr<cl::Device> device_{nullptr};
std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
cl_int status_{CL_SUCCESS};
bool initialized_{false};
bool is_init_success_{false};
};
} // namespace lite
} // namespace paddle
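// --- Usage sketch (illustrative only, not part of the library) ---
// A minimal example of how a caller might drive CLEngine. The kernel file
// name and the cl_path value below are hypothetical.
//
//   using paddle::lite::CLEngine;
//   CLEngine* engine = CLEngine::Global();
//   engine->set_cl_path("/data/local/tmp/opencl");
//   CHECK(engine->Init()) << "No usable OpenCL GPU device.";
//   auto program = engine->CreateProgram(
//       engine->context(),
//       engine->cl_path() + "/cl_kernel/elementwise_add_kernel.cl");
//   // BuildProgram appends "-cl-fast-relaxed-math -I <cl_path>/cl_kernel"
//   // to the user-supplied options.
//   CHECK(engine->BuildProgram(program.get()));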
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
static const uint32_t mantissatable[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000,
0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000,
0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000,
0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000,
0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000,
0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000,
0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000,
0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000,
0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000,
0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000,
0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000,
0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000,
0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000,
0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000,
0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000,
0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000,
0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000,
0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000,
0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000,
0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000,
0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000,
0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000,
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000,
0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000,
0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000,
0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000,
0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000,
0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000,
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000,
0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000,
0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000,
0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000,
0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000,
0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000,
0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000,
0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000,
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000,
0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000,
0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000,
0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000,
0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000,
0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000,
0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000,
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000,
0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000,
0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000,
0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000,
0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000,
0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000,
0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000,
0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000,
0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000,
0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000,
0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000,
0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000,
0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000,
0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000,
0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000,
0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000,
0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000,
0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000,
0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000,
0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000,
0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000,
0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000,
0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000,
0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000,
0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000,
0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000,
0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000,
0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000,
0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000,
0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000,
0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000,
0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000,
0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000,
0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000,
0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000,
0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000,
0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000,
0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000,
0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000,
0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000,
0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000,
0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000,
0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000,
0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000,
0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000,
0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000,
0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000,
0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000,
0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000,
0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000,
0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000,
0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000,
0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000,
0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000,
0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000,
0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000,
0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000,
0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000,
0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000,
0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000,
0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000,
0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000,
0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000,
0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000,
0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000,
0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000,
0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000,
0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000,
0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000,
0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000,
0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000,
0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000,
0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000,
0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000,
0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000,
0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000,
0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000,
0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000,
0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000,
0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000,
0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000,
0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000,
0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000,
0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000,
0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000,
0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000,
0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000,
0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000,
0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000,
0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000,
0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000,
0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000,
0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000,
0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000,
0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000,
0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000,
0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000,
0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000,
0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000,
0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000,
0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000,
0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000,
0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000,
0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000,
0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000,
0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000,
0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000,
0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000,
0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000,
0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000,
0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000,
0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000,
0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000,
0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000,
0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000,
0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000,
0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000,
0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000,
0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000,
0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000,
0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000,
0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000,
0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000,
0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000,
0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000,
0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000,
0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000,
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000,
0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000,
0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000,
0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000,
0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000,
0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000,
0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000,
0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000,
0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000,
0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000,
0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000,
0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000,
0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000,
0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000,
0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000,
0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000,
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000,
0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000,
0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000,
0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000,
0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000,
0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000,
0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000,
0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000,
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000,
0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000,
0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000,
0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000,
0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000,
0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000,
0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000,
0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000,
0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000,
0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000,
0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000,
0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000,
0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000,
0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000,
0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000,
0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000,
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000,
0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000,
0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000,
0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000,
0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000,
0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000,
0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000,
0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000,
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000,
0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000,
0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000,
0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000,
0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000,
0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000,
0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000,
0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000,
0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000,
0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000,
0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000,
0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000,
0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000,
0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000,
0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000,
0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000,
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000,
0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000,
0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000,
0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000,
0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000,
0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000,
0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000,
0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000,
0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000,
0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000,
0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000,
0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000,
0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000,
0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000,
0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000,
0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000,
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000,
0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000,
0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000,
0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000,
0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000,
0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000,
0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000,
0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000,
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000,
0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000,
0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000,
0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000,
0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000,
0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000,
0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000,
0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000,
0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000,
0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000,
0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000,
0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000,
0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000,
0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000,
0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000,
0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000,
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000,
0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000,
0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000,
0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000,
0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000,
0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000,
0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000,
0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000,
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000,
0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000,
0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000,
0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000,
0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000,
0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000,
0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000,
0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000,
0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000,
0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000,
0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000,
0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000,
0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000,
0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000,
0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000,
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000,
0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000,
0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000,
0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000,
0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000,
0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000,
0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000,
0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000,
0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000,
0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000,
0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000,
0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000,
0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000,
0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000,
0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000,
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000,
0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000,
0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000,
0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000,
0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000,
0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000,
0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000,
0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000,
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000,
0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000,
0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000,
0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000,
0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000,
0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000,
0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000,
0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000,
0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000,
0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000,
0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000,
0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000,
0x387fc000, 0x387fe000};
static const uint16_t offsettable[64] = {
0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
static const uint32_t exponenttable[64] = {
0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000,
0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000,
0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000,
0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000,
0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000};
static const uint16_t basetable[512] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010,
0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000,
0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400,
0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800,
0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001,
0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200,
0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400,
0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800,
0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00};
static const uint8_t shifttable[512] = {
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
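// Table-driven conversion, following the fast half-float method referenced
// above (fox-toolkit paper): the top 9 bits of the float (sign + 8-bit
// exponent) index basetable/shifttable to produce the half's sign/exponent
// bits and the mantissa shift, so no per-value branching is needed. The
// reverse direction uses mantissatable/exponenttable/offsettable to rebuild
// the 32-bit pattern, including denormals, in a handful of table lookups.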
half_t Float2Half(float f) {
  uint32_t v;
  std::memcpy(&v, &f, sizeof(v));  // bit-cast that avoids strict-aliasing UB
  return basetable[(v >> 23) & 0x1ff] +
         ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]);
}
float Half2Float(half_t h) {
  uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
               exponenttable[h >> 10];
  float f;
  std::memcpy(&f, &v, sizeof(f));
  return f;
}
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
for (int i = 0; i < count; ++i) {
h_array[i] = Float2Half(f_array[i]);
}
}
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
for (int i = 0; i < count; ++i) {
f_array[i] = Half2Float(h_array[i]);
}
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle {
namespace lite {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
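// --- Usage sketch (illustrative only, not part of the library) ---
// Round-tripping a buffer through the half-precision helpers. fp16 keeps a
// 10-bit mantissa, so expect roughly three decimal digits of precision.
//
//   float src[4] = {0.f, 1.f, -2.5f, 3.14159f};
//   paddle::lite::half_t buf[4];
//   float dst[4];
//   paddle::lite::FloatArray2HalfArray(src, buf, 4);
//   paddle::lite::HalfArray2FloatArray(buf, dst, 4);
//   // dst == {0, 1, -2.5, ~3.1406}: the first three values are exactly
//   // representable in fp16; pi is truncated to the nearest-below half.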
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
void CLHelper::set_context(CLContext *context) { context_ = context; }
void CLHelper::AddKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options) {
  CHECK(context_ != nullptr) << "Please call set_context first!";
VLOG(3) << " --- begin to add kernel ---";
auto kernel = context_->GetKernel(kernel_name, file_name, options);
kernels.emplace_back(std::move(kernel));
VLOG(3) << " --- end to add kernel --- ";
}
cl::Kernel &CLHelper::KernelAt(const int index) {
VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels.size())
<< "The index must be less than the size of kernels.";
CHECK(kernels[index] != nullptr)
<< "The target kernel pointer cannot be null.";
return *(kernels[index]);
}
cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
  CHECK(context_ != nullptr) << "Please call set_context first!";
return context_->GetCommandQueue();
}
cl::Context &CLHelper::OpenCLContext() {
  CHECK(context_ != nullptr) << "Please call set_context first!";
return context_->GetContext();
}
cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) {
// n c h w
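  // Heuristic mapping: for a 4-D tensor the global work size is
  // (C-blocks, W, N*H), where C-blocks = image_width / W = ceil(C / 4);
  // lower-rank tensors map directly onto the 2-D image dimensions.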
auto image_dim = image.tensor_dims();
if (image_dim.size() == 4) {
auto n = image_dim[0];
auto h = image_dim[2];
auto w = image_dim[3];
auto image_width = image.ImageWidth();
auto work_size_0 = image_width / w;
auto work_size_1 = w;
auto work_size_2 = n * h;
return cl::NDRange{static_cast<size_t>(work_size_0),
static_cast<size_t>(work_size_1),
static_cast<size_t>(work_size_2)};
} else if (image_dim.size() == 2) {
return cl::NDRange{static_cast<size_t>(1),
static_cast<size_t>(image.ImageWidth()),
static_cast<size_t>(image.ImageHeight())};
} else if (image_dim.size() == 1) {
return cl::NDRange{static_cast<size_t>(1),
static_cast<size_t>(image.ImageWidth()),
static_cast<size_t>(1)};
} else if (image_dim.size() == 3) {
auto c = image_dim[0];
auto h = image_dim[1];
auto w = image_dim[2];
return cl::NDRange{static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
static_cast<size_t>(h)};
} else {
    LOG(FATAL) << "Unsupported dimension size: " << image_dim.size()
               << ", needs to be implemented!";
return cl::NDRange{};
}
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
namespace paddle {
namespace lite {
class CLHelper {
public:
CLHelper() = default;
explicit CLHelper(CLContext *context) : context_(context) {}
void set_context(CLContext *context);
void AddKernel(const std::string &kernel_name, const std::string &file_name,
const std::string &options = "");
cl::Kernel &KernelAt(const int index);
cl::CommandQueue &OpenCLCommandQueue();
cl::Context &OpenCLContext();
cl::NDRange DefaultWorkSize(const CLImage &image);
private:
CLContext *context_{nullptr};
std::vector<std::unique_ptr<cl::Kernel>> kernels;
};
} // namespace lite
} // namespace paddle
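// --- Usage sketch (illustrative only, not part of the library) ---
// Typical kernel setup and launch; the kernel name, file name, cl_context
// and out_image below are hypothetical.
//
//   paddle::lite::CLHelper helper(&cl_context);
//   helper.AddKernel("pool_max", "pool_kernel.cl");
//   cl::Kernel& kernel = helper.KernelAt(0);
//   cl::NDRange global = helper.DefaultWorkSize(out_image);
//   helper.OpenCLCommandQueue().enqueueNDRangeKernel(
//       kernel, cl::NullRange, global, cl::NullRange);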
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image.h"
#include <glog/logging.h>
#include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1];
half_t* image_data = new half_t[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
float* tensor_data = new float[cl_image.numel()];
auto* converter = cl_image.image_converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.image_dims_,
cl_image.tensor_dims_);
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
os << " dims: " << cl_image.tensor_dims_ << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
os << tensor_data[i] << " ";
}
delete[] tensor_data;
delete[] image_data;
return os;
}
void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
auto numel = dim.product();
#else
auto numel = dim.production();
#endif
tensor_data_.reset(new float[numel]);
memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float));
tensor_dims_ = dim;
}
void CLImage::InitCLImage(const cl::Context& context) {
CHECK(tensor_data_ != nullptr) << " Please call "
"set_tensohelper->DefaultWorkSize(out_"
"image)r_data first!";
image_converter_.reset(new CLImageConverterFolder);
InitCLImage(context, image_converter_.get());
}
void CLImage::InitNormalCLImage(const cl::Context& context) {
CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
image_converter_.reset(new CLImageConverterNormal);
InitCLImage(context, image_converter_.get());
}
void CLImage::InitNImage(const cl::Context& context) {
CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
image_converter_.reset(new CLImageConverterNWBlock);
InitCLImage(context, image_converter_.get());
}
void CLImage::InitDWImage(const cl::Context& context) {
CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
image_converter_.reset(new CLImageConverterDWBlock);
InitCLImage(context, image_converter_.get());
}
void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
  CHECK(tensor_data_ == nullptr)
      << " Tensor data should be empty when initializing an empty image!";
tensor_dims_ = dim;
image_converter_.reset(new CLImageConverterNormal);
VLOG(3) << " to get image dims ";
image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
VLOG(3) << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
cl_event_ = CLEngine::Global()->CreateEvent(context);
initialized_ = true;
VLOG(3) << " end init cl image ";
}
void CLImage::InitEmptyWithImageDim(const cl::Context& context,
const DDim& image_dims) {
VLOG(3) << " to get image dims ";
image_dims_ = image_dims;
VLOG(3) << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
cl_event_ = CLEngine::Global()->CreateEvent(context);
initialized_ = true;
VLOG(3) << " end init cl image";
}
void CLImage::InitCLImage(const cl::Context& context,
CLImageConverterBase* converter) {
CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
VLOG(3) << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
half_t* image_data = new half_t[image_dims_.product() * 4];
#else
half_t* image_data = new half_t[image_dims_.production() * 4];
#endif
VLOG(3) << " convert to image ";
converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
VLOG(3) << " end convert to image ";
InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
delete[] image_data;
tensor_data_ = nullptr;
cl_event_ = CLEngine::Global()->CreateEvent(context);
initialized_ = true;
VLOG(3) << " end init cl image ";
}
void CLImage::InitCLImage(const cl::Context& context, int width, int height,
void* data) {
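  // Each texel is RGBA with CL_HALF_FLOAT channels, so one texel carries
  // four packed values; when `data` is non-null, CL_MEM_COPY_HOST_PTR
  // uploads it at creation time.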
cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
cl_int err;
cl_image_.reset(new cl::Image2D(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
img_format, width, height, 0, data, &err));
CL_CHECK_ERRORS(err);
  CHECK(err == CL_SUCCESS) << " Failed to create cl::Image2D.";
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl2_header.h"
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
namespace paddle {
namespace lite {
class CLImage {
// For debug
friend std::ostream& operator<<(std::ostream& os, const CLImage& image);
public:
CLImage() = default;
  /*
   * Does not take ownership of tensor_data; the data is copied inside.
   * */
void set_tensor_data(float* tensor_data, const DDim& dim);
bool IsInit() { return initialized_; }
  /*
   * Call set_tensor_data first. Uses the folder converter when the tensor
   * has one or two dims.
   * */
void InitCLImage(const cl::Context& context);
void InitNormalCLImage(const cl::Context& context);
void InitNImage(const cl::Context& context);
void InitDWImage(const cl::Context& context);
void InitEmptyImage(const cl::Context& context, const DDim& dim);
void InitEmptyWithImageDim(const cl::Context& context,
const DDim& image_dims);
cl::Image* cl_image() const { return cl_image_.get(); }
const DDim& image_dims() const { return image_dims_; }
inline size_t ImageWidth() const { return image_dims_[0]; }
inline size_t ImageHeight() const { return image_dims_[1]; }
const DDim& tensor_dims() const { return tensor_dims_; }
  /*
   * Resize the original tensor dim.
   * */
inline CLImage& Resize(const DDim& dims) {
tensor_dims_ = dims;
return *this;
}
template <typename T>
T* data() const {
    CHECK(!initialized_) << "The CL image has been initialized; its tensor "
                            "data was released and can no longer be used!";
    return reinterpret_cast<T*>(tensor_data_.get());
}
/*
* Numel of tensor dim
* */
inline int64_t numel() const {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
return tensor_dims_.product();
#else
return tensor_dims_.production();
#endif
}
cl::UserEvent& cl_event() const { return *cl_event_; }
CLImageConverterBase* image_converter() const {
return image_converter_.get();
}
private:
void InitCLImage(const cl::Context& context, CLImageConverterBase* converter);
void InitCLImage(const cl::Context& context, int width, int height,
void* data);
bool initialized_ = false;
std::unique_ptr<cl::Image2D> cl_image_{nullptr};
std::unique_ptr<cl::UserEvent> cl_event_{nullptr};
DDim tensor_dims_;
DDim image_dims_;
  std::unique_ptr<float[]> tensor_data_{nullptr};  // array form: delete[]
std::unique_ptr<CLImageConverterBase> image_converter_{nullptr};
};
} // namespace lite
} // namespace paddle
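// --- Usage sketch (illustrative only, not part of the library) ---
// Uploading a 4-D tensor into an OpenCL image; `weights` is a hypothetical
// host buffer, and set_tensor_data copies it, so the caller keeps ownership.
//
//   paddle::lite::CLImage image;
//   DDim dims(std::vector<DDim::value_type>({1, 32, 16, 16}));  // N C H W
//   image.set_tensor_data(weights, dims);
//   image.InitNormalCLImage(CLEngine::Global()->context());
//   CHECK(image.IsInit());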
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_image_converter.h"
#include <glog/logging.h>
#include <vector>
namespace paddle {
namespace lite {
DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) {
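  // Packed layout written here: the image is W * ceil(C/4) texels wide and
  // H * N texels tall; element (n, c, h, w) lands at flat half index
  //   ((n * H + h) * width + (c / 4) * W + w) * 4 + c % 4,
  // and channel slots beyond C are zero-filled.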
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
VLOG(3) << " tensor dim: " << tensor_dim;
VLOG(3) << " image dim: " << in_image_dim;
size_t width = in_image_dim[0];
size_t w_block = width / W;
float *p = nchw;
size_t i0 = 0;
for (size_t n = 0; n < N; n++) {
for (size_t c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = image_dim[0];
float *p = tensor;
size_t i0 = 0;
for (size_t n = 0; n < N; n++) {
for (size_t c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
if (tensor_dim.size() <= 2) {
size_t tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
size_t width = (tdim[1] + 3) / 4;
size_t height = tdim[0];
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
} else {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
  CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
      << " Tensor dim size is not supported!";
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
} else {
size_t tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
DDim image_dim = InitImageDimInfoWith(tensor_dim);
size_t width = image_dim[0];
for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
} else {
size_t width = image_dim[0];
size_t H = 1, W = 1;
if (tensor_dim.size() == 2) {
H = tensor_dim[0];
W = tensor_dim[1];
} else if (tensor_dim.size() == 1) {
W = tensor_dim[0];
}
float *p = tensor;
for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
}
DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim);
float *p = tensor;
size_t N = tensor_dim[0];
size_t C = tensor_dim[1];
size_t H = tensor_dim[2];
size_t W = tensor_dim[3];
size_t width = image_dim[0];
size_t height = image_dim[1];
size_t block = image_dim[0] / tensor_dim[3];
for (size_t n = 0; n < block * 4; n++) {
for (size_t c = 0; c < C; c++) {
for (size_t h = 0; h < H; ++h) {
for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
}
if (index >= (width * height * 4)) {
            LOG(ERROR) << " Image index " << index << " is out of range!";
}
}
}
}
}
VLOG(3) << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
float *p = tensor;
size_t N = tensor_dim[0];
size_t C = tensor_dim[1];
size_t H = tensor_dim[2];
size_t W = tensor_dim[3];
size_t width = image_dim[0];
size_t height = image_dim[1];
for (size_t n = 0; n < N; n++) {
for (size_t c = 0; c < C; c++) {
for (size_t h = 0; h < H; ++h) {
for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
// Guard the read: indices past the image buffer are logged and skipped.
if (index >= (width * height * 4)) {
LOG(ERROR) << " index out of range ";
} else {
*p = Half2Float(image[index]);
}
p++;
}
}
}
}
VLOG(3) << " init done";
}
DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[1];
C = new_dims[0];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
VLOG(3) << " tensor dim: " << tensor_dim;
VLOG(3) << " image dim: " << in_image_dim;
size_t width = in_image_dim[0];
size_t w_block = width / W;
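// Note N and C were swapped above (depthwise filter layout), so four
// consecutive channels of the swapped tensor share one RGBA pixel; entries
// beyond C in the w_block * 4 range are zero-filled.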
float *p = tensor;
size_t i0 = 0;
for (size_t n = 0; n < N; n++) {
for (size_t c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = Float2Half(0.f);
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
float *p = tensor;
size_t N = tensor_dim[1];
size_t C = tensor_dim[0];
size_t H = tensor_dim[2];
size_t W = tensor_dim[3];
size_t width = image_dim[0];
size_t i0 = 0;
for (size_t n = 0; n < N; n++) {
for (size_t c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not supported!";
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
size_t N, C;
N = tensor_dim[0];
C = tensor_dim[1];
size_t width = (C + 3) / 4;
size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2)
return DDim(
std::vector<DDim::value_type>({static_cast<DDim::value_type>(width),
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
class CLImageConverterBase {
public:
virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
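// Default layout (shared with CLImageConverterNormal below): a [N, C, H, W]
// tensor maps to a W * ceil(C / 4) x H * N RGBA image, packing 4 consecutive
// channels into one pixel.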
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/* width of the original tensor */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/* height of the original tensor */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/* width of the original tensor */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/* height of the original tensor */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace lite
} // namespace paddle
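// A minimal round-trip sketch for the converters above (an illustration, not
// part of the library; it assumes only the declarations in this header):
//   CLImageConverterDefault converter;
//   const DDim dim(std::vector<DDim::value_type>({1, 2, 2, 4}));
//   const DDim image_dim = converter.InitImageDimInfoWith(dim);
//   std::vector<float> nchw(1 * 2 * 2 * 4, 1.f);
//   std::vector<half_t> image(image_dim[0] * image_dim[1] * 4);
//   converter.NCHWToImage(nchw.data(), image.data(), dim);
//   converter.ImageToNCHW(image.data(), nchw.data(), image_dim, dim);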
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
inline half4 activation(half4 in
#ifdef PRELU
,
half4 prelu_alpha
#endif
) {
// Default to identity so the result is defined when neither PRELU nor RELU
// is enabled at compile time.
half4 output = in;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0);
#endif
#ifdef RELU
output = fmax(in, (half4)(0.0f));
#endif
return output;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
// Integer coordinates require an unnormalized-coordinate sampler.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 bias_val = read_imageh(bias, sampler, coords);
half4 output = in + bias_val;
write_imageh(outputImage, coords, output);
}
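// Host-side note (inferred from the test code elsewhere in this tree): the
// kernel is enqueued with a 2-D global work size equal to the image width and
// height, one work-item per RGBA pixel.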
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
__kernel void pool_max(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
}
__kernel void pool_avg(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Clamp the start only after computing the end, so the window is the
// intersection of the kernel window with the input (matching pool_max).
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f);
int num = 0;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++;
}
}
half4 avg = sum / num;
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg);
}
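// Host-side note (inferred from the index math above): the expected global
// work size is { ceil(C / 4), out_width, N * out_height }, one work-item per
// output pixel.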
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");
namespace paddle {
namespace lite {
TEST(cl_test, engine_test) {
auto* engine = CLEngine::Global();
CHECK(engine->IsInitSuccess());
engine->set_cl_path(FLAGS_cl_path);
engine->platform();
engine->device();
engine->command_queue();
auto& context = engine->context();
auto program = engine->CreateProgram(
context, engine->cl_path() + "/cl_kernel/" + "elementwise_add_kernel.cl");
auto event = engine->CreateEvent(context);
CHECK(engine->BuildProgram(program.get()));
}
TEST(cl_test, context_test) {
auto* engine = CLEngine::Global();
CHECK(engine->IsInitSuccess());
engine->set_cl_path(FLAGS_cl_path);
CLContext context;
context.GetKernel("pool_max", "pool_kernel.cl", "");
context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
}
TEST(cl_test, kernel_test) {
auto* engine = CLEngine::Global();
CHECK(engine->IsInitSuccess());
engine->set_cl_path(FLAGS_cl_path);
std::unique_ptr<CLContext> context(new CLContext);
// std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
std::unique_ptr<CLHelper> helper(new CLHelper);
helper->set_context(context.get());
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("pool_max", "pool_kernel.cl");
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper->KernelAt(2);
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
in_data[i] = 1.f;
}
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
CLImage in_image;
in_image.set_tensor_data(in_data.get(), in_dim);
in_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << in_image;
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
bias_data[i] = 2.f;
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
CLImage bias_image;
bias_image.set_tensor_data(bias_data.get(), bias_dim);
bias_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << bias_image;
CLImage out_image;
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
LOG(INFO) << out_image;
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_ERRORS(status);
status = kernel.setArg(1, *bias_image.cl_image());
CL_CHECK_ERRORS(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_ERRORS(status);
// auto global_work_size = helper->DefaultWorkSize(out_image);
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
CL_CHECK_ERRORS(status);
event.wait();  // Profiling info is valid only after the command completes.
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
LOG(INFO) << out_image;
}
TEST(cl_test, elementwise_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
bias_data[i] = dist(engine);
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> out(new float[1024 * 512]);
bool status = InitOpenCLEngine(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL engine.";
CLContext context;
elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
out.get(), out_dim);
int stride = 1024 * 512 / 20;
for (int i = 0; i < 1024 * 512; i += stride) {
std::cout << out[i] << " ";
}
std::cout << std::endl;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
namespace lite {
const char *opencl_error_to_str(cl_int error) {
#define CASE_CL_CONSTANT(NAME) \
case NAME: \
return #NAME;
// Assumes error codes are never combined into one value.
switch (error) {
CASE_CL_CONSTANT(CL_SUCCESS)
CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
CASE_CL_CONSTANT(CL_MAP_FAILURE)
CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_VALUE)
CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
CASE_CL_CONSTANT(CL_INVALID_DEVICE)
CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
CASE_CL_CONSTANT(CL_INVALID_BINARY)
CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
CASE_CL_CONSTANT(CL_INVALID_KERNEL)
CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_EVENT)
CASE_CL_CONSTANT(CL_INVALID_OPERATION)
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
default:
return "UNKNOWN ERROR CODE";
}
#undef CASE_CL_CONSTANT
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/lite/opencl/cl2_header.h"
namespace paddle {
namespace lite {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR)                                           \
do {                                                                   \
if ((ERR) != CL_SUCCESS) {                                             \
printf(                                                                \
"OpenCL error with code %s happened in file %s at line %d.\n",         \
opencl_error_to_str(ERR), __FILE__, __LINE__);                         \
}                                                                      \
} while (0)
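// Typical usage (as in the tests in this tree):
//   cl_int status = kernel.setArg(0, *in_image.cl_image());
//   CL_CHECK_ERRORS(status);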
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <dlfcn.h>
#include <glog/logging.h>
#include <string>
#include <vector>
#include "paddle/fluid/lite/opencl/cl2_header.h"
/**
* Wrapper of OpenCL 2.0, based on file opencl20/CL/cl.h
*/
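/**
 * Each cl* entry point defined below resolves its symbol lazily through the
 * OpenCLLibrary singleton; when the vendor library (or a symbol) is missing,
 * the wrapper returns CL_INVALID_PLATFORM or a null object instead of
 * crashing at load time.
 */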
#if CL_HPP_TARGET_OPENCL_VERSION < 200
#define CL_API_SUFFIX__VERSION_2_0
#endif
namespace paddle {
namespace lite {
class OpenCLLibrary final {
private:
OpenCLLibrary();
OpenCLLibrary(const OpenCLLibrary &) = delete;
OpenCLLibrary &operator=(const OpenCLLibrary &) = delete;
bool Load();
void *LoadFromPath(const std::string &path);
public:
static OpenCLLibrary *Get();
using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);
using clGetPlatformInfoFunc = cl_int (*)(cl_platform_id, cl_platform_info,
size_t, void *, size_t *);
using clBuildProgramFunc = cl_int (*)(cl_program, cl_uint,
const cl_device_id *, const char *,
void (*pfn_notify)(cl_program, void *),
void *);
using clEnqueueNDRangeKernelFunc = cl_int (*)(cl_command_queue, cl_kernel,
cl_uint, const size_t *,
const size_t *, const size_t *,
cl_uint, const cl_event *,
cl_event *);
using clSetKernelArgFunc = cl_int (*)(cl_kernel, cl_uint, size_t,
const void *);
using clRetainMemObjectFunc = cl_int (*)(cl_mem);
using clReleaseMemObjectFunc = cl_int (*)(cl_mem);
using clEnqueueUnmapMemObjectFunc = cl_int (*)(cl_command_queue, cl_mem,
void *, cl_uint,
const cl_event *, cl_event *);
using clRetainCommandQueueFunc = cl_int (*)(cl_command_queue command_queue);
using clCreateContextFunc = cl_context (*)(
const cl_context_properties *, cl_uint, const cl_device_id *,
void(CL_CALLBACK *)( // NOLINT(readability/casting)
const char *, const void *, size_t, void *),
void *, cl_int *);
using clCreateContextFromTypeFunc =
cl_context (*)(const cl_context_properties *, cl_device_type,
void(CL_CALLBACK *)( // NOLINT(readability/casting)
const char *, const void *, size_t, void *),
void *, cl_int *);
using clReleaseContextFunc = cl_int (*)(cl_context);
using clWaitForEventsFunc = cl_int (*)(cl_uint, const cl_event *);
using clReleaseEventFunc = cl_int (*)(cl_event);
using clEnqueueWriteBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool,
size_t, size_t, const void *,
cl_uint, const cl_event *,
cl_event *);
using clEnqueueReadBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool,
size_t, size_t, void *, cl_uint,
const cl_event *, cl_event *);
using clEnqueueReadImageFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool,
const size_t *, const size_t *,
size_t, size_t, void *, cl_uint,
const cl_event *, cl_event *);
using clGetProgramBuildInfoFunc = cl_int (*)(cl_program, cl_device_id,
cl_program_build_info, size_t,
void *, size_t *);
using clRetainProgramFunc = cl_int (*)(cl_program program);
using clEnqueueMapBufferFunc = void *(*)(cl_command_queue, cl_mem, cl_bool,
cl_map_flags, size_t, size_t,
cl_uint, const cl_event *,
cl_event *, cl_int *);
using clEnqueueMapImageFunc = void *(*)(cl_command_queue, cl_mem, cl_bool,
cl_map_flags, const size_t *,
const size_t *, size_t *, size_t *,
cl_uint, const cl_event *, cl_event *,
cl_int *);
using clCreateCommandQueueFunc = cl_command_queue(CL_API_CALL *)( // NOLINT
cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
using clCreateCommandQueueWithPropertiesFunc = cl_command_queue (*)(
cl_context, cl_device_id, const cl_queue_properties *, cl_int *);
using clReleaseCommandQueueFunc = cl_int (*)(cl_command_queue);
using clCreateProgramWithBinaryFunc = cl_program (*)(cl_context, cl_uint,
const cl_device_id *,
const size_t *,
const unsigned char **,
cl_int *, cl_int *);
using clRetainContextFunc = cl_int (*)(cl_context context);
using clGetContextInfoFunc = cl_int (*)(cl_context, cl_context_info, size_t,
void *, size_t *);
using clReleaseProgramFunc = cl_int (*)(cl_program program);
using clFlushFunc = cl_int (*)(cl_command_queue command_queue);
using clFinishFunc = cl_int (*)(cl_command_queue command_queue);
using clGetProgramInfoFunc = cl_int (*)(cl_program, cl_program_info, size_t,
void *, size_t *);
using clCreateKernelFunc = cl_kernel (*)(cl_program, const char *, cl_int *);
using clRetainKernelFunc = cl_int (*)(cl_kernel kernel);
using clCreateBufferFunc = cl_mem (*)(cl_context, cl_mem_flags, size_t,
void *, cl_int *);
using clCreateImage2DFunc = cl_mem(CL_API_CALL *)(cl_context, // NOLINT
cl_mem_flags,
const cl_image_format *,
size_t, size_t, size_t,
void *, cl_int *);
using clCreateImageFunc = cl_mem (*)(cl_context, cl_mem_flags,
const cl_image_format *,
const cl_image_desc *, void *, cl_int *);
using clCreateUserEventFunc = cl_event (*)(cl_context, cl_int *);
using clCreateProgramWithSourceFunc = cl_program (*)(cl_context, cl_uint,
const char **,
const size_t *,
cl_int *);
using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel);
using clGetDeviceInfoFunc = cl_int (*)(cl_device_id, cl_device_info, size_t,
void *, size_t *);
using clGetDeviceIDsFunc = cl_int (*)(cl_platform_id, cl_device_type, cl_uint,
cl_device_id *, cl_uint *);
using clRetainDeviceFunc = cl_int (*)(cl_device_id);
using clReleaseDeviceFunc = cl_int (*)(cl_device_id);
using clRetainEventFunc = cl_int (*)(cl_event);
using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel, cl_device_id,
cl_kernel_work_group_info,
size_t, void *, size_t *);
using clGetEventInfoFunc = cl_int (*)(cl_event event,
cl_event_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
using clGetImageInfoFunc = cl_int (*)(cl_mem, cl_image_info, size_t, void *,
size_t *);
#define PADDLE_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr
PADDLE_CL_DEFINE_FUNC_PTR(clGetPlatformIDs);
PADDLE_CL_DEFINE_FUNC_PTR(clGetPlatformInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clBuildProgram);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueNDRangeKernel);
PADDLE_CL_DEFINE_FUNC_PTR(clSetKernelArg);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseKernel);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateProgramWithSource);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateBuffer);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateImage);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateImage2D);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateUserEvent);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainKernel);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateKernel);
PADDLE_CL_DEFINE_FUNC_PTR(clGetProgramInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clFlush);
PADDLE_CL_DEFINE_FUNC_PTR(clFinish);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseProgram);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainContext);
PADDLE_CL_DEFINE_FUNC_PTR(clGetContextInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateProgramWithBinary);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateCommandQueue);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseCommandQueue);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueMapBuffer);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueMapImage);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainProgram);
PADDLE_CL_DEFINE_FUNC_PTR(clGetProgramBuildInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueReadBuffer);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueReadImage);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueWriteBuffer);
PADDLE_CL_DEFINE_FUNC_PTR(clWaitForEvents);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseEvent);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateContext);
PADDLE_CL_DEFINE_FUNC_PTR(clCreateContextFromType);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseContext);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainCommandQueue);
PADDLE_CL_DEFINE_FUNC_PTR(clEnqueueUnmapMemObject);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainMemObject);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseMemObject);
PADDLE_CL_DEFINE_FUNC_PTR(clGetDeviceInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clGetDeviceIDs);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainDevice);
PADDLE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
PADDLE_CL_DEFINE_FUNC_PTR(clRetainEvent);
PADDLE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
PADDLE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
#undef PADDLE_CL_DEFINE_FUNC_PTR
private:
void *handle_ = nullptr;
};
OpenCLLibrary *OpenCLLibrary::Get() {
static OpenCLLibrary library;
return &library;
}
OpenCLLibrary::OpenCLLibrary() {
this->Load();
// Do not call dlclose, which may unload all OpenCL symbols: if the library
// were closed here, destructors of static OpenCL objects could fail. Without
// dlclose the library is simply released when the program exits, and it will
// not be loaded repeatedly even if dlopen is called many times.
}
bool OpenCLLibrary::Load() {
if (handle_ != nullptr) {
return true;
}
// Add customized OpenCL search path here
const std::vector<std::string> paths = {
"libOpenCL.so",
#if defined(__aarch64__)
// Qualcomm Adreno with Android
"/system/vendor/lib64/libOpenCL.so",
"/system/lib64/libOpenCL.so",
// Mali with Android
"/system/vendor/lib64/egl/libGLES_mali.so",
"/system/lib64/egl/libGLES_mali.so",
// Typical Linux board
"/usr/lib/aarch64-linux-gnu/libOpenCL.so",
#else
// Qualcomm Adreno with Android
"/system/vendor/lib/libOpenCL.so",
"/system/lib/libOpenCL.so",
// Mali with Android
"/system/vendor/lib/egl/libGLES_mali.so",
"/system/lib/egl/libGLES_mali.so",
// Typical Linux board
"/usr/lib/arm-linux-gnueabihf/libOpenCL.so",
#endif
};
for (const auto &path : paths) {
VLOG(3) << "Loading OpenCL from " << path;
void *handle = LoadFromPath(path);
if (handle != nullptr) {
handle_ = handle;
break;
}
}
if (handle_ == nullptr) {
LOG(ERROR)
<< "Failed to load the OpenCL library. Please make sure an OpenCL "
"library exists on your device and that the app has permission to "
"access it.";
return false;
}
return true;
}
void *OpenCLLibrary::LoadFromPath(const std::string &path) {
void *handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
if (handle == nullptr) {
VLOG(3) << "Failed to load OpenCL library from path " << path
<< " error code: " << dlerror();
return nullptr;
}
#define PADDLE_CL_ASSIGN_FROM_DLSYM(func) \
do { \
void *ptr = dlsym(handle, #func); \
if (ptr == nullptr) { \
VLOG(1) << "Failed to load " << #func << " from " << path; \
continue; \
} \
func = reinterpret_cast<func##Func>(ptr); \
VLOG(3) << "Loaded " << #func << " from " << path; \
} while (false)
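// Note: the do { ... } while (false) wrapper makes the macro statement-safe,
// and `continue` exits the single-pass loop, skipping the assignment when a
// symbol is missing from the library.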
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetPlatformIDs);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetPlatformInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clBuildProgram);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueNDRangeKernel);
PADDLE_CL_ASSIGN_FROM_DLSYM(clSetKernelArg);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseKernel);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateBuffer);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateImage);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateImage2D);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateUserEvent);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainKernel);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateKernel);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetProgramInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clFlush);
PADDLE_CL_ASSIGN_FROM_DLSYM(clFinish);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseProgram);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainContext);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetContextInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithBinary);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateCommandQueue);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapImage);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainProgram);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueReadImage);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueWriteBuffer);
PADDLE_CL_ASSIGN_FROM_DLSYM(clWaitForEvents);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseEvent);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateContext);
PADDLE_CL_ASSIGN_FROM_DLSYM(clCreateContextFromType);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseContext);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainCommandQueue);
PADDLE_CL_ASSIGN_FROM_DLSYM(clEnqueueUnmapMemObject);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainMemObject);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseMemObject);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetDeviceInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetDeviceIDs);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainDevice);
PADDLE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice);
PADDLE_CL_ASSIGN_FROM_DLSYM(clRetainEvent);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
PADDLE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef PADDLE_CL_ASSIGN_FROM_DLSYM
return handle;
}
} // namespace lite
} // namespace paddle
CL_API_ENTRY cl_event clCreateUserEvent(cl_context context, cl_int *errcode_ret)
CL_API_SUFFIX__VERSION_1_1 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateUserEvent;
if (func != nullptr) {
return func(context, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
// Platform APIs
CL_API_ENTRY cl_int
clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformIDs;
if (func != nullptr) {
return func(num_entries, platforms, num_platforms);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int
clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformInfo;
if (func != nullptr) {
return func(platform, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
// Device APIs
CL_API_ENTRY cl_int clGetDeviceIDs(
cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceIDs;
if (func != nullptr) {
return func(platform, device_type, num_entries, devices, num_devices);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int
clGetDeviceInfo(cl_device_id device, cl_device_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceInfo;
if (func != nullptr) {
return func(device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clRetainDevice(cl_device_id device)
CL_API_SUFFIX__VERSION_1_2 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainDevice;
if (func != nullptr) {
return func(device);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseDevice(cl_device_id device)
CL_API_SUFFIX__VERSION_1_2 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseDevice;
if (func != nullptr) {
return func(device);
} else {
return CL_INVALID_PLATFORM;
}
}
// Context APIs
CL_API_ENTRY cl_context clCreateContext(
const cl_context_properties *properties, cl_uint num_devices,
const cl_device_id *devices,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContext;
if (func != nullptr) {
return func(properties, num_devices, devices, pfn_notify, user_data,
errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_context clCreateContextFromType(
const cl_context_properties *properties, cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContextFromType;
if (func != nullptr) {
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_int clRetainContext(cl_context context)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainContext;
if (func != nullptr) {
return func(context);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseContext(cl_context context)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseContext;
if (func != nullptr) {
return func(context);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int
clGetContextInfo(cl_context context, cl_context_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetContextInfo;
if (func != nullptr) {
return func(context, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
// Program Object APIs
CL_API_ENTRY cl_program clCreateProgramWithSource(
cl_context context, cl_uint count, const char **strings,
const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithSource;
if (func != nullptr) {
return func(context, count, strings, lengths, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_program clCreateProgramWithBinary(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const size_t *lengths, const unsigned char **binaries,
cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithBinary;
if (func != nullptr) {
return func(context, num_devices, device_list, lengths, binaries,
binary_status, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_int
clGetProgramInfo(cl_program program, cl_program_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramInfo;
if (func != nullptr) {
return func(program, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clGetProgramBuildInfo(
cl_program program, cl_device_id device, cl_program_build_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramBuildInfo;
if (func != nullptr) {
return func(program, device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clRetainProgram(cl_program program)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainProgram;
if (func != nullptr) {
return func(program);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseProgram(cl_program program)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseProgram;
if (func != nullptr) {
return func(program);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clBuildProgram(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clBuildProgram;
if (func != nullptr) {
return func(program, num_devices, device_list, options, pfn_notify,
user_data);
} else {
return CL_INVALID_PLATFORM;
}
}
// Kernel Object APIs
CL_API_ENTRY cl_kernel
clCreateKernel(cl_program program, const char *kernel_name,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateKernel;
if (func != nullptr) {
return func(program, kernel_name, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_int clRetainKernel(cl_kernel kernel)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainKernel;
if (func != nullptr) {
return func(kernel);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseKernel(cl_kernel kernel)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseKernel;
if (func != nullptr) {
return func(kernel);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index,
size_t arg_size, const void *arg_value)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clSetKernelArg;
if (func != nullptr) {
return func(kernel, arg_index, arg_size, arg_value);
} else {
return CL_INVALID_PLATFORM;
}
}
// Memory Object APIs
CL_API_ENTRY cl_mem
clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateBuffer;
if (func != nullptr) {
return func(context, flags, size, host_ptr, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_mem clCreateImage(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
const cl_image_desc *image_desc, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage;
if (func != nullptr) {
return func(context, flags, image_format, image_desc, host_ptr,
errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainMemObject;
if (func != nullptr) {
return func(memobj);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseMemObject(cl_mem memobj)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseMemObject;
if (func != nullptr) {
return func(memobj);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetImageInfo;
if (func != nullptr) {
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
// Command Queue APIs
CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties(
cl_context context, cl_device_id device,
const cl_queue_properties *properties,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 {
auto func =
paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueueWithProperties;
if (func != nullptr) {
return func(context, device, properties, errcode_ret);
} else {
// Fix MediaTek MT6771 OpenCL driver breakage
VLOG(3) << "Fallback to clCreateCommandQueue";
if (properties != nullptr && properties[0] == CL_QUEUE_PROPERTIES) {
// When calling through OpenCL-CLHPP, the second element of properties is
// provided by the caller.
#pragma GCC diagnostic push // disable warning both for clang and gcc
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
return clCreateCommandQueue(context, device, properties[1], errcode_ret);
#pragma GCC diagnostic pop
} else {
LOG(FATAL) << "Unknown calling parameters, check the code here";
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
}
CL_API_ENTRY cl_int clRetainCommandQueue(cl_command_queue command_queue)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainCommandQueue;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseCommandQueue(cl_command_queue command_queue)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseCommandQueue;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_INVALID_PLATFORM;
}
}
// Enqueued Commands APIs
CL_API_ENTRY cl_int clEnqueueReadBuffer(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_read, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clEnqueueReadImage(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
const size_t *origin, const size_t *region, size_t row_pitch,
size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadImage;
if (func != nullptr) {
return func(command_queue, image, blocking_read, origin, region, row_pitch,
slice_pitch, ptr, num_events_in_wait_list, event_wait_list,
event);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clEnqueueWriteBuffer(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
size_t offset, size_t size, const void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueWriteBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_write, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY void *clEnqueueMapBuffer(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
cl_map_flags map_flags, size_t offset, size_t size,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_map, map_flags, offset, size,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY void *clEnqueueMapImage(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
cl_map_flags map_flags, const size_t *origin, const size_t *region,
size_t *image_row_pitch, size_t *image_slice_pitch,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapImage;
if (func != nullptr) {
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch, num_events_in_wait_list,
event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
CL_API_ENTRY cl_int clEnqueueUnmapMemObject(
cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueUnmapMemObject;
if (func != nullptr) {
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list,
event_wait_list, event);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clGetKernelWorkGroupInfo(
cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetKernelWorkGroupInfo;
if (func != nullptr) {
return func(kernel, device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clEnqueueNDRangeKernel(
cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueNDRangeKernel;
if (func != nullptr) {
return func(command_queue, kernel, work_dim, global_work_offset,
global_work_size, local_work_size, num_events_in_wait_list,
event_wait_list, event);
} else {
return CL_INVALID_PLATFORM;
}
}
// Event Object APIs
CL_API_ENTRY cl_int clWaitForEvents(
cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clWaitForEvents;
if (func != nullptr) {
return func(num_events, event_list);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clRetainEvent;
if (func != nullptr) {
return func(event);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseEvent;
if (func != nullptr) {
return func(event);
} else {
return CL_INVALID_PLATFORM;
}
}
// Event API
CL_API_ENTRY cl_int clGetEventInfo(cl_event event, cl_event_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventInfo;
if (func != nullptr) {
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
// Profiling APIs
CL_API_ENTRY cl_int clGetEventProfilingInfo(
cl_event event, cl_profiling_info param_name, size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventProfilingInfo;
if (func != nullptr) {
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}
// Flush and Finish APIs
CL_API_ENTRY cl_int clFlush(cl_command_queue command_queue)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clFlush;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_INVALID_PLATFORM;
}
}
CL_API_ENTRY cl_int clFinish(cl_command_queue command_queue)
CL_API_SUFFIX__VERSION_1_0 {
auto func = paddle::lite::OpenCLLibrary::Get()->clFinish;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_INVALID_PLATFORM;
}
}
// Deprecated OpenCL 1.1 APIs
CL_API_ENTRY /* CL_EXT_PREFIX__VERSION_1_1_DEPRECATED */ cl_mem clCreateImage2D(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_row_pitch,
void *host_ptr,
cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED */ {
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage2D;
if (func != nullptr) {
return func(context, flags, image_format, image_width, image_height,
image_row_pitch, host_ptr, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
// APIs deprecated in OpenCL 2.0
CL_API_ENTRY /*CL_EXT_PREFIX__VERSION_1_2_DEPRECATED*/ cl_command_queue
clCreateCommandQueue(cl_context context, cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcode_ret)
/* CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED */ { // NOLINT
auto func = paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueue;
if (func != nullptr) {
return func(context, device, properties, errcode_ret);
} else {
if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
return nullptr;
}
}
......@@ -13,9 +13,10 @@
// limitations under the License.
#pragma once
/*
* ATTENTION this header file can only include in .cc file.
*/
// ATTENTION This can only be included in a .cc file.
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_OP(mul);
USE_LITE_OP(fc);
......
......@@ -25,6 +25,23 @@ function cmake_x86 {
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
}
function cmake_opencl {
# $1: ARM_TARGET_OS, one of "android", "armlinux"
# $2: ARM_TARGET_ARCH_ABI, one of "arm64-v8a", "armeabi-v7a", "armeabi-v7a-hf"
cmake .. \
-DLITE_WITH_OPENCL=ON \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
}
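# Example (an illustration; in main() below the arguments come from $ARM_OS
# and $ARM_ABI): cmake_opencl android arm64-v8a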
# This method is only called in CI.
function cmake_x86_for_CI {
prepare_for_codegen # fake an empty __generated_code__.cc to pass cmake.
......@@ -85,8 +102,8 @@ function build_test_server {
# test_arm_android <some_test_name> <adb_port_number>
function test_arm_android {
test_name=$1
port=$2
local test_name=$1
local port=$2
if [[ "${test_name}x" == "x" ]]; then
echo "test_name can not be empty"
exit 1
......@@ -99,12 +116,18 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api_lite" "test_apis_lite")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
testpath=$(find ./paddle/fluid -name ${test_name})
local testpath=$(find ./paddle/fluid -name ${test_name})
# if [[ "$test_name" == "test_light_api" ]]; then
# local model_path=$(find . -name "lite_naive_model")
# arm_push_necessary_file $port $model_path $adb_work_dir
# fi
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
......@@ -204,6 +227,7 @@ function test_arm {
abi=$2
lang=$3
port=$4
if [[ ${os} == "armlinux" ]]; then
# TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
echo "Skip test arm linux yet. armlinux must in another docker"
......@@ -214,13 +238,14 @@ function test_arm {
echo "android do not need armv7hf"
return 0
fi
# TODO(yuanshuai): enable armv7 on android
if [[ ${abi} == "armv7" ]]; then
echo "skip android v7 test yet"
return 0
fi
echo "test file: ${TESTS_FILE}"
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
......@@ -235,13 +260,21 @@ function prepare_emulator {
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
# start android armv8 and armv7 emulators first
echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -verbose -port ${port_armv8} &
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port ${port_armv8} &
sleep 1m
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port ${port_armv7} &
sleep 1m
}
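# The fixed "sleep 1m" above assumes one minute is always enough for the
# emulators to boot. A more robust sketch (added here for illustration, not
# part of this patch) polls sys.boot_completed instead:
#
#   adb -s emulator-${port_armv8} wait-for-device shell \
#     'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 1; done'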
function arm_push_necessary_file {
local port=$1
local testpath=$2
local adb_work_dir=$3
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
}
# We split the arm unittests into several sub-tasks that run in parallel to reduce the overall CI time.
# sub-task1
......@@ -286,20 +319,22 @@ function build_test_arm_subtask_armlinux {
prepare_emulator $port_armv8 $port_armv7
cur=$PWD
# job 5
build_arm "armlinux" "armv8"
test_arm "armlinux" "armv8"
cd -
build_arm "armlinux" "armv8" "gcc" $port_armv8
test_arm "armlinux" "armv8" "gcc" $port_armv8
cd $cur
# job 6
build_arm "armlinux" "armv7"
test_arm "armlinux" "armv7"
cd -
build_arm "armlinux" "armv7" "gcc" $port_armv8
test_arm "armlinux" "armv7" "gcc" $port_armv8
cd $cur
# job 7
build_arm "armlinux" "armv7hf"
test_arm "armlinux" "armv7hf"
cd -
build_arm "armlinux" "armv7hf" "gcc" $port_armv8
test_arm "armlinux" "armv7hf" "gcc" $port_armv8
cd $cur
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
......@@ -333,6 +368,22 @@ function build_test_arm_subtask_model {
echo "Done"
}
# This test loads a model, optimizes it, and checks the prediction results of both the cxx and light APIs.
function test_arm_predict_apis {
local port=$1
local workspace=$2
local naive_model_path=$3
local api_test_path=$(find . -name "test_apis_lite")
# the model is pushed to ./lite_naive_model
adb -s emulator-${port} push ${naive_model_path} ${workspace}
adb -s emulator-${port} push $api_test_path ${workspace}
# test cxx_api first to store the optimized model.
adb -s emulator-${port} shell "cd ${workspace} && ./test_apis_lite --model_dir ./lite_naive_model --optimized_model ./lite_naive_model_opt"
}
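# A hypothetical invocation, reusing the find-based model lookup from the
# commented-out snippet in test_arm_android (the port and workspace values are
# illustrative assumptions):
#
#   test_arm_predict_apis 5554 /data/local/tmp "$(find . -name lite_naive_model)"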
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
########################################################################
......@@ -404,6 +455,10 @@ function main {
cmake_x86
shift
;;
cmake_opencl)
cmake_opencl $ARM_OS $ARM_ABI
shift
;;
cmake_cuda)
cmake_cuda
shift
......
......@@ -18,11 +18,12 @@
#include <sys/stat.h>  // mkdir(2) on ARM builds
#include <cstdlib>     // system() on non-ARM builds
#include <fstream>
#include <string>
#include "paddle/fluid/lite/utils/cp_logging.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle {
namespace lite {
static bool IsFileExists(const std::string &path) {
static bool IsFileExists(const std::string& path) {
std::ifstream file(path);
bool res = file.is_open();
if (res) {
......@@ -31,5 +32,15 @@ static bool IsFileExists(const std::string &path) {
return res;
}
// ARM mobile does not support mkdir in C++
static void MkDirRecur(const std::string& path) {
#ifndef LITE_WITH_ARM
CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0)
    << "Can't mkdir " << path;
#else // On ARM
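  // NOTE: mkdir(2) creates only the final path component, so this branch is
  // not truly recursive; parent directories must already exist on ARM.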
CHECK_NE(mkdir(path.c_str(), S_IRWXU), -1) << "Can't mkdir " << path;
#endif
}
} // namespace lite
} // namespace paddle
......@@ -74,5 +74,15 @@ static std::string Repr(const std::vector<std::string>& v) {
return "{" + Join(tmp, ",") + "}";
}
static std::vector<std::string> Split(const std::string& s, char delim) {
std::stringstream ss(s);
std::string line;
std::vector<std::string> res;
while (std::getline(ss, line, delim)) {
res.push_back(line);
}
return res;
}
} // namespace lite
} // namespace paddle