[LITE][OPENCL] Enable full and light api for OpenCL (#2331)

* Fix bug target for kHost and kARM not equal. test=develop * Fix license. test=develop * add debug -g option. test=develop * enable opencl demo. test=develop * Fix model_optimize_tool found no opencl kernel. test=develop * add more vlog. test=develop * remove macro LITE_WITH_OPENCL, LITE_WITH_FPGA in passes. test=develop * Fix valid_places in mobilenetv1_test. test=develop * Fix bug of find no real output of fetch, after tool OPs of optimizer passes. test=develop * Fix vlog as log message in model_optimize_tool. test=develop * fix miscs. test=develop * fix comment. test=develop * Fix misspell of opencl, fpga kernels name in lite/api/CMakeLists.txt. test=develop * add opencl macro in full_api of demo. test=develop

[LITE][OPENCL] Enable full and light api for OpenCL (#2331)
* Fix bug target for kHost and kARM not equal. test=develop * Fix license. test=develop * add debug -g option. test=develop * enable opencl demo. test=develop * Fix model_optimize_tool found no opencl kernel. test=develop * add more vlog. test=develop * remove macro LITE_WITH_OPENCL, LITE_WITH_FPGA in passes. test=develop * Fix valid_places in mobilenetv1_test. test=develop * Fix bug of find no real output of fetch, after tool OPs of optimizer passes. test=develop * Fix vlog as log message in model_optimize_tool. test=develop * fix miscs. test=develop * fix comment. test=develop * Fix misspell of opencl, fpga kernels name in lite/api/CMakeLists.txt. test=develop * add opencl macro in full_api of demo. test=develop
d242bdfb · Yuan Shuai · GitHub · 81852863 · d242bdfb · d242bdfb
16 changed files
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -157,7 +157,9 @@ function(lite_cc_library TARGET)
 endfunction()
 function(lite_cc_binary TARGET)
-    set(options "")
+    if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+        set(options " -g ")
+    endif()
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)

--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -195,7 +195,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
        endif()
    endif()
-    if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
+    if ((ARM_TARGET_OS STREQUAL "android") AND
            ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
        if (NOT LITE_ON_TINY_PUBLISH)
            # copy
@@ -210,6 +210,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
                COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
            )
            add_dependencies(publish_inference_android_cxx_demos logging gflags)
            add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -76,8 +76,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
                    ARM_DEPS ${arm_kernels}
                    NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
                    XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
-                    CL_DEPS ${opencl_kenrels}
+                    CL_DEPS ${opencl_kernels}
-                    FPGA_DEPS ${fpga_kenrels})
+                    FPGA_DEPS ${fpga_kernels})
 endif()
 # for light api
@@ -96,8 +96,8 @@ lite_cc_library(light_api SRCS light_api.cc
        ARM_DEPS ${arm_kernels}
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kenrels}
+        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kenrels})
+        FPGA_DEPS ${fpga_kernels})
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING

--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
 // get outputnames
 std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
-  auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
+  if (!program_) {
-  std::vector<cpp::OpDesc *> feeds;
+    GenRuntimeProgram();
-  std::vector<cpp::OpDesc *> fetchs;
+  }
-  for (size_t i = 0; i < current_block->OpsSize(); i++) {
+  std::vector<const cpp::OpDesc *> feeds;
-    auto op = current_block->GetOp<cpp::OpDesc>(i);
+  std::vector<const cpp::OpDesc *> fetchs;
+  const auto &insts = program_->instructions();
+  for (size_t i = 0; i < program_->num_instructions(); i++) {
+    const auto &op = insts[i].op()->op_info();
    if (op->Type() == "feed") {
      feeds.push_back(op);
    } else if (op->Type() == "fetch") {
      fetchs.push_back(op);
    }
  }
  input_names_.resize(feeds.size());
  output_names_.resize(fetchs.size());
  for (size_t i = 0; i < feeds.size(); i++) {
@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
 const cpp::ProgramDesc &Predictor::program_desc() const {
  return program_desc_;
 }
 const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
 void Predictor::Build(const lite_api::CxxConfig &config,
@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
                      const std::vector<Place> &valid_places,
                      const std::vector<std::string> &passes) {
  program_desc_ = desc;
+  // `inner_places` is used to optimize passes
  std::vector<Place> inner_places = valid_places;
  inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
  inner_places.emplace_back(
      TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
  Program program(desc, scope_, inner_places);
-  /// The first place in valid_places is
  core::KernelPickFactor factor;
  factor.ConsiderTarget();
  factor.ConsiderPrecision();
  factor.ConsiderDataLayout();
  optimizer_.Run(std::move(program), inner_places, factor, passes);
  exec_scope_ = optimizer_.exec_scope();
  PrepareFeedFetch();
@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
  auto *var = exec_scope_->FindVar(name);
  return &var->Get<lite::Tensor>();
 }
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
  auto element = std::find(input_names_.begin(), input_names_.end(), name);

--- a/lite/api/mobilenetv1_test.cc
+++ b/lite/api/mobilenetv1_test.cc
@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV1, test_opencl) {
  std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
  });
  TestModel(valid_places);

--- a/lite/api/model_optimize_tool.cc
+++ b/lite/api/model_optimize_tool.cc
@@ -80,7 +80,16 @@ void Main() {
    if (target_repr == "arm") {
      valid_places.emplace_back(TARGET(kARM));
    } else if (target_repr == "opencl") {
-      valid_places.emplace_back(TARGET(kOpenCL));
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
    } else if (target_repr == "x86") {
      valid_places.emplace_back(TARGET(kX86));
    } else {

--- a/lite/backends/opencl/cl_wrapper.cc
+++ b/lite/backends/opencl/cl_wrapper.cc
@@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() {
  do {                                                               \
    cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func);            \
    if (cl_func##_ == nullptr) {                                     \
-      LOG(ERROR) << "Cannot find the " << #cl_func                   \
+      LOG(FATAL) << "Cannot find the " << #cl_func                   \
                 << " symbol in libOpenCL.so!";                      \
      break;                                                         \
    }                                                                \

--- a/lite/core/mir/static_kernel_pick_pass.h
+++ b/lite/core/mir/static_kernel_pick_pass.h
@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass {
      const auto& place = places[i];
      float weight = static_cast<float>(place_size - i) / place_size;
      size_t score{};
      // The more important factor comes first
      if (kernel_pick_factors_.IsTargetConsidered() &&
          (place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass {
    VLOG(4) << "[score(final)]:" << final_score;
    VLOG(4) << "-------- pick summary --------";
-    VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " "
+    VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
-            << DataLayoutToStr(winner_place.layout) << " "
+            << " " << DataLayoutToStr(winner_place.layout) << " "
            << TargetToStr(winner_place.target);
    VLOG(4) << " ===> kernel.place():"
            << PrecisionToStr(kernel.place().precision) << " "
            << DataLayoutToStr(kernel.place().layout) << " "
            << TargetToStr(kernel.place().target);
    VLOG(4) << "kernel.op_type():" << kernel.op_type();
-    VLOG(4) << "picker tactic " << kernel_pick_factors_;
+    VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
-    VLOG(4) << "kernel place " << kernel.place().DebugString();
+    VLOG(4) << "kernel place:" << kernel.place().DebugString();
-    VLOG(4) << "picker place " << winner_place.DebugString();
+    VLOG(4) << "winner_picker place:" << winner_place.DebugString();
    VLOG(4) << "------------------------------";
    // The data layout is not considered, for the input and output arguments

--- a/lite/core/mir/type_layout_cast_pass.cc
+++ b/lite/core/mir/type_layout_cast_pass.cc
@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst(
  for (auto& kernel : kernels) {
    const Type* in_arg_ty = kernel->GetInputDeclType("Input");
    const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
-#ifdef LITE_WITH_OPENCL
    // layout kernel choose
    //   must ignore [layout check] for layout of kernels's input and output
-    if (TargetCompatibleTo(*in_arg_ty, from) &&
+    // note: replace LITE_WITH_OPENCL macro with judge input and output target
-        PrecisionCompatibleTo(*in_arg_ty, from) &&
+    // of layout_trans
-        DeviceCompatibleTo(*in_arg_ty, from) &&
+    if ((in_arg_ty->target() == TARGET(kOpenCL) ||
-        out_arg_ty->layout() == to.layout()) {
+         out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
-#else
+        (TargetCompatibleTo(*in_arg_ty, from) &&
-    if (TypeCompatible(*in_arg_ty, from) &&
+         PrecisionCompatibleTo(*in_arg_ty, from) &&
-        out_arg_ty->layout() == to.layout()) {
+         DeviceCompatibleTo(*in_arg_ty, from) &&
-#endif
+         out_arg_ty->layout() == to.layout())) {
+      is_found = true;
+    } else if (TypeCompatible(*in_arg_ty, from) &&
+               out_arg_ty->layout() == to.layout()) {
      is_found = true;
+    }
+    if (is_found) {
      selected_kernels.emplace_back(std::move(kernel));
      // we pick the kernel
      layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op);
      break;
    }
  }
  CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
                  << in->AsArg().name << "->" << to << ":"
                  << inst_node->AsStmt().op_info()->Type();

--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst(
    VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
    VLOG(4) << "to:" << to << "\n";
-// kernel choose branch for opencl backend
+    // kernel choose branch for opencl backend
-//   judge inst's target whether is kOpenCL
+    //   judge inst's target whether is kOpenCL
-//   Note: to == *decl_arg_type == in of inst, not output of last inst
+    //   Note: to == *decl_arg_type == in of inst, not output of last inst
-#ifdef LITE_WITH_OPENCL
    // ignore [layout check] for layout between [to] and [from]
    //   Because all of origin opencl insts in model, are not default layout
    //   NCHW,
@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst(
    //     [*decl_arg_type] -> [to]: input of inst, not output of last
    //     [in_arg_ty]: in of io_copy
    //     [out_arg_ty]: out of io_copy
-    if (TargetCompatibleTo(*in_arg_ty, from) &&
+    //
-        PrecisionCompatibleTo(*in_arg_ty, from) &&
+    // note: replace LITE_WITH_OPENCL macro with judge input and output target
-        DeviceCompatibleTo(*in_arg_ty, from) &&
+    // of io_copy
-        TargetCompatibleTo(*out_arg_ty, to)) {
+    if ((in_arg_ty->target() == TARGET(kOpenCL) ||
-      VLOG(4) << "do nothing. opencl found";
+         out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
-#else
+        (TargetCompatibleTo(*in_arg_ty, from) &&
-    if (TypeCompatible(*in_arg_ty, from) &&
+         PrecisionCompatibleTo(*in_arg_ty, from) &&
-        out_arg_ty->target() == to.target()) {
+         DeviceCompatibleTo(*in_arg_ty, from) &&
-#endif
+         TargetCompatibleTo(*out_arg_ty, to))) {
+      VLOG(4) << "picked, opencl found";
+      is_found = true;
+    } else if (TypeCompatible(*in_arg_ty, from) &&
+               out_arg_ty->target() == to.target()) {
      VLOG(4) << "picked";
      is_found = true;
+    }
+    if (is_found) {
      selected_kernels.emplace_back(std::move(kernel));
      // we pick the kernel
      io_copy_inst->AsStmt(
          io_copy_type, std::move(selected_kernels), io_copy_op);
      break;
    }
    VLOG(4) << "not picked";
  }
  CHECK(is_found) << "Can't find a io_copy  kernel for io_copy op: " << from
                  << ":" << in->AsArg().name << " -> " << to << ":"
                  << inst_node->AsStmt().op_info()->Type();

--- a/lite/core/mir/variable_place_inference_pass.h
+++ b/lite/core/mir/variable_place_inference_pass.h
@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass {
    }
  }
-  // Set the tye of the weight
+  // Set the type of the weight
-  void SetWeightType(Node* w, const LiteType& type) {
+  void SetWeightType(Node* w,
-// TODO(xg) to optimize this
+                     const LiteType& type,
-#ifdef LITE_WITH_FPGA
+                     const std::map<std::string, bool>& lite_with_targets) {
-    w->AsArg().type = LiteType::GetTensorTy(
+    VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    if (lite_with_targets.at("kFPGA")) {
-#endif
+      w->AsArg().type = LiteType::GetTensorTy(
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#ifdef LITE_WITH_OPENCL
+    } else if (lite_with_targets.at("kOpenCL")) {
-    w->AsArg().type = LiteType::GetTensorTy(
+      w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#endif
+    } else {
+      w->AsArg().type = LiteType::GetTensorTy(
-#ifndef LITE_WITH_FPGA
+          TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
-#ifndef LITE_WITH_OPENCL
+    }
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
-#endif
-#endif
  }
  void InferenceArgumentPlace(SSAGraph* graph) {
+    auto& valid_places = graph->valid_places();
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
    VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
    for (auto& x : graph->StmtTopologicalOrder()) {
      auto& inst = x->AsStmt();
-// The IoCopyOp is a tool operator, it won't support the type inference.
+      // The IoCopyOp is a tool operator, it won't support the type inference.
-// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for
+      // in fpga, we have io_copy+cali+layout tool ops, so we need type inference
-// tool operator
+      // for
-#ifndef LITE_WITH_FPGA
+      // tool operator
-#ifndef LITE_WITH_OPENCL
+      if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
-      VLOG(3) << "inst.op_type() == 'io_copy', continue";
+        VLOG(3) << "inst.op_type() == 'io_copy', continue";
-      if (inst.op_type() == "io_copy") continue;
+        if (inst.op_type() == "io_copy") continue;
-#endif
+      }
-#endif
      // deal with inputs
      VLOG(4) << "Infering op " << inst.op_info()->Repr();
      // TODO(zhaolong): Add check if the node's name in op's arguments.
@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass {
        if (!x_in->AsArg().type) {
          VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
          if (x_in->AsArg().is_weight) {
-            SetWeightType(x_in, *type);
+            SetWeightType(x_in, *type, lite_with_targets);
          } else {
            x_in->AsArg().type = type;
          }
@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass {
        if (!x_out->AsArg().type) {
          VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
          if (x_out->AsArg().is_weight) {
-            SetWeightType(x_out, *type);
+            SetWeightType(x_out, *type, lite_with_targets);
          } else {
            x_out->AsArg().type = type;
          }

--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -49,6 +50,22 @@ class Optimizer {
    valid_places_ = valid_places;
    CHECK(!valid_places.empty()) << "At least one valid_place should be set";
    CHECK(!graph_) << "duplicate optimize found";
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kNPU", valid_places_has_target(TARGET(kNPU))},
+        {"kXPU", valid_places_has_target(TARGET(kXPU))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
+    VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];
    graph_.reset(new mir::SSAGraph);
    graph_->Build(program, valid_places);
    graph_->SetValidPlaces(valid_places);
@@ -57,14 +74,11 @@ class Optimizer {
    InitTargetTypeTransformPass();
    if (passes.empty()) {
-      RunPasses(std::vector<std::string>{
+      std::vector<std::string> passes_local{
          {"lite_quant_dequant_fuse_pass",     //
           "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
           "lite_conv_bn_fuse_pass",           //
           "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
-           // This pass is disabled to force some opencl kernels selected for
-           // final running, otherwise, they will be fused to ARM fusion
-           // kernels, and the OpenCL devices will be discarded.
           // TODO(Superjomn) Refine the fusion related design to select fusion
           // kernels for devices automatically.
           "lite_conv_activation_fuse_pass",              //
@@ -105,16 +119,17 @@ class Optimizer {
           "argument_type_display_pass",  //
           "variable_place_inference_pass",  //
-           "argument_type_display_pass",     //
+           "argument_type_display_pass",
           "runtime_context_assign_pass",
-           "argument_type_display_pass",  //
+           "argument_type_display_pass"}};
-#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \
+      if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
-    !defined(LITE_WITH_XPU)
+          (!lite_with_targets["kXPU"])) {
-           // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel
+        // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
-           "memory_optimize_pass",
+        // kernel
-#endif
+        passes_local.emplace_back("memory_optimize_pass");
-           "argument_type_display_pass"}});
+      }
+      RunPasses(passes_local);
    } else {
      RunPasses(passes);
    }
@@ -141,6 +156,7 @@ class Optimizer {
                      .LookUp<mir::subgraph::GenerateNPUProgramPass>(
                          "generate_npu_program_pass");
 #endif
 #ifdef LITE_WITH_XPU
      auto pass = mir::PassManager::Global()
                      .LookUp<mir::subgraph::GenerateXPUProgramPass>(

--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) {
  return res;
 }
+// 0. Enable OpenCL, if needed
+// Enable the `DEMO_WITH_OPENCL` macro below if the user needs to use the GPU (OpenCL)
+// #define DEMO_WITH_OPENCL
 void RunModel() {
  // 1. Set CxxConfig
  CxxConfig config;
  config.set_model_dir(FLAGS_model_dir);
+#ifdef DEMO_WITH_OPENCL
+  std::vector<Place> valid_places{
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      Place{TARGET(kARM), PRECISION(kFloat)}};
+#else
  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+#endif
  if (FLAGS_prefer_int8_kernel) {
    valid_places.insert(valid_places.begin(),
                        Place{TARGET(kARM), PRECISION(kInt8)});

--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
-if (NOT LITE_WITH_OPENCL)
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
    return ()
 endif()

--- a/lite/kernels/opencl/io_copy_compute.cc
+++ b/lite/kernels/opencl/io_copy_compute.cc
@@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute
    auto* wait_list = context.cl_wait_list();
    auto* x_ptr = param.x->data<float, cl::Buffer>();
-    /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
+    /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
-    in kernel and enable wait_list
+    `cl_wait_list`
+    in kernel and `wait_list` enabled
    auto it = wait_list->find(x_ptr);
    if (it != wait_list->end()) {
      VLOG(4) << "--- Find the sync event for the target cl tensor. ---";

--- a/lite/model_parser/model_parser.cc
+++ b/lite/model_parser/model_parser.cc
@@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir,
      SaveParamNaive(path, exec_scope, var.Name());
    }
  }
-  VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully";
+  LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully";
 }
 #endif