From cfa086e9bfc56cebc4fcd84a94651628dfb6a69a Mon Sep 17 00:00:00 2001 From: Yuan Shuai Date: Mon, 18 Nov 2019 17:56:46 +0800 Subject: [PATCH] [LITE][OPENCL] Enable full and light api for OpenCL (#2331) * Fix bug target for kHost and kARM not equal. test=develop * Fix license. test=develop * add debug -g option. test=develop * enable opencl demo. test=develop * Fix model_optimize_tool found no opencl kernel. test=develop * add more vlog. test=develop * remove macro LITE_WITH_OPENCL, LITE_WITH_FPGA in passes. test=develop * Fix valid_places in mobilenetv1_test. test=develop * Fix bug of find no real output of fetch, after tool OPs of optimizer passes. test=develop * Fix vlog as log message in model_optimize_tool. test=develop * fix miscs. test=develop * fix comment. test=develop * Fix misspell of opencl, fpga kernels name in lite/api/CMakeLists.txt. test=develop * add opencl macro in full_api of demo. test=develop --- cmake/lite.cmake | 4 +- lite/CMakeLists.txt | 3 +- lite/api/CMakeLists.txt | 8 +-- lite/api/cxx_api.cc | 23 ++++-- lite/api/mobilenetv1_test.cc | 7 +- lite/api/model_optimize_tool.cc | 11 ++- lite/backends/opencl/cl_wrapper.cc | 2 +- lite/core/mir/static_kernel_pick_pass.h | 11 +-- lite/core/mir/type_layout_cast_pass.cc | 24 ++++--- lite/core/mir/type_target_cast_pass.cc | 34 +++++---- lite/core/mir/variable_place_inference_pass.h | 70 +++++++++++-------- lite/core/optimizer.h | 40 +++++++---- .../cxx/mobile_full/mobilenetv1_full_api.cc | 10 +++ lite/kernels/opencl/CMakeLists.txt | 2 +- lite/kernels/opencl/io_copy_compute.cc | 5 +- lite/model_parser/model_parser.cc | 2 +- 16 files changed, 167 insertions(+), 89 deletions(-) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 98dbc9ab7e..9cf8b12635 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -157,7 +157,9 @@ function(lite_cc_library TARGET) endfunction() function(lite_cc_binary TARGET) - set(options "") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(options " -g ") + endif() 
set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index dc38718c40..60edfc357c 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -195,7 +195,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) endif() endif() - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + if ((ARM_TARGET_OS STREQUAL "android") AND ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) if (NOT LITE_ON_TINY_PUBLISH) # copy @@ -210,6 +210,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index bf930ed0e2..c79927ba10 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -76,8 +76,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) endif() # for light api @@ -96,8 +96,8 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} XPU_DEPS 
${xpu_kernels} - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index a2b538aa77..cbe938cea6 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) { // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } + // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { - auto current_block = program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; - for (size_t i = 0; i < current_block->OpsSize(); i++) { - auto op = current_block->GetOp(i); + if (!program_) { + GenRuntimeProgram(); + } + std::vector feeds; + std::vector fetchs; + const auto &insts = program_->instructions(); + + for (size_t i = 0; i < program_->num_instructions(); i++) { + const auto &op = insts[i].op()->op_info(); if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { fetchs.push_back(op); } } + input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); for (size_t i = 0; i < feeds.size(); i++) { @@ -190,6 +197,7 @@ std::vector Predictor::GetOutputs() const { const cpp::ProgramDesc &Predictor::program_desc() const { return program_desc_; } + const RuntimeProgram &Predictor::runtime_program() const { return *program_; } void Predictor::Build(const lite_api::CxxConfig &config, @@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc, const std::vector &valid_places, const std::vector &passes) { program_desc_ = desc; + // `inner_places` is used to optimize passes std::vector inner_places = valid_places; inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), 
DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Program program(desc, scope_, inner_places); - /// The first place in valid_places is + core::KernelPickFactor factor; factor.ConsiderTarget(); factor.ConsiderPrecision(); factor.ConsiderDataLayout(); + optimizer_.Run(std::move(program), inner_places, factor, passes); exec_scope_ = optimizer_.exec_scope(); PrepareFeedFetch(); @@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); } + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 63a401745b..79f9bea762 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index 1aef522b2a..daa57cd456 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -80,7 +80,16 @@ void Main() { if (target_repr == "arm") { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + 
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { valid_places.emplace_back(TARGET(kX86)); } else { diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 357ac8c2d6..93e176f9ed 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() { do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(ERROR) << "Cannot find the " << #cl_func \ + LOG(FATAL) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ break; \ } \ diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 7187ddcef6..90be0ea54e 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; size_t score{}; + // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || @@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " - << DataLayoutToStr(winner_place.layout) << " " + VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); VLOG(4) << " ===> kernel.place():" << 
PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << winner_place.DebugString(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "kernel place:" << kernel.place().DebugString(); + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 9d63dcbb38..b3b7a858f6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -#ifdef LITE_WITH_OPENCL + // layout kernel choose // must ignore [layout check] for layout of kernels's input and output - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#endif + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of layout_trans + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout())) { + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout()) { 
is_found = true; + } + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); break; } } + CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 7a32777865..b008faa687 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "to:" << to << "\n"; -// kernel choose branch for opencl backend -// judge inst's target whether is kOpenCL -// Note: to == *decl_arg_type == in of inst, not output of last inst -#ifdef LITE_WITH_OPENCL + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst // ignore [layout check] for layout between [to] and [from] // Because all of origin opencl insts in model, are not default layout // NCHW, @@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( // [*decl_arg_type] -> [to]: input of inst, not output of last // [in_arg_ty]: in of io_copy // [out_arg_ty]: out of io_copy - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to)) { - VLOG(4) << "do nothing. 
opencl found"; -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { -#endif + // + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { VLOG(4) << "picked"; is_found = true; + } + + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel io_copy_inst->AsStmt( io_copy_type, std::move(selected_kernels), io_copy_op); break; } + VLOG(4) << "not picked"; } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from << ":" << in->AsArg().name << " -> " << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index fe6ecfd66d..3f5d161a56 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { } } - // Set the tye of the weight - void SetWeightType(Node* w, const LiteType& type) { -// TODO(xg) to optimize this -#ifdef LITE_WITH_FPGA - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifdef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); -#endif -#endif + // Set the type of the weight + void SetWeightType(Node* w, + 
const LiteType& type, + const std::map& lite_with_targets) { + VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); + if (lite_with_targets.at("kFPGA")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else if (lite_with_targets.at("kOpenCL")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); + } } void InferenceArgumentPlace(SSAGraph* graph) { + auto& valid_places = graph->valid_places(); + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); -// The IoCopyOp is a tool operator, it won't support the type inference. -// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for -// tool operator -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; -#endif -#endif + // The IoCopyOp is a tool operator, it won't support the type inference. 
+ // in fpga, we has io_copy+cali+layout tool ops, so we need type inference + // for + // tool operator + if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { + VLOG(3) << "inst.op_type() == 'io_copy', continue"; + if (inst.op_type() == "io_copy") continue; + } // deal with inputs VLOG(4) << "Infering op " << inst.op_info()->Repr(); // TODO(zhaolong): Add check if the node's name in op's arguments. @@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_in->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); + SetWeightType(x_in, *type, lite_with_targets); } else { x_in->AsArg().type = type; } @@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_out->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); + SetWeightType(x_out, *type, lite_with_targets); } else { x_out->AsArg().type = type; } diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 739615e276..a50ff3e611 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -49,6 +50,22 @@ class Optimizer { valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!graph_) << "duplicate optimize found"; + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kNPU", valid_places_has_target(TARGET(kNPU))}, + {"kXPU", valid_places_has_target(TARGET(kXPU))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"]; + VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"]; + graph_.reset(new mir::SSAGraph); graph_->Build(program, valid_places); graph_->SetValidPlaces(valid_places); @@ -57,14 +74,11 @@ class Optimizer { InitTargetTypeTransformPass(); if (passes.empty()) { - RunPasses(std::vector{ + std::vector passes_local{ {"lite_quant_dequant_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. 
"lite_conv_activation_fuse_pass", // @@ -105,16 +119,17 @@ class Optimizer { "argument_type_display_pass", // "variable_place_inference_pass", // - "argument_type_display_pass", // + "argument_type_display_pass", "runtime_context_assign_pass", - "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ - !defined(LITE_WITH_XPU) - // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel - "memory_optimize_pass", -#endif - "argument_type_display_pass"}}); + "argument_type_display_pass"}}; + if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) && + (!lite_with_targets["kXPU"])) { + // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL + // kernel + passes_local.emplace_back("memory_optimize_pass"); + } + RunPasses(passes_local); } else { RunPasses(passes); } @@ -141,6 +156,7 @@ class Optimizer { .LookUp( "generate_npu_program_pass"); #endif + #ifdef LITE_WITH_XPU auto pass = mir::PassManager::Global() .LookUp( diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 5ac041b2cc..aa084d1fef 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } +// 0. Enable OpenCL, if needed +// Enable `DEMO_WITH_OPENCL` macro below, if user need use gpu(opencl) +// #define DEMO_WITH_OPENCL void RunModel() { // 1. 
Set CxxConfig CxxConfig config; config.set_model_dir(FLAGS_model_dir); +#ifdef DEMO_WITH_OPENCL + std::vector valid_places{ + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + Place{TARGET(kARM), PRECISION(kFloat)}}; +#else std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; +#endif if (FLAGS_prefer_int8_kernel) { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d070eb84c5..99b23c19f0 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_OPENCL) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) return () endif() diff --git a/lite/kernels/opencl/io_copy_compute.cc b/lite/kernels/opencl/io_copy_compute.cc index dc4bdfe64c..3387a0887d 100644 --- a/lite/kernels/opencl/io_copy_compute.cc +++ b/lite/kernels/opencl/io_copy_compute.cc @@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto* x_ptr = param.x->data(); - /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` - in kernel and enable wait_list + /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to + `cl_wait_list` + in kernel and `wait_list` enabled auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl tensor. 
---"; diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 13b6cb5b77..ed3f45c598 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir, SaveParamNaive(path, exec_scope, var.Name()); } } - VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; } #endif -- GitLab