Unverified Commit d242bdfb authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Enable full and light api for OpenCL (#2331)

* Fix bug where target kHost and kARM compared as not equal. test=develop

* Fix license. test=develop

* Add debug -g option. test=develop

* Enable OpenCL demo. test=develop

* Fix model_optimize_tool finding no OpenCL kernels. test=develop

* Add more VLOG output. test=develop

* Remove macros LITE_WITH_OPENCL and LITE_WITH_FPGA in passes. test=develop

* Fix valid_places in mobilenetv1_test. test=develop

* Fix bug of finding no real output of fetch after tool OPs inserted by optimizer passes. test=develop

* Change VLOG to LOG messages in model_optimize_tool. test=develop

* Fix misc issues. test=develop

* Fix comments. test=develop

* Fix misspelling of opencl and fpga kernel names in lite/api/CMakeLists.txt. test=develop

* Add OpenCL macro in full_api demo. test=develop
Parent 81852863
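Taken together, the changes below let a single binary pick OpenCL kernels when the valid_places list asks for them and fall back to ARM CPU kernels otherwise. A minimal sketch of full-API usage after this commit, mirroring the Place lists the diff adds (the header name and exact place set are illustrative, not prescribed by the commit):

```cpp
#include <string>
#include "paddle_api.h"  // published to demo/cxx/include by the CMake change below

using namespace paddle::lite_api;  // NOLINT

void BuildOpenCLPredictor(const std::string& model_dir) {
  CxxConfig config;
  config.set_model_dir(model_dir);
  // OpenCL places first (FP16/FP32 x NCHW/NHWC), ARM CPU last as fallback,
  // in the same order the diff uses in model_optimize_tool and the tests.
  config.set_valid_places({
      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
      Place{TARGET(kARM)},  // enable kARM CPU kernel when no opencl kernel
  });
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
}
```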
@@ -157,7 +157,9 @@ function(lite_cc_library TARGET)
 endfunction()
 
 function(lite_cc_binary TARGET)
-  set(options "")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(options " -g ")
+  endif()
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
...
@@ -195,7 +195,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     endif()
   endif()
 
-  if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
+  if ((ARM_TARGET_OS STREQUAL "android") AND
       ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
     if (NOT LITE_ON_TINY_PUBLISH)
       # copy
@@ -210,6 +210,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
       COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
       COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
       COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+      COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
       )
   add_dependencies(publish_inference_android_cxx_demos logging gflags)
   add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
...
@@ -76,8 +76,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
       ARM_DEPS ${arm_kernels}
       NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
       XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
-      CL_DEPS ${opencl_kenrels}
-      FPGA_DEPS ${fpga_kenrels})
+      CL_DEPS ${opencl_kernels}
+      FPGA_DEPS ${fpga_kernels})
 endif()
 
 # for light api
@@ -96,8 +96,8 @@ lite_cc_library(light_api SRCS light_api.cc
       ARM_DEPS ${arm_kernels}
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
-      CL_DEPS ${opencl_kenrels}
-      FPGA_DEPS ${fpga_kenrels})
+      CL_DEPS ${opencl_kernels}
+      FPGA_DEPS ${fpga_kernels})
 
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
...
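Why a mere misspelling broke kernel discovery: CMake silently expands an undefined variable to an empty string, so `${opencl_kenrels}` contributed no dependencies and the OpenCL kernels were never linked into the full/light API libraries — which is what the "model_optimize_tool found no opencl kernel" bullet in the commit message refers to. A minimal standalone demonstration (hypothetical variable contents):

```cmake
set(opencl_kernels "a.cc;b.cc")
message(STATUS "right: [${opencl_kernels}]")  # right: [a.cc;b.cc]
message(STATUS "typo:  [${opencl_kenrels}]")  # typo:  []  (undefined expands to empty)
```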
@@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
+
 // get outputnames
 std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
+
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
-  auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  std::vector<cpp::OpDesc *> feeds;
-  std::vector<cpp::OpDesc *> fetchs;
-  for (size_t i = 0; i < current_block->OpsSize(); i++) {
-    auto op = current_block->GetOp<cpp::OpDesc>(i);
+  if (!program_) {
+    GenRuntimeProgram();
+  }
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
+  const auto &insts = program_->instructions();
+  for (size_t i = 0; i < program_->num_instructions(); i++) {
+    const auto &op = insts[i].op()->op_info();
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
       fetchs.push_back(op);
     }
   }
+
   input_names_.resize(feeds.size());
   output_names_.resize(fetchs.size());
   for (size_t i = 0; i < feeds.size(); i++) {
@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
 }
+
 const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
 
 void Predictor::Build(const lite_api::CxxConfig &config,
@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
                       const std::vector<Place> &valid_places,
                       const std::vector<std::string> &passes) {
   program_desc_ = desc;
+  // `inner_places` is used to optimize passes
   std::vector<Place> inner_places = valid_places;
   inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
   inner_places.emplace_back(
       TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
   Program program(desc, scope_, inner_places);
 
-  /// The first place in valid_places is
   core::KernelPickFactor factor;
   factor.ConsiderTarget();
   factor.ConsiderPrecision();
   factor.ConsiderDataLayout();
+
   optimizer_.Run(std::move(program), inner_places, factor, passes);
   exec_scope_ = optimizer_.exec_scope();
   PrepareFeedFetch();
@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
   auto *var = exec_scope_->FindVar(name);
   return &var->Get<lite::Tensor>();
 }
+
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
   auto element = std::find(input_names_.begin(), input_names_.end(), name);
...
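With PrepareFeedFetch now reading feed/fetch ops from the optimized RuntimeProgram rather than block 0 of the raw ProgramDesc, the reported names match what actually runs after passes insert io_copy/layout tool ops. A hedged sketch of the intended call pattern (the shape and fill values are illustrative only):

```cpp
// Assumes a Predictor already built via Build(...) as in this file.
std::vector<std::string> in_names = predictor.GetInputNames();
for (const auto& name : in_names) {
  lite::Tensor* t = predictor.GetInputByName(name);
  t->Resize(lite::DDim(std::vector<int64_t>({1, 3, 224, 224})));  // example shape
  float* data = t->mutable_data<float>();
  for (int64_t i = 0; i < t->dims().production(); ++i) data[i] = 1.f;
}
predictor.Run();
for (const auto& name : predictor.GetOutputNames()) {
  const lite::Tensor* out = predictor.GetTensor(name);
  // consume out->data<float>() ...
}
```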
@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV1, test_opencl) {
   std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
   });
 
   TestModel(valid_places);
...
@@ -80,7 +80,16 @@ void Main() {
     if (target_repr == "arm") {
       valid_places.emplace_back(TARGET(kARM));
     } else if (target_repr == "opencl") {
-      valid_places.emplace_back(TARGET(kOpenCL));
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
     } else if (target_repr == "x86") {
       valid_places.emplace_back(TARGET(kX86));
     } else {
...
@@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() {
   do {                                                          \
     cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func);       \
     if (cl_func##_ == nullptr) {                                \
-      LOG(ERROR) << "Cannot find the " << #cl_func              \
+      LOG(FATAL) << "Cannot find the " << #cl_func              \
                  << " symbol in libOpenCL.so!";                 \
      break;                                                     \
    }                                                            \
...
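For one concrete symbol, the macro in this hunk expands roughly as below (clGetPlatformIDs is an illustrative choice; the member and typedef names follow the ## token-pasting pattern visible above). The ERROR-to-FATAL change makes a missing symbol abort at load time instead of leaving a null function pointer to crash later:

```cpp
// Approximate expansion of the wrapper macro for cl_func = clGetPlatformIDs:
clGetPlatformIDs_ = (clGetPlatformIDsType)dlsym(handle_, "clGetPlatformIDs");
if (clGetPlatformIDs_ == nullptr) {
  LOG(FATAL) << "Cannot find the clGetPlatformIDs symbol in libOpenCL.so!";
}
```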
@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass {
       const auto& place = places[i];
       float weight = static_cast<float>(place_size - i) / place_size;
       size_t score{};
+      // The more important factor comes first
       if (kernel_pick_factors_.IsTargetConsidered() &&
           (place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass {
     VLOG(4) << "[score(final)]:" << final_score;
     VLOG(4) << "-------- pick summary --------";
-    VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " "
-            << DataLayoutToStr(winner_place.layout) << " "
+    VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
+            << " " << DataLayoutToStr(winner_place.layout) << " "
             << TargetToStr(winner_place.target);
     VLOG(4) << " ===> kernel.place():"
             << PrecisionToStr(kernel.place().precision) << " "
             << DataLayoutToStr(kernel.place().layout) << " "
             << TargetToStr(kernel.place().target);
     VLOG(4) << "kernel.op_type():" << kernel.op_type();
-    VLOG(4) << "picker tactic " << kernel_pick_factors_;
-    VLOG(4) << "kernel place " << kernel.place().DebugString();
-    VLOG(4) << "picker place " << winner_place.DebugString();
+    VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
+    VLOG(4) << "kernel place:" << kernel.place().DebugString();
+    VLOG(4) << "winner_picker place:" << winner_place.DebugString();
     VLOG(4) << "------------------------------";
 
     // The data layout is not considered, for the input and output arguments
...
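The weight factor above is what makes the order of valid_places significant. A worked example for the five-place OpenCL-first list this commit introduces:

```cpp
// weight = (place_size - i) / place_size, with place_size = 5:
//   i = 0: Place{kOpenCL, kFP16,  kNCHW} -> weight = 5/5 = 1.0  (most preferred)
//   i = 1: Place{kOpenCL, kFP16,  kNHWC} -> weight = 4/5 = 0.8
//   i = 2: Place{kOpenCL, kFloat, kNCHW} -> weight = 3/5 = 0.6
//   i = 3: Place{kOpenCL, kFloat, kNHWC} -> weight = 2/5 = 0.4
//   i = 4: Place{kARM} (fallback)        -> weight = 1/5 = 0.2
// A kernel's factor score is scaled by the weight of the place it matches,
// so an OpenCL kernel matching an early place outranks an ARM kernel that
// only matches the last place.
```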
@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst(
   for (auto& kernel : kernels) {
     const Type* in_arg_ty = kernel->GetInputDeclType("Input");
     const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
-#ifdef LITE_WITH_OPENCL
     // layout kernel choose
     // must ignore [layout check] for layout of kernels's input and output
-    if (TargetCompatibleTo(*in_arg_ty, from) &&
-        PrecisionCompatibleTo(*in_arg_ty, from) &&
-        DeviceCompatibleTo(*in_arg_ty, from) &&
-        out_arg_ty->layout() == to.layout()) {
-#else
-    if (TypeCompatible(*in_arg_ty, from) &&
-        out_arg_ty->layout() == to.layout()) {
-#endif
+    // note: replace LITE_WITH_OPENCL macro with judge input and output target
+    // of layout_trans
+    if ((in_arg_ty->target() == TARGET(kOpenCL) ||
+         out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
+        (TargetCompatibleTo(*in_arg_ty, from) &&
+         PrecisionCompatibleTo(*in_arg_ty, from) &&
+         DeviceCompatibleTo(*in_arg_ty, from) &&
+         out_arg_ty->layout() == to.layout())) {
+      is_found = true;
+    } else if (TypeCompatible(*in_arg_ty, from) &&
+               out_arg_ty->layout() == to.layout()) {
       is_found = true;
+    }
+    if (is_found) {
       selected_kernels.emplace_back(std::move(kernel));
       // we pick the kernel
       layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op);
       break;
     }
   }
   CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
                   << in->AsArg().name << "->" << to << ":"
                   << inst_node->AsStmt().op_info()->Type();
...
@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst(
       VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
       VLOG(4) << "to:" << to << "\n";
 
       // kernel choose branch for opencl backend
       // judge inst's target whether is kOpenCL
       // Note: to == *decl_arg_type == in of inst, not output of last inst
-#ifdef LITE_WITH_OPENCL
       // ignore [layout check] for layout between [to] and [from]
       // Because all of origin opencl insts in model, are not default layout
       // NCHW,
@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst(
       // [*decl_arg_type] -> [to]: input of inst, not output of last
       // [in_arg_ty]: in of io_copy
       // [out_arg_ty]: out of io_copy
-      if (TargetCompatibleTo(*in_arg_ty, from) &&
-          PrecisionCompatibleTo(*in_arg_ty, from) &&
-          DeviceCompatibleTo(*in_arg_ty, from) &&
-          TargetCompatibleTo(*out_arg_ty, to)) {
-        VLOG(4) << "do nothing. opencl found";
-#else
-      if (TypeCompatible(*in_arg_ty, from) &&
-          out_arg_ty->target() == to.target()) {
-#endif
+      //
+      // note: replace LITE_WITH_OPENCL macro with judge input and output target
+      // of io_copy
+      if ((in_arg_ty->target() == TARGET(kOpenCL) ||
+           out_arg_ty->target() == TARGET(kOpenCL)) &&  // judge OpenCL first
+          (TargetCompatibleTo(*in_arg_ty, from) &&
+           PrecisionCompatibleTo(*in_arg_ty, from) &&
+           DeviceCompatibleTo(*in_arg_ty, from) &&
+           TargetCompatibleTo(*out_arg_ty, to))) {
+        VLOG(4) << "picked, opencl found";
+        is_found = true;
+      } else if (TypeCompatible(*in_arg_ty, from) &&
+                 out_arg_ty->target() == to.target()) {
        VLOG(4) << "picked";
        is_found = true;
+      }
+      if (is_found) {
        selected_kernels.emplace_back(std::move(kernel));
        // we pick the kernel
        io_copy_inst->AsStmt(
            io_copy_type, std::move(selected_kernels), io_copy_op);
        break;
       }
       VLOG(4) << "not picked";
     }
     CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
                     << ":" << in->AsArg().name << " -> " << to << ":"
                     << inst_node->AsStmt().op_info()->Type();
...
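This hunk and the layout-pass hunk above are the same refactor the commit message calls "remove macro LITE_WITH_OPENCL, LITE_WITH_FPGA in passes": a compile-time #ifdef becomes a run-time check on the kernel's declared targets. Schematically (a sketch of the pattern, not additional code from the commit):

```cpp
// Before: the branch is fixed at build time, so a LITE_WITH_OPENCL build
// applies the relaxed matching even to graphs with no OpenCL kernels.
#ifdef LITE_WITH_OPENCL
  /* relaxed matching: skip the layout/target check */
#else
  /* strict matching: TypeCompatible(...) */
#endif

// After: the branch is chosen per candidate kernel from its declared
// input/output targets, so one binary handles both kinds of graphs.
if (in_arg_ty->target() == TARGET(kOpenCL) ||
    out_arg_ty->target() == TARGET(kOpenCL)) {
  /* relaxed matching */
} else {
  /* strict matching */
}
```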
@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass {
       }
     }
   }
 
-  // Set the tye of the weight
-  void SetWeightType(Node* w, const LiteType& type) {
-    // TODO(xg) to optimize this
-#ifdef LITE_WITH_FPGA
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#endif
-
-#ifdef LITE_WITH_OPENCL
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#endif
-
-#ifndef LITE_WITH_FPGA
-#ifndef LITE_WITH_OPENCL
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
-#endif
-#endif
+  // Set the type of the weight
+  void SetWeightType(Node* w,
+                     const LiteType& type,
+                     const std::map<std::string, bool>& lite_with_targets) {
+    VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
+    if (lite_with_targets.at("kFPGA")) {
+      w->AsArg().type = LiteType::GetTensorTy(
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    } else if (lite_with_targets.at("kOpenCL")) {
+      w->AsArg().type = LiteType::GetTensorTy(
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    } else {
+      w->AsArg().type = LiteType::GetTensorTy(
+          TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
+    }
   }
 
   void InferenceArgumentPlace(SSAGraph* graph) {
+    auto& valid_places = graph->valid_places();
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
     VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
     for (auto& x : graph->StmtTopologicalOrder()) {
       auto& inst = x->AsStmt();
       // The IoCopyOp is a tool operator, it won't support the type inference.
-      // in fpga, we has io_copy+cali+layout tool ops, so we need type inference for
-      // tool operator
-#ifndef LITE_WITH_FPGA
-#ifndef LITE_WITH_OPENCL
+      // in fpga, we has io_copy+cali+layout tool ops, so we need type inference
+      // for
+      // tool operator
+      if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
       VLOG(3) << "inst.op_type() == 'io_copy', continue";
       if (inst.op_type() == "io_copy") continue;
-#endif
-#endif
+      }
       // deal with inputs
       VLOG(4) << "Infering op " << inst.op_info()->Repr();
       // TODO(zhaolong): Add check if the node's name in op's arguments.
@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass {
         if (!x_in->AsArg().type) {
           VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
           if (x_in->AsArg().is_weight) {
-            SetWeightType(x_in, *type);
+            SetWeightType(x_in, *type, lite_with_targets);
           } else {
             x_in->AsArg().type = type;
           }
@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass {
        if (!x_out->AsArg().type) {
          VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
          if (x_out->AsArg().is_weight) {
-            SetWeightType(x_out, *type);
+            SetWeightType(x_out, *type, lite_with_targets);
          } else {
            x_out->AsArg().type = type;
          }
...
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -49,6 +50,22 @@ class Optimizer {
     valid_places_ = valid_places;
     CHECK(!valid_places.empty()) << "At least one valid_place should be set";
     CHECK(!graph_) << "duplicate optimize found";
+
+    auto valid_places_has_target = [&](TargetType t) -> bool {
+      for (auto& p : valid_places) {
+        if (p.target == t) {
+          return true;
+        }
+      }
+      return false;
+    };
+    std::map<std::string, bool> lite_with_targets{
+        {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
+        {"kNPU", valid_places_has_target(TARGET(kNPU))},
+        {"kXPU", valid_places_has_target(TARGET(kXPU))}};
+    VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
+    VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
+    VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];
+
     graph_.reset(new mir::SSAGraph);
     graph_->Build(program, valid_places);
     graph_->SetValidPlaces(valid_places);
@@ -57,14 +74,11 @@ class Optimizer {
     InitTargetTypeTransformPass();
 
     if (passes.empty()) {
-      RunPasses(std::vector<std::string>{
+      std::vector<std::string> passes_local{
          {"lite_quant_dequant_fuse_pass",     //
           "lite_conv_elementwise_fuse_pass",  // conv-elemwise-bn
           "lite_conv_bn_fuse_pass",           //
           "lite_conv_elementwise_fuse_pass",  // conv-bn-elemwise
-          // This pass is disabled to force some opencl kernels selected for
-          // final running, otherwise, they will be fused to ARM fusion
-          // kernels, and the OpenCL devices will be discarded.
           // TODO(Superjomn) Refine the fusion related design to select fusion
           // kernels for devices automatically.
           "lite_conv_activation_fuse_pass",  //
@@ -105,16 +119,17 @@ class Optimizer {
          "argument_type_display_pass",  //
 
          "variable_place_inference_pass",  //
-          "argument_type_display_pass",  //
+          "argument_type_display_pass",
 
          "runtime_context_assign_pass",
-          "argument_type_display_pass",  //
-#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \
-    !defined(LITE_WITH_XPU)
-          // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel
-          "memory_optimize_pass",  //
-#endif
-          "argument_type_display_pass"}});
+          "argument_type_display_pass"}};
+
+      if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
+          (!lite_with_targets["kXPU"])) {
+        // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
+        // kernel
+        passes_local.emplace_back("memory_optimize_pass");
+      }
+      RunPasses(passes_local);
     } else {
      RunPasses(passes);
     }
@@ -141,6 +156,7 @@ class Optimizer {
            .LookUp<mir::subgraph::GenerateNPUProgramPass>(
                "generate_npu_program_pass");
 #endif
+
 #ifdef LITE_WITH_XPU
    auto pass = mir::PassManager::Global()
                    .LookUp<mir::subgraph::GenerateXPUProgramPass>(
...
@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) {
   return res;
 }
 
+// 0. Enable OpenCL, if needed
+// Enable the `DEMO_WITH_OPENCL` macro below if you need to use the GPU (OpenCL)
+// #define DEMO_WITH_OPENCL
 void RunModel() {
   // 1. Set CxxConfig
   CxxConfig config;
   config.set_model_dir(FLAGS_model_dir);
+#ifdef DEMO_WITH_OPENCL
+  std::vector<Place> valid_places{
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      Place{TARGET(kARM), PRECISION(kFloat)}};
+#else
   std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+#endif
   if (FLAGS_prefer_int8_kernel) {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
...
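Since the macro is commented out by default, the demo stays CPU-only unless the user opts in. Besides uncommenting it in the source, the standard preprocessor route works too (a general C++ mechanism; the commit's Makefiles are not known to wire this up):

```cpp
// Either uncomment in mobilenetv1_full_api.cc:
#define DEMO_WITH_OPENCL
// ...or define it on the compiler command line, e.g.:
//   g++ -DDEMO_WITH_OPENCL mobilenetv1_full_api.cc ...
```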
-if (NOT LITE_WITH_OPENCL)
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
     return ()
 endif()
...
@@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute
     auto* wait_list = context.cl_wait_list();
     auto* x_ptr = param.x->data<float, cl::Buffer>();
 
-    /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-       in kernel and enable wait_list
+    /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
+       `cl_wait_list`
+       in kernel and `wait_list` enabled
     auto it = wait_list->find(x_ptr);
     if (it != wait_list->end()) {
       VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...
@@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir,
       SaveParamNaive(path, exec_scope, var.Name());
     }
   }
-  VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully";
+  LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully";
 }
 #endif
...
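The severity change is the substance of this last hunk: with glog-style logging, VLOG(4) output is compiled in but only emitted when verbosity is raised (e.g. GLOG_v=4), whereas LOG(INFO) prints under default settings, so users of model_optimize_tool now actually see the save-success message (the fix also drops the stray doubled quote). Illustration:

```cpp
VLOG(4) << "emitted only when verbosity >= 4, e.g. env GLOG_v=4";
LOG(INFO) << "emitted under default settings";
```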