未验证 提交 d242bdfb 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Enable full and light api for OpenCL (#2331)

* Fix bug target for kHost and kARM not equal. test=develop

* Fix license. test=develop

* add debug -g option. test=develop

* enable opencl demo. test=develop

* Fix model_optimize_tool finding no OpenCL kernel. test=develop

* add more vlog. test=develop

* remove macro LITE_WITH_OPENCL, LITE_WITH_FPGA in passes. test=develop

* Fix valid_places in mobilenetv1_test. test=develop

* Fix bug of finding no real output of fetch after the tool OPs of optimizer passes. test=develop

* Fix vlog as log message in model_optimize_tool. test=develop

* fix miscs. test=develop

* fix comment. test=develop

* Fix misspelling of opencl and fpga kernel names in lite/api/CMakeLists.txt. test=develop

* add opencl macro in full_api of demo. test=develop
上级 81852863
......@@ -157,7 +157,9 @@ function(lite_cc_library TARGET)
endfunction()
function(lite_cc_binary TARGET)
set(options "")
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
......
......@@ -195,7 +195,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
endif()
endif()
if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
if ((ARM_TARGET_OS STREQUAL "android") AND
((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
if (NOT LITE_ON_TINY_PUBLISH)
# copy
......@@ -210,6 +210,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
......@@ -76,8 +76,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels})
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
endif()
# for light api
......@@ -96,8 +96,8 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels})
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......
......@@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc *> feeds;
std::vector<cpp::OpDesc *> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (!program_) {
GenRuntimeProgram();
}
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
}
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (size_t i = 0; i < feeds.size(); i++) {
......@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
void Predictor::Build(const lite_api::CxxConfig &config,
......@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
Program program(desc, scope_, inner_places);
/// The first place in valid_places is
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
factor.ConsiderDataLayout();
optimizer_.Run(std::move(program), inner_places, factor, passes);
exec_scope_ = optimizer_.exec_scope();
PrepareFeedFetch();
......@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
return &var->Get<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
......@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
TestModel(valid_places);
......
......@@ -80,7 +80,16 @@ void Main() {
if (target_repr == "arm") {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(TARGET(kOpenCL));
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86));
} else {
......
......@@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() {
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(ERROR) << "Cannot find the " << #cl_func \
LOG(FATAL) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
break; \
} \
......
......@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass {
const auto& place = places[i];
float weight = static_cast<float>(place_size - i) / place_size;
size_t score{};
// The more important factor comes first
if (kernel_pick_factors_.IsTargetConsidered() &&
(place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
......@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass {
VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------";
VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " "
<< DataLayoutToStr(winner_place.layout) << " "
VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target);
VLOG(4) << "kernel.op_type():" << kernel.op_type();
VLOG(4) << "picker tactic " << kernel_pick_factors_;
VLOG(4) << "kernel place " << kernel.place().DebugString();
VLOG(4) << "picker place " << winner_place.DebugString();
VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
VLOG(4) << "kernel place:" << kernel.place().DebugString();
VLOG(4) << "winner_picker place:" << winner_place.DebugString();
VLOG(4) << "------------------------------";
// The data layout is not considered, for the input and output arguments
......
......@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst(
for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
#ifdef LITE_WITH_OPENCL
// layout kernel choose
// must ignore [layout check] for layout of kernels's input and output
if (TargetCompatibleTo(*in_arg_ty, from) &&
// note: replace LITE_WITH_OPENCL macro with judge input and output target
// of layout_trans
if ((in_arg_ty->target() == TARGET(kOpenCL) ||
out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
(TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout())) {
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
#else
if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
#endif
is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel
layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op);
break;
}
}
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
<< in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type();
......
......@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
VLOG(4) << "to:" << to << "\n";
// kernel choose branch for opencl backend
// judge inst's target whether is kOpenCL
// Note: to == *decl_arg_type == in of inst, not output of last inst
#ifdef LITE_WITH_OPENCL
// kernel choose branch for opencl backend
// judge inst's target whether is kOpenCL
// Note: to == *decl_arg_type == in of inst, not output of last inst
// ignore [layout check] for layout between [to] and [from]
// Because all of origin opencl insts in model, are not default layout
// NCHW,
......@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst(
// [*decl_arg_type] -> [to]: input of inst, not output of last
// [in_arg_ty]: in of io_copy
// [out_arg_ty]: out of io_copy
if (TargetCompatibleTo(*in_arg_ty, from) &&
//
// note: replace LITE_WITH_OPENCL macro with judge input and output target
// of io_copy
if ((in_arg_ty->target() == TARGET(kOpenCL) ||
out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
(TargetCompatibleTo(*in_arg_ty, from) &&
PrecisionCompatibleTo(*in_arg_ty, from) &&
DeviceCompatibleTo(*in_arg_ty, from) &&
TargetCompatibleTo(*out_arg_ty, to)) {
VLOG(4) << "do nothing. opencl found";
#else
if (TypeCompatible(*in_arg_ty, from) &&
TargetCompatibleTo(*out_arg_ty, to))) {
VLOG(4) << "picked, opencl found";
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
#endif
VLOG(4) << "picked";
is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel
io_copy_inst->AsStmt(
io_copy_type, std::move(selected_kernels), io_copy_op);
break;
}
VLOG(4) << "not picked";
}
CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
<< ":" << in->AsArg().name << " -> " << to << ":"
<< inst_node->AsStmt().op_info()->Type();
......
......@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass {
}
}
// Set the tye of the weight
void SetWeightType(Node* w, const LiteType& type) {
// TODO(xg) to optimize this
#ifdef LITE_WITH_FPGA
// Set the type of the weight
void SetWeightType(Node* w,
const LiteType& type,
const std::map<std::string, bool>& lite_with_targets) {
VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
if (lite_with_targets.at("kFPGA")) {
w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#endif
#ifdef LITE_WITH_OPENCL
} else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#endif
#ifndef LITE_WITH_FPGA
#ifndef LITE_WITH_OPENCL
} else {
w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
#endif
#endif
}
}
void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference.
// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for
// tool operator
#ifndef LITE_WITH_FPGA
#ifndef LITE_WITH_OPENCL
// The IoCopyOp is a tool operator; it does not support type inference.
// On FPGA we have the io_copy + cali + layout tool ops, so we need type
// inference for tool operators there.
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
#endif
#endif
}
// deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments.
......@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type);
SetWeightType(x_in, *type, lite_with_targets);
} else {
x_in->AsArg().type = type;
}
......@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type);
SetWeightType(x_out, *type, lite_with_targets);
} else {
x_out->AsArg().type = type;
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -49,6 +50,22 @@ class Optimizer {
valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found";
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kNPU", valid_places_has_target(TARGET(kNPU))},
{"kXPU", valid_places_has_target(TARGET(kXPU))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];
graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places);
......@@ -57,14 +74,11 @@ class Optimizer {
InitTargetTypeTransformPass();
if (passes.empty()) {
RunPasses(std::vector<std::string>{
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// This pass is disabled to force some opencl kernels selected for
// final running, otherwise, they will be fused to ARM fusion
// kernels, and the OpenCL devices will be discarded.
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
......@@ -105,16 +119,17 @@ class Optimizer {
"argument_type_display_pass", //
"variable_place_inference_pass", //
"argument_type_display_pass", //
"argument_type_display_pass",
"runtime_context_assign_pass",
"argument_type_display_pass", //
#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \
!defined(LITE_WITH_XPU)
// TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel
"memory_optimize_pass",
#endif
"argument_type_display_pass"}});
"argument_type_display_pass"}};
if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
(!lite_with_targets["kXPU"])) {
// TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
// kernel
passes_local.emplace_back("memory_optimize_pass");
}
RunPasses(passes_local);
} else {
RunPasses(passes);
}
......@@ -141,6 +156,7 @@ class Optimizer {
.LookUp<mir::subgraph::GenerateNPUProgramPass>(
"generate_npu_program_pass");
#endif
#ifdef LITE_WITH_XPU
auto pass = mir::PassManager::Global()
.LookUp<mir::subgraph::GenerateXPUProgramPass>(
......
......@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res;
}
// 0. Enable OpenCL, if needed
// Enable `DEMO_WITH_OPENCL` macro below, if user need use gpu(opencl)
// #define DEMO_WITH_OPENCL
void RunModel() {
// 1. Set CxxConfig
CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
#ifdef DEMO_WITH_OPENCL
std::vector<Place> valid_places{
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
Place{TARGET(kARM), PRECISION(kFloat)}};
#else
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
#endif
if (FLAGS_prefer_int8_kernel) {
valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
......
if (NOT LITE_WITH_OPENCL)
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
return ()
endif()
......
......@@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute
auto* wait_list = context.cl_wait_list();
auto* x_ptr = param.x->data<float, cl::Buffer>();
/* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
in kernel and enable wait_list
/* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
`cl_wait_list`
in kernel and `wait_list` enabled
auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
......
......@@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir,
SaveParamNaive(path, exec_scope, var.Name());
}
}
VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully";
LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully";
}
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册