diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 6445c31c1857d8011865661a50a85fd73dfc2f1d..fbf487e0065a28a19518673c1c1c9e793d913cfc 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -36,16 +36,12 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 #ifdef LITE_WITH_MLU
   Env<TARGET(kMLU)>::Init();
-  mlu_core_version_ = config.mlu_core_version();
-  mlu_core_number_ = config.mlu_core_number();
-  use_first_conv_ = config.use_first_conv();
-  mean_vec_ = config.mean();
-  std_vec_ = config.std();
-  lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_,
-                                           mlu_core_number_,
-                                           use_first_conv_,
-                                           mean_vec_,
-                                           std_vec_);
+  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
+                                           config.mlu_core_number(),
+                                           config.mlu_use_first_conv(),
+                                           config.mlu_first_conv_mean(),
+                                           config.mlu_first_conv_std(),
+                                           config.mlu_input_layout());
 #endif  // LITE_WITH_MLU
   auto places = config.valid_places();
   std::vector<std::string> passes{};
diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 2cb2064da518bca442e882d0733c5c6966c4fac0..91edb2cda7849211f288d64e00191ddba8f82f19 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -203,6 +203,37 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }
 
+void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+  mlu_core_version_ = core_version;
+}
+void CxxConfig::set_mlu_core_number(int core_number) {
+  mlu_core_number_ = core_number;
+}
+void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
+  mlu_input_layout_ = layout;
+}
+void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
+  mlu_use_first_conv_ = use_first_conv;
+}
+void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
+  mlu_first_conv_mean_ = mean;
+}
+void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
+  mlu_first_conv_std_ = std;
+}
+lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
+  return mlu_core_version_;
+}
+int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
+DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
+bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
+std::vector<float> CxxConfig::mlu_first_conv_mean() const {
+  return mlu_first_conv_mean_;
+}
+std::vector<float> CxxConfig::mlu_first_conv_std() const {
+  return mlu_first_conv_std_;
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer
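With the predictor-side copies removed, all MLU options live on `CxxConfig` and reach `DeviceInfo::SetMLURunMode` in a single call inside `CxxPaddleApiImpl::Init`. A minimal caller-side sketch of the new surface follows; the numeric values are placeholders and the `CreatePaddlePredictor` call site is the usual entry point, not part of this patch:

```cpp
#include "lite/api/paddle_api.h"

// Sketch: configuring MLU options through the consolidated CxxConfig API.
void BuildMLUPredictor() {
  paddle::lite_api::CxxConfig config;
  // Model path and valid places omitted for brevity.
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First-conv mode folds input normalization into the first convolution;
  // the mean/std values below are placeholders.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
}
```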
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 23956ee47d996567ba318afa09da2aebb93d540e..0cb60bf84fe5063287646f825dc74dc5f51bee11 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -106,11 +106,6 @@ class LITE_API PaddlePredictor {
  protected:
   int threads_{1};
   lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
-  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
-  int mlu_core_number_{1};
-  bool use_first_conv_{false};
-  std::vector<float> mean_vec_;
-  std::vector<float> std_vec_;
 };
 
 /// Base class for all the configs.
@@ -141,11 +136,12 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
   int x86_math_library_math_threads_ = 1;
 #endif
-  bool use_firstconv_{false};
-  std::vector<float> mean_ = {0.0f};
-  std::vector<float> std_ = {1.0f};
   lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
   int mlu_core_number_{1};
+  DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
+  bool mlu_use_first_conv_{false};
+  std::vector<float> mlu_first_conv_mean_;
+  std::vector<float> mlu_first_conv_std_;
 
  public:
  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -173,20 +169,20 @@ class LITE_API CxxConfig : public ConfigBase {
     return x86_math_library_math_threads_;
   }
 #endif
-  void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
-  void set_mean(const std::vector<float> mean) { mean_ = mean; }
-  void set_std(const std::vector<float> std) { std_ = std; }
-  void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
-    mlu_core_version_ = core_version;
-  }
-  void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }
-  bool use_first_conv() const { return use_firstconv_; }
-  std::vector<float> mean() const { return mean_; }
-  std::vector<float> std() const { return std_; }
-  lite_api::MLUCoreVersion mlu_core_version() const {
-    return mlu_core_version_;
-  }
-  int mlu_core_number() const { return mlu_core_number_; }
+
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
+  void set_mlu_core_number(int core_number);
+  void set_mlu_input_layout(DataLayoutType layout);
+  void set_mlu_use_first_conv(bool use_first_conv);
+  void set_mlu_first_conv_mean(const std::vector<float>& mean);
+  void set_mlu_first_conv_std(const std::vector<float>& std);
+
+  lite_api::MLUCoreVersion mlu_core_version() const;
+  int mlu_core_number() const;
+  DataLayoutType mlu_input_layout() const;
+  bool mlu_use_first_conv() const;
+  std::vector<float> mlu_first_conv_mean() const;
+  std::vector<float> mlu_first_conv_std() const;
 };
 
 /// MobileConfig is the config for the light weight predictor, it will skip
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index 942d7f8b540a6ff7ae6d62e98e6e573e1af12aa8..5512e7bc438eddd6bcd9c8f792fc8507b03bf800 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -128,11 +128,12 @@ void BindLiteCxxConfig(py::module *m) {
       .def("power_mode", &CxxConfig::power_mode);
 #endif
 #ifdef LITE_WITH_MLU
-  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
-      .def("set_mean", &CxxConfig::set_mean)
-      .def("set_std", &CxxConfig::set_std)
-      .def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
-      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number);
+  cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
+      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
+      .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
+      .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
+      .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
+      .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
 #endif
 }
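The Python bindings track the renamed C++ setters one-for-one. Note that `set_mlu_core_version` and `set_mlu_input_layout` take enum arguments, so the corresponding enum types must also be visible to Python. This patch does not show those bindings; a typical pybind11 sketch, with all names assumed rather than taken from this diff, would be:

```cpp
#include <pybind11/pybind11.h>
#include "lite/api/paddle_api.h"

namespace py = pybind11;

// Assumed sketch (not from this patch): enum bindings that let Python code
// pass typed values to the renamed setters.
void BindMLUEnums(py::module *m) {
  py::enum_<paddle::lite_api::MLUCoreVersion>(*m, "MLUCoreVersion")
      .value("MLU_220", paddle::lite_api::MLUCoreVersion::MLU_220)
      .value("MLU_270", paddle::lite_api::MLUCoreVersion::MLU_270);
  py::enum_<paddle::lite_api::DataLayoutType>(*m, "DataLayoutType")
      .value("NCHW", paddle::lite_api::DataLayoutType::kNCHW)
      .value("NHWC", paddle::lite_api::DataLayoutType::kNHWC);
}
```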
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index 8b5f0b58efb30c3c44561b594397625b23f590cd..29ac96ed744b016833a746b35002dd68109efd8b 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -72,6 +72,7 @@ thread_local int DeviceInfo::mlu_core_number_{1};
 thread_local bool DeviceInfo::use_first_conv_{false};
 thread_local std::vector<float> DeviceInfo::mean_vec_;
 thread_local std::vector<float> DeviceInfo::std_vec_;
+thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
 #endif
 
 #ifdef TARGET_IOS
@@ -1093,7 +1094,8 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
                                int core_number,
                                bool use_first_conv,
                                const std::vector<float>& mean_vec,
-                               const std::vector<float>& std_vec) {
+                               const std::vector<float>& std_vec,
+                               DataLayoutType input_layout) {
   switch (core_version) {
     case (lite_api::MLUCoreVersion::MLU_220):
       mlu_core_version_ = CNML_MLU220;
@@ -1109,6 +1111,7 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
   use_first_conv_ = use_first_conv;
   mean_vec_ = mean_vec;
   std_vec_ = std_vec;
+  input_layout_ = input_layout;
 }
 
 cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
@@ -1121,6 +1124,8 @@ const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
 
 const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
 
+DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
+
 #endif  // LITE_WITH_MLU
 
 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
diff --git a/lite/core/device_info.h b/lite/core/device_info.h
index dda9474b3ab9085b417a68213d76f432ee7440c9..4e7e4742c4f6caa8a902f56fe953acd383fe2185 100644
--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -60,12 +60,14 @@ class DeviceInfo {
                      int core_number,
                      bool use_first_conv,
                      const std::vector<float>& mean_vec,
-                     const std::vector<float>& std_vec);
+                     const std::vector<float>& std_vec,
+                     DataLayoutType input_layout);
   cnmlCoreVersion_t MLUCoreVersion();
   int MLUCoreNumber();
   bool UseFirstConv();
   const std::vector<float>& MeanVec() const;
   const std::vector<float>& StdVec() const;
+  DataLayoutType InputLayout() const;
 #endif
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
@@ -124,6 +126,7 @@ class DeviceInfo {
   static thread_local bool use_first_conv_;
   static thread_local std::vector<float> mean_vec_;
   static thread_local std::vector<float> std_vec_;
+  static thread_local DataLayoutType input_layout_;
 #endif
 
   void SetDotInfo(int argc, ...);
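Like the other MLU run-mode fields, the new `input_layout_` is `thread_local`, so `SetMLURunMode` configures only the calling thread; values set on the main thread are not inherited by workers. A minimal sketch of that consequence (requires a build with `LITE_WITH_MLU`; the worker function is illustrative):

```cpp
#include <thread>
#include <vector>

#include "lite/core/device_info.h"

// Illustrative: every thread that runs MLU work must set its own run mode,
// because the backing DeviceInfo fields are thread_local.
void MLUWorker() {
  paddle::lite::DeviceInfo::Global().SetMLURunMode(
      paddle::lite_api::MLUCoreVersion::MLU_270,
      /*core_number=*/4,
      /*use_first_conv=*/false,
      /*mean_vec=*/{},
      /*std_vec=*/{},
      /*input_layout=*/DATALAYOUT(kNHWC));
  // ... build and run the program on this thread ...
}

int main() {
  // A SetMLURunMode call made here would not carry over to the worker.
  std::thread worker(MLUWorker);
  worker.join();
  return 0;
}
```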
diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc
index 727189db77427a01a6f4d4477c053112e99e8103..c69584b2961c9a63b565536d33e36d8278f2c8ad 100644
--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -74,7 +74,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
       const Type* in_arg_ty = kernel->GetInputDeclType("Input");
       const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
       if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
-          DataLayoutCompatible(*out_arg_ty, *cast_type)) {
+          DataLayoutCompatible(*out_arg_ty, *cast_type) &&
+          // for first conv
+          PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
         is_found = true;
       }
     } else if (op_type == "io_copy") {
@@ -121,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
   cast_arg->AsArg().type = cast_type;
   auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
   // for CastAfter manully set the tensor's type
-  var->GetMutable<::paddle::lite::Tensor>();
+  var->GetMutable<paddle::lite::Tensor>();
 
   // create the stmt node
   auto* cast_inst = graph->NewInstructNode();
@@ -215,23 +217,23 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
                 first_conv_nodes_.end(),
                 head_node->AsArg().name) != first_conv_nodes_.end();
 
-  // layout cast node
-  if (head_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
     cur_node = InsertCastBefore(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
         graph,
         cur_node,
         inst_node,
         LiteType::GetTensorTy(
-            head_type->target(), head_type->precision(), inst_type->layout()));
+            head_type->target(), inst_type->precision(), head_type->layout()));
   }
 
-  // precision cast node
-  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
+  // layout cast node
+  if (head_type->layout() != inst_type->layout()) {
     cur_node = InsertCastBefore(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
         graph,
         cur_node,
         inst_node,
@@ -281,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
 
   // get subgraph's valid precision
   const auto& places = graph->valid_places();
-  std::set<::paddle::lite_api::PrecisionType> prec_set;
+  std::set<paddle::lite_api::PrecisionType> prec_set;
   for (const auto& place : places) {
     if (place.target == TARGET(kMLU)) {
       prec_set.insert(place.precision);
@@ -364,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
   const auto name_prefix =
       tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
 
-  // layout cast node
-  if (tail_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (tail_type->precision() != inst_type->precision()) {
     cur_node = InsertCastAfter(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
         graph,
         cur_node,
         inst_node,
         LiteType::GetTensorTy(
-            tail_type->target(), tail_type->precision(), inst_type->layout()));
+            tail_type->target(), inst_type->precision(), tail_type->layout()));
   }
 
-  // precision cast node
-  if (tail_type->precision() != inst_type->precision()) {
+  // layout cast node
+  if (tail_type->layout() != inst_type->layout()) {
     cur_node = InsertCastAfter(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
         graph,
         cur_node,
         inst_node,
@@ -474,13 +476,20 @@ bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
   return false;
 }
 
-void MLUPostprocessPass::GatherFirstConvNodes(SSAGraph* graph) {
+void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
   for (auto& node : graph->mutable_nodes()) {
     if (!node.IsStmt()) continue;
    if (node.AsStmt().op_type() == "feed") {
      for (auto& out : node.outlinks) {
        if (IsFirstConvNode(out)) {
          first_conv_nodes_.insert(out->AsArg().name);
+          // modify first conv nodes' type
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    paddle::lite_api::PrecisionType::kInt8,
+                                    old_type->layout(),
+                                    old_type->device());
        }
      }
    }
@@ -504,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
           out->AsArg().type =
               LiteType::GetTensorTy(old_type->target(),
                                     old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                     old_type->device());
         }
       }
@@ -523,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
           inp->AsArg().type =
               LiteType::GetTensorTy(old_type->target(),
                                     old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                     old_type->device());
         }
       }
@@ -539,10 +548,12 @@ void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
   // arg_in and arg_out are assumed to be NHWC which user should be aware of.
   // Thus here we change these args' layout to NHWC
-  ModifyLayout(graph.get());
+  if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
+    ModifyLayout(graph.get());
+  }
 
   if (lite::DeviceInfo::Global().UseFirstConv()) {
-    GatherFirstConvNodes(graph.get());
+    GatherAndModifyFirstConvNodes(graph.get());
   }
 
   // insert io_copy, layout and precision cast of subgraph's inputs and outputs
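The swapped hunks in `InsertBefore` and `InsertAfter` reorder the casts around an MLU subgraph: the precision cast now runs first, while the tensor still has the head's layout, and the layout cast runs second, with the output side mirroring that order. First-conv heads skip the precision cast entirely, because `GatherAndModifyFirstConvNodes` has already retyped them to `kInt8` for the fused first conv. A sketch of the intermediate types this produces on the input side; the header path is assumed and the helper names are illustrative, not from the patch:

```cpp
#include "lite/core/type_system.h"  // assumed header for Type / LiteType

namespace paddle {
namespace lite {

// Illustrative: after the (now first) precision cast, precision converges
// to the subgraph's while the layout stays with the head.
const Type* AfterPrecisionCast(const Type* head_type, const Type* inst_type) {
  return LiteType::GetTensorTy(
      head_type->target(), inst_type->precision(), head_type->layout());
}

// Illustrative: the subsequent layout cast then converges the layout too.
const Type* AfterLayoutCast(const Type* head_type, const Type* inst_type) {
  return LiteType::GetTensorTy(
      head_type->target(), inst_type->precision(), inst_type->layout());
}

}  // namespace lite
}  // namespace paddle
```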
diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h
index 34b449cca664e28a24a02deb6e214f5e81386767..688dd06fb5fbec0c8e1c53acfe4215456ddb4192 100644
--- a/lite/core/mir/mlu_postprocess_pass.h
+++ b/lite/core/mir/mlu_postprocess_pass.h
@@ -109,7 +109,7 @@ class MLUPostprocessPass : public ProgramPass {
 
   void RecreateOp(Node* inst_node, SSAGraph* graph);
 
-  void GatherFirstConvNodes(SSAGraph* graph);
+  void GatherAndModifyFirstConvNodes(SSAGraph* graph);
 
   bool IsFirstConvNode(Node* arg_node);
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index 2af8274e07713300277f7280f12e6d1fcb47c3c2..fa8fb1597c0fb068a855928dd20057d48ecd5eaf 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -84,7 +84,7 @@ struct FPTypeTraits {
 
 template <>
 struct FPTypeTraits<PRECISION(kFP16)> {
-  typedef ::paddle::lite::fluid::float16 T;
+  typedef paddle::lite::fluid::float16 T;
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc
index 97d8202553ec8f11e35f6d11c9a9c12c580463c6..d4e16734d6d2dae6f5c119194008bce114a2e918 100644
--- a/lite/kernels/mlu/layout_compute.cc
+++ b/lite/kernels/mlu/layout_compute.cc
@@ -48,11 +48,11 @@ REGISTER_LITE_KERNEL(
     def_layout_nhwc2nchw_fp16)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
                                        DATALAYOUT(kNCHW))})
     .Finalize();
 
@@ -82,10 +82,27 @@ REGISTER_LITE_KERNEL(
     def_layout_nchw2nhwc_fp16)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                       DATALAYOUT(kNCHW))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kInt8,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
+    def_layout_nchw2nhwc_fp32_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt8),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
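Two fixes land in the kernel registrations: the fp16 layout kernels now declare their tensors as `kFP16` rather than `kFloat`, so type inference can actually select them, and a new int8 NCHW-to-NHWC kernel covers the quantized first-conv input path. A hedged sketch of how such a kernel would be resolved at runtime, assuming the usual `KernelRegistry::Create` interface; the function name and header path below are assumptions, not part of this patch:

```cpp
#include "lite/core/op_registry.h"  // assumed header for KernelRegistry

// Illustrative: once def_layout_nchw2nhwc_fp32_int8 is registered, a
// "layout" kernel becomes resolvable for int8/NHWC on kMLU, which the
// postprocess pass needs for first-conv inputs.
void LookupInt8LayoutKernel() {
  auto kernels = paddle::lite::KernelRegistry::Global().Create(
      "layout", TARGET(kMLU), PRECISION(kInt8), DATALAYOUT(kNHWC));
  CHECK(!kernels.empty()) << "no int8 NCHW->NHWC layout kernel registered";
}
```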
diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h
index 5e87e3526417573f2e0f01280b1d86ccb5691093..edacdf8a98a2ffde6e538f61d4dd8259e3211b22 100644
--- a/lite/kernels/mlu/layout_compute.h
+++ b/lite/kernels/mlu/layout_compute.h
@@ -29,6 +29,24 @@ namespace lite {
 namespace kernels {
 namespace mlu {
 
+template <lite_api::PrecisionType>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<PRECISION(kFloat)> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kFP16)> {
+  typedef paddle::lite::fluid::float16 T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kInt8)> {
+  typedef int8_t T;
+};
+
 template <lite::TargetType Target, typename T>
 inline void LayoutTransCompute(const int dim,
                                const lite::Context<Target>& context,
@@ -63,7 +81,7 @@ class LayoutNchwToNhwcCompute
     auto& param = this->template Param<param_t>();
     auto* x = param.x;
     auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
@@ -88,7 +106,8 @@ class LayoutNchwToNhwcCompute
       CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
     }
 
-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
         x_dims, context, *x, out, axis);
 
     if (x_dims > 2) {
@@ -111,7 +130,7 @@ class LayoutNhwcToNchwCompute
     auto& param = this->template Param<param_t>();
     auto* x = param.x;
     auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
@@ -136,7 +155,8 @@ class LayoutNhwcToNchwCompute
       CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
     }
 
-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
         x_dims, context, *x, out, axis);
 
     if (x_dims > 2) {
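`FPTypeTraits` maps the kernel's `Precision` template parameter to the element type used by `mutable_data` and the transpose, so the fp16 and int8 layout kernels stop allocating `float` buffers. A small compile-time sketch of what the mapping guarantees (the kFP16 case, which maps to `paddle::lite::fluid::float16`, is omitted to keep the sketch short):

```cpp
#include <cstdint>
#include <type_traits>

#include "lite/kernels/mlu/layout_compute.h"

using paddle::lite::kernels::mlu::FPTypeTraits;

// Each kernel instantiation transposes buffers of the element type that
// matches the precision it was registered with.
static_assert(
    std::is_same<FPTypeTraits<PRECISION(kFloat)>::T, float>::value,
    "kFloat tensors are transposed as float");
static_assert(
    std::is_same<FPTypeTraits<PRECISION(kInt8)>::T, int8_t>::value,
    "kInt8 tensors are transposed as int8_t");
```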