Merge pull request #41 from Cambricon/develop-set-input-layout

Develop set input layout

Merge pull request #41 from Cambricon/develop-set-input-layout
Develop set input layout
3a98dabd · zhangshijin · GitHub · 97c2d205 · 59aaf733 · 3a98dabd
11 changed file
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -36,16 +36,12 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 #ifdef LITE_WITH_MLU
  Env<TARGET(kMLU)>::Init();
-  mlu_core_version_ = config.mlu_core_version();
-  mlu_core_number_ = config.mlu_core_number();
-  use_first_conv_ = config.use_first_conv();
-  mean_vec_ = config.mean();
-  std_vec_ = config.std();
-  lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_,
-                                           mlu_core_number_,
-                                           use_first_conv_,
-                                           mean_vec_,
-                                           std_vec_);
+  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
+                                           config.mlu_core_number(),
+                                           config.mlu_use_first_conv(),
+                                           config.mlu_first_conv_mean(),
+                                           config.mlu_first_conv_std(),
+                                           config.mlu_input_layout());
 #endif  // LITE_WITH_MLU
  auto places = config.valid_places();
  std::vector<std::string> passes{};

--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -203,6 +203,37 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }

+void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+  mlu_core_version_ = core_version;
+}
+void CxxConfig::set_mlu_core_number(int core_number) {
+  mlu_core_number_ = core_number;
+}
+void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
+  mlu_input_layout_ = layout;
+}
+void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
+  mlu_use_first_conv_ = use_first_conv;
+}
+void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
+  mlu_first_conv_mean_ = mean;
+}
+void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
+  mlu_first_conv_std_ = std;
+}
+lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
+  return mlu_core_version_;
+}
+int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
+DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
+bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
+std::vector<float> CxxConfig::mlu_first_conv_mean() const {
+  return mlu_first_conv_mean_;
+}
+std::vector<float> CxxConfig::mlu_first_conv_std() const {
+  return mlu_first_conv_std_;
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer

--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -106,11 +106,6 @@ class LITE_API PaddlePredictor {
 protected:
  int threads_{1};
  lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
-  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
-  int mlu_core_number_{1};
-  bool use_first_conv_{false};
-  std::vector<float> mean_vec_;
-  std::vector<float> std_vec_;
 };

 /// Base class for all the configs.
@@ -141,11 +136,12 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
  int x86_math_library_math_threads_ = 1;
 #endif
-  bool use_firstconv_{false};
-  std::vector<float> mean_ = {0.0f};
-  std::vector<float> std_ = {1.0f};
  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
  int mlu_core_number_{1};
+  DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
+  bool mlu_use_first_conv_{false};
+  std::vector<float> mlu_first_conv_mean_;
+  std::vector<float> mlu_first_conv_std_;

 public:
  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -173,20 +169,20 @@ class LITE_API CxxConfig : public ConfigBase {
    return x86_math_library_math_threads_;
  }
 #endif
-  void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
-  void set_mean(const std::vector<float> mean) { mean_ = mean; }
-  void set_std(const std::vector<float> std) { std_ = std; }
-  void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
-    mlu_core_version_ = core_version;
-  }
-  void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }
-  bool use_first_conv() const { return use_firstconv_; }
-  std::vector<float> mean() const { return mean_; }
-  std::vector<float> std() const { return std_; }
-  lite_api::MLUCoreVersion mlu_core_version() const {
-    return mlu_core_version_;
-  }
-  int mlu_core_number() const { return mlu_core_number_; }
+
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
+  void set_mlu_core_number(int core_number);
+  void set_mlu_input_layout(DataLayoutType layout);
+  void set_mlu_use_first_conv(bool use_first_conv);
+  void set_mlu_first_conv_mean(const std::vector<float>& mean);
+  void set_mlu_first_conv_std(const std::vector<float>& std);
+
+  lite_api::MLUCoreVersion mlu_core_version() const;
+  int mlu_core_number() const;
+  DataLayoutType mlu_input_layout() const;
+  bool mlu_use_first_conv() const;
+  std::vector<float> mlu_first_conv_mean() const;
+  std::vector<float> mlu_first_conv_std() const;
 };

 /// MobileConfig is the config for the light weight predictor, it will skip

--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -128,11 +128,12 @@ void BindLiteCxxConfig(py::module *m) {
      .def("power_mode", &CxxConfig::power_mode);
 #endif
 #ifdef LITE_WITH_MLU
-  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
-      .def("set_mean", &CxxConfig::set_mean)
-      .def("set_std", &CxxConfig::set_std)
-      .def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
-      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number);
+  cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
+      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
+      .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
+      .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
+      .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
+      .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
 #endif
 }


--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -72,6 +72,7 @@ thread_local int DeviceInfo::mlu_core_number_{1};
 thread_local bool DeviceInfo::use_first_conv_{false};
 thread_local std::vector<float> DeviceInfo::mean_vec_;
 thread_local std::vector<float> DeviceInfo::std_vec_;
+thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
 #endif

 #ifdef TARGET_IOS
@@ -1093,7 +1094,8 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
                               int core_number,
                               bool use_first_conv,
                               const std::vector<float>& mean_vec,
-                               const std::vector<float>& std_vec) {
+                               const std::vector<float>& std_vec,
+                               DataLayoutType input_layout) {
  switch (core_version) {
    case (lite_api::MLUCoreVersion::MLU_220):
      mlu_core_version_ = CNML_MLU220;
@@ -1109,6 +1111,7 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
  use_first_conv_ = use_first_conv;
  mean_vec_ = mean_vec;
  std_vec_ = std_vec;
+  input_layout_ = input_layout;
 }

 cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
@@ -1121,6 +1124,8 @@ const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }

 const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }

+DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
+
 #endif  // LITE_WITH_MLU

 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {

--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -60,12 +60,14 @@ class DeviceInfo {
                     int core_number,
                     bool use_first_conv,
                     const std::vector<float>& mean_vec,
-                     const std::vector<float>& std_vec);
+                     const std::vector<float>& std_vec,
+                     DataLayoutType input_layout);
  cnmlCoreVersion_t MLUCoreVersion();
  int MLUCoreNumber();
  bool UseFirstConv();
  const std::vector<float>& MeanVec() const;
  const std::vector<float>& StdVec() const;
+  DataLayoutType InputLayout() const;
 #endif
  void SetCache(int l1size, int l2size, int l3size);
  void SetArch(ARMArch arch) { arch_ = arch; }
@@ -124,6 +126,7 @@ class DeviceInfo {
  static thread_local bool use_first_conv_;
  static thread_local std::vector<float> mean_vec_;
  static thread_local std::vector<float> std_vec_;
+  static thread_local DataLayoutType input_layout_;
 #endif

  void SetDotInfo(int argc, ...);

--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -74,7 +74,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
      if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
-          DataLayoutCompatible(*out_arg_ty, *cast_type)) {
+          DataLayoutCompatible(*out_arg_ty, *cast_type) &&
+          //  for first conv
+          PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
        is_found = true;
      }
    } else if (op_type == "io_copy") {
@@ -121,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
  cast_arg->AsArg().type = cast_type;
  auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
  // for CastAfter manully set the tensor's type
-  var->GetMutable<::paddle::lite::Tensor>();
+  var->GetMutable<paddle::lite::Tensor>();

  // create the stmt node
  auto* cast_inst = graph->NewInstructNode();
@@ -215,23 +217,23 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
                first_conv_nodes_.end(),
                head_node->AsArg().name) != first_conv_nodes_.end();

-  // layout cast node
-  if (head_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
    cur_node = InsertCastBefore(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
        graph,
        cur_node,
        inst_node,
        LiteType::GetTensorTy(
-            head_type->target(), head_type->precision(), inst_type->layout()));
+            head_type->target(), inst_type->precision(), head_type->layout()));
  }

-  // precision cast node
-  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
+  // layout cast node
+  if (head_type->layout() != inst_type->layout()) {
    cur_node = InsertCastBefore(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
        graph,
        cur_node,
        inst_node,
@@ -281,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,

  // get subgraph's valid precision
  const auto& places = graph->valid_places();
-  std::set<::paddle::lite_api::PrecisionType> prec_set;
+  std::set<paddle::lite_api::PrecisionType> prec_set;
  for (const auto& place : places) {
    if (place.target == TARGET(kMLU)) {
      prec_set.insert(place.precision);
@@ -364,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
  const auto name_prefix =
      tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";

-  // layout cast node
-  if (tail_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (tail_type->precision() != inst_type->precision()) {
    cur_node = InsertCastAfter(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
        graph,
        cur_node,
        inst_node,
        LiteType::GetTensorTy(
-            tail_type->target(), tail_type->precision(), inst_type->layout()));
+            tail_type->target(), inst_type->precision(), tail_type->layout()));
  }

-  // precision cast node
-  if (tail_type->precision() != inst_type->precision()) {
+  // layout cast node
+  if (tail_type->layout() != inst_type->layout()) {
    cur_node = InsertCastAfter(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
        graph,
        cur_node,
        inst_node,
@@ -474,13 +476,20 @@ bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
  return false;
 }

-void MLUPostprocessPass::GatherFirstConvNodes(SSAGraph* graph) {
+void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
  for (auto& node : graph->mutable_nodes()) {
    if (!node.IsStmt()) continue;
    if (node.AsStmt().op_type() == "feed") {
      for (auto& out : node.outlinks) {
        if (IsFirstConvNode(out)) {
          first_conv_nodes_.insert(out->AsArg().name);
+          // modify first conv nodes' type
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    paddle::lite_api::PrecisionType::kInt8,
+                                    old_type->layout(),
+                                    old_type->device());
        }
      }
    }
@@ -504,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
          out->AsArg().type =
              LiteType::GetTensorTy(old_type->target(),
                                    old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                    old_type->device());
        }
      }
@@ -523,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
          inp->AsArg().type =
              LiteType::GetTensorTy(old_type->target(),
                                    old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                    old_type->device());
        }
      }
@@ -539,10 +548,12 @@ void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
  // arg_in and arg_out are assumed to be NHWC which user should be aware of.
  // Thus here we change these args' layout to NHWC
-  ModifyLayout(graph.get());
+  if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
+    ModifyLayout(graph.get());
+  }

  if (lite::DeviceInfo::Global().UseFirstConv()) {
-    GatherFirstConvNodes(graph.get());
+    GatherAndModifyFirstConvNodes(graph.get());
  }

  // insert io_copy, layout and precision cast of subgraph's inputs and outputs

--- a/lite/core/mir/mlu_postprocess_pass.h
+++ b/lite/core/mir/mlu_postprocess_pass.h
@@ -109,7 +109,7 @@ class MLUPostprocessPass : public ProgramPass {

  void RecreateOp(Node* inst_node, SSAGraph* graph);

-  void GatherFirstConvNodes(SSAGraph* graph);
+  void GatherAndModifyFirstConvNodes(SSAGraph* graph);

  bool IsFirstConvNode(Node* arg_node);


--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {

 template <>
 struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
-  typedef ::paddle::lite::fluid::float16 T;
+  typedef paddle::lite::fluid::float16 T;
 };

 }  // namespace mlu

--- a/lite/kernels/mlu/layout_compute.cc
+++ b/lite/kernels/mlu/layout_compute.cc
@@ -48,11 +48,11 @@ REGISTER_LITE_KERNEL(
    def_layout_nhwc2nchw_fp16)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
                                       DATALAYOUT(kNCHW))})
    .Finalize();

@@ -82,10 +82,27 @@ REGISTER_LITE_KERNEL(
    def_layout_nchw2nhwc_fp16)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kInt8,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
+    def_layout_nchw2nhwc_fp32_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt8),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
--- a/lite/kernels/mlu/layout_compute.h
+++ b/lite/kernels/mlu/layout_compute.h
@@ -29,6 +29,24 @@ namespace lite {
 namespace kernels {
 namespace mlu {

+template <paddle::lite_api::PrecisionType>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
+  typedef paddle::lite::fluid::float16 T;
+};
+
+template <>
+struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
+  typedef int8_t T;
+};
+
 template <lite::TargetType Target, typename T>
 inline void LayoutTransCompute(const int dim,
                               const lite::Context<Target>& context,
@@ -63,7 +81,7 @@ class LayoutNchwToNhwcCompute
    auto& param = this->template Param<param_t>();
    auto* x = param.x;
    auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
    auto x_dims = param.x->dims().size();
    auto& context = this->ctx_->template As<X86Context>();

@@ -88,7 +106,8 @@ class LayoutNchwToNhwcCompute
        CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
    }

-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
        x_dims, context, *x, out, axis);

    if (x_dims > 2) {
@@ -111,7 +130,7 @@ class LayoutNhwcToNchwCompute
    auto& param = this->template Param<param_t>();
    auto* x = param.x;
    auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
    auto x_dims = param.x->dims().size();
    auto& context = this->ctx_->template As<X86Context>();

@@ -136,7 +155,8 @@ class LayoutNhwcToNchwCompute
        CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
    }

-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
        x_dims, context, *x, out, axis);

    if (x_dims > 2) {