diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 6445c31c1857d8011865661a50a85fd73dfc2f1d..fbf487e0065a28a19518673c1c1c9e793d913cfc 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -36,16 +36,12 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 #ifdef LITE_WITH_MLU
   Env<TARGET(kMLU)>::Init();
-  mlu_core_version_ = config.mlu_core_version();
-  mlu_core_number_ = config.mlu_core_number();
-  use_first_conv_ = config.use_first_conv();
-  mean_vec_ = config.mean();
-  std_vec_ = config.std();
-  lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_,
-                                           mlu_core_number_,
-                                           use_first_conv_,
-                                           mean_vec_,
-                                           std_vec_);
+  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
+                                           config.mlu_core_number(),
+                                           config.mlu_use_first_conv(),
+                                           config.mlu_first_conv_mean(),
+                                           config.mlu_first_conv_std(),
+                                           config.mlu_input_layout());
 #endif  // LITE_WITH_MLU
   auto places = config.valid_places();
   std::vector<std::string> passes{};
diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 2cb2064da518bca442e882d0733c5c6966c4fac0..91edb2cda7849211f288d64e00191ddba8f82f19 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -203,6 +203,37 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }
 
+void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+  mlu_core_version_ = core_version;
+}
+void CxxConfig::set_mlu_core_number(int core_number) {
+  mlu_core_number_ = core_number;
+}
+void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
+  mlu_input_layout_ = layout;
+}
+void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
+  mlu_use_first_conv_ = use_first_conv;
+}
+void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
+  mlu_first_conv_mean_ = mean;
+}
+void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
+  mlu_first_conv_std_ = std;
+}
+lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
+  return mlu_core_version_;
+}
+int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
+DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
+bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
+std::vector<float> CxxConfig::mlu_first_conv_mean() const {
+  return mlu_first_conv_mean_;
+}
+std::vector<float> CxxConfig::mlu_first_conv_std() const {
+  return mlu_first_conv_std_;
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer
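With the predictor-side copies removed, all MLU options live on `CxxConfig` and reach `DeviceInfo::SetMLURunMode` in a single call inside `CxxPaddleApiImpl::Init`. A minimal caller-side sketch of the new surface follows; the numeric values are placeholders and the `CreatePaddlePredictor` call site is the usual entry point, not part of this patch:

```cpp
#include "lite/api/paddle_api.h"

// Sketch: configuring MLU options through the consolidated CxxConfig API.
void BuildMLUPredictor() {
  paddle::lite_api::CxxConfig config;
  // Model path and valid places omitted for brevity.
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First-conv mode folds input normalization into the first convolution;
  // the mean/std values below are placeholders.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
}
```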
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 23956ee47d996567ba318afa09da2aebb93d540e..0cb60bf84fe5063287646f825dc74dc5f51bee11 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -106,11 +106,6 @@ class LITE_API PaddlePredictor {
  protected:
   int threads_{1};
   lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
-  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
-  int mlu_core_number_{1};
-  bool use_first_conv_{false};
-  std::vector<float> mean_vec_;
-  std::vector<float> std_vec_;
 };
 
 /// Base class for all the configs.
@@ -141,11 +136,12 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
   int x86_math_library_math_threads_ = 1;
 #endif
-  bool use_firstconv_{false};
-  std::vector<float> mean_ = {0.0f};
-  std::vector<float> std_ = {1.0f};
   lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
   int mlu_core_number_{1};
+  DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
+  bool mlu_use_first_conv_{false};
+  std::vector<float> mlu_first_conv_mean_;
+  std::vector<float> mlu_first_conv_std_;
 
  public:
  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -173,20 +169,20 @@ class LITE_API CxxConfig : public ConfigBase {
     return x86_math_library_math_threads_;
   }
 #endif
-  void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
-  void set_mean(const std::vector<float> mean) { mean_ = mean; }
-  void set_std(const std::vector<float> std) { std_ = std; }
-  void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
-    mlu_core_version_ = core_version;
-  }
-  void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }
-  bool use_first_conv() const { return use_firstconv_; }
-  std::vector<float> mean() const { return mean_; }
-  std::vector<float> std() const { return std_; }
-  lite_api::MLUCoreVersion mlu_core_version() const {
-    return mlu_core_version_;
-  }
-  int mlu_core_number() const { return mlu_core_number_; }
+
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
+  void set_mlu_core_number(int core_number);
+  void set_mlu_input_layout(DataLayoutType layout);
+  void set_mlu_use_first_conv(bool use_first_conv);
+  void set_mlu_first_conv_mean(const std::vector<float>& mean);
+  void set_mlu_first_conv_std(const std::vector<float>& std);
+
+  lite_api::MLUCoreVersion mlu_core_version() const;
+  int mlu_core_number() const;
+  DataLayoutType mlu_input_layout() const;
+  bool mlu_use_first_conv() const;
+  std::vector<float> mlu_first_conv_mean() const;
+  std::vector<float> mlu_first_conv_std() const;
 };
 
 /// MobileConfig is the config for the light weight predictor, it will skip
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index 942d7f8b540a6ff7ae6d62e98e6e573e1af12aa8..5512e7bc438eddd6bcd9c8f792fc8507b03bf800 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -128,11 +128,12 @@ void BindLiteCxxConfig(py::module *m) {
       .def("power_mode", &CxxConfig::power_mode);
 #endif
 #ifdef LITE_WITH_MLU
-  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
-      .def("set_mean", &CxxConfig::set_mean)
-      .def("set_std", &CxxConfig::set_std)
-      .def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
-      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number);
+  cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
+      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
+      .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
+      .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
+      .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
+      .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
 #endif
 }
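The Python bindings track the renamed C++ setters one-for-one. Note that `set_mlu_core_version` and `set_mlu_input_layout` take enum arguments, so the corresponding enum types must also be visible to Python. This patch does not show those bindings; a typical pybind11 sketch, with all names assumed rather than taken from this diff, would be:

```cpp
#include <pybind11/pybind11.h>
#include "lite/api/paddle_api.h"

namespace py = pybind11;

// Assumed sketch (not from this patch): enum bindings that let Python code
// pass typed values to the renamed setters.
void BindMLUEnums(py::module *m) {
  py::enum_<paddle::lite_api::MLUCoreVersion>(*m, "MLUCoreVersion")
      .value("MLU_220", paddle::lite_api::MLUCoreVersion::MLU_220)
      .value("MLU_270", paddle::lite_api::MLUCoreVersion::MLU_270);
  py::enum_<paddle::lite_api::DataLayoutType>(*m, "DataLayoutType")
      .value("NCHW", paddle::lite_api::DataLayoutType::kNCHW)
      .value("NHWC", paddle::lite_api::DataLayoutType::kNHWC);
}
```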
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index 8b5f0b58efb30c3c44561b594397625b23f590cd..29ac96ed744b016833a746b35002dd68109efd8b 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -72,6 +72,7 @@ thread_local int DeviceInfo::mlu_core_number_{1};
 thread_local bool DeviceInfo::use_first_conv_{false};
 thread_local std::vector<float> DeviceInfo::mean_vec_;
 thread_local std::vector<float> DeviceInfo::std_vec_;
+thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
 #endif
 
 #ifdef TARGET_IOS
@@ -1093,7 +1094,8 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
                                int core_number,
                                bool use_first_conv,
                                const std::vector<float>& mean_vec,
-                               const std::vector<float>& std_vec) {
+                               const std::vector<float>& std_vec,
+                               DataLayoutType input_layout) {
   switch (core_version) {
     case (lite_api::MLUCoreVersion::MLU_220):
       mlu_core_version_ = CNML_MLU220;
@@ -1109,6 +1111,7 @@ void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
   use_first_conv_ = use_first_conv;
   mean_vec_ = mean_vec;
   std_vec_ = std_vec;
+  input_layout_ = input_layout;
 }
 
 cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
@@ -1121,6 +1124,8 @@ const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
 
 const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
 
+DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
+
 #endif  // LITE_WITH_MLU
 
 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
diff --git a/lite/core/device_info.h b/lite/core/device_info.h
index dda9474b3ab9085b417a68213d76f432ee7440c9..4e7e4742c4f6caa8a902f56fe953acd383fe2185 100644
--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -60,12 +60,14 @@ class DeviceInfo {
                      int core_number,
                      bool use_first_conv,
                      const std::vector<float>& mean_vec,
-                     const std::vector<float>& std_vec);
+                     const std::vector<float>& std_vec,
+                     DataLayoutType input_layout);
   cnmlCoreVersion_t MLUCoreVersion();
   int MLUCoreNumber();
   bool UseFirstConv();
   const std::vector<float>& MeanVec() const;
   const std::vector<float>& StdVec() const;
+  DataLayoutType InputLayout() const;
 #endif
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
@@ -124,6 +126,7 @@ class DeviceInfo {
   static thread_local bool use_first_conv_;
   static thread_local std::vector<float> mean_vec_;
   static thread_local std::vector<float> std_vec_;
+  static thread_local DataLayoutType input_layout_;
 #endif
 
   void SetDotInfo(int argc, ...);
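Like the other MLU run-mode fields, the new `input_layout_` is `thread_local`, so `SetMLURunMode` configures only the calling thread; values set on the main thread are not inherited by workers. A minimal sketch of that consequence (requires a build with `LITE_WITH_MLU`; the worker function is illustrative):

```cpp
#include <thread>
#include <vector>

#include "lite/core/device_info.h"

// Illustrative: every thread that runs MLU work must set its own run mode,
// because the backing DeviceInfo fields are thread_local.
void MLUWorker() {
  paddle::lite::DeviceInfo::Global().SetMLURunMode(
      paddle::lite_api::MLUCoreVersion::MLU_270,
      /*core_number=*/4,
      /*use_first_conv=*/false,
      /*mean_vec=*/{},
      /*std_vec=*/{},
      /*input_layout=*/DATALAYOUT(kNHWC));
  // ... build and run the program on this thread ...
}

int main() {
  // A SetMLURunMode call made here would not carry over to the worker.
  std::thread worker(MLUWorker);
  worker.join();
  return 0;
}
```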
diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc
index 727189db77427a01a6f4d4477c053112e99e8103..c69584b2961c9a63b565536d33e36d8278f2c8ad 100644
--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -74,7 +74,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
       const Type* in_arg_ty = kernel->GetInputDeclType("Input");
       const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
       if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
-          DataLayoutCompatible(*out_arg_ty, *cast_type)) {
+          DataLayoutCompatible(*out_arg_ty, *cast_type) &&
+          // for first conv
+          PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
         is_found = true;
       }
     } else if (op_type == "io_copy") {
@@ -121,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
   cast_arg->AsArg().type = cast_type;
   auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
   // for CastAfter manully set the tensor's type
-  var->GetMutable<::paddle::lite::Tensor>();
+  var->GetMutable<paddle::lite::Tensor>();
 
   // create the stmt node
   auto* cast_inst = graph->NewInstructNode();
@@ -215,23 +217,23 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
                 first_conv_nodes_.end(),
                 head_node->AsArg().name) != first_conv_nodes_.end();
 
-  // layout cast node
-  if (head_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
     cur_node = InsertCastBefore(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
         graph,
         cur_node,
         inst_node,
         LiteType::GetTensorTy(
-            head_type->target(), head_type->precision(), inst_type->layout()));
+            head_type->target(), inst_type->precision(), head_type->layout()));
   }
 
-  // precision cast node
-  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
+  // layout cast node
+  if (head_type->layout() != inst_type->layout()) {
     cur_node = InsertCastBefore(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
         graph,
         cur_node,
         inst_node,
@@ -281,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
 
   // get subgraph's valid precision
   const auto& places = graph->valid_places();
-  std::set<::paddle::lite_api::PrecisionType> prec_set;
+  std::set<paddle::lite_api::PrecisionType> prec_set;
   for (const auto& place : places) {
     if (place.target == TARGET(kMLU)) {
       prec_set.insert(place.precision);
@@ -364,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
   const auto name_prefix =
       tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
 
-  // layout cast node
-  if (tail_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (tail_type->precision() != inst_type->precision()) {
     cur_node = InsertCastAfter(
-        "layout",
-        name_prefix + "layout",
+        "cast",
+        name_prefix + "cast",
         graph,
         cur_node,
         inst_node,
         LiteType::GetTensorTy(
-            tail_type->target(), tail_type->precision(), inst_type->layout()));
+            tail_type->target(), inst_type->precision(), tail_type->layout()));
   }
 
-  // precision cast node
-  if (tail_type->precision() != inst_type->precision()) {
+  // layout cast node
+  if (tail_type->layout() != inst_type->layout()) {
     cur_node = InsertCastAfter(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
         graph,
         cur_node,
         inst_node,
@@ -474,13 +476,20 @@ bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
   return false;
 }
 
-void MLUPostprocessPass::GatherFirstConvNodes(SSAGraph* graph) {
+void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
   for (auto& node : graph->mutable_nodes()) {
     if (!node.IsStmt()) continue;
    if (node.AsStmt().op_type() == "feed") {
      for (auto& out : node.outlinks) {
        if (IsFirstConvNode(out)) {
          first_conv_nodes_.insert(out->AsArg().name);
+          // modify first conv nodes' type
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    paddle::lite_api::PrecisionType::kInt8,
+                                    old_type->layout(),
+                                    old_type->device());
        }
      }
    }
@@ -504,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
           out->AsArg().type =
               LiteType::GetTensorTy(old_type->target(),
                                     old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                     old_type->device());
         }
       }
@@ -523,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
           inp->AsArg().type =
               LiteType::GetTensorTy(old_type->target(),
                                     old_type->precision(),
-                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    paddle::lite_api::DataLayoutType::kNHWC,
                                     old_type->device());
         }
       }
@@ -539,10 +548,12 @@ void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
   // arg_in and arg_out are assumed to be NHWC which user should be aware of.
   // Thus here we change these args' layout to NHWC
-  ModifyLayout(graph.get());
+  if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
+    ModifyLayout(graph.get());
+  }
 
   if (lite::DeviceInfo::Global().UseFirstConv()) {
-    GatherFirstConvNodes(graph.get());
+    GatherAndModifyFirstConvNodes(graph.get());
   }
 
   // insert io_copy, layout and precision cast of subgraph's inputs and outputs
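The swapped hunks in `InsertBefore` and `InsertAfter` reorder the casts around an MLU subgraph: the precision cast now runs first, while the tensor still has the head's layout, and the layout cast runs second, with the output side mirroring that order. First-conv heads skip the precision cast entirely, because `GatherAndModifyFirstConvNodes` has already retyped them to `kInt8` for the fused first conv. A sketch of the intermediate types this produces on the input side; the header path is assumed and the helper names are illustrative, not from the patch:

```cpp
#include "lite/core/type_system.h"  // assumed header for Type / LiteType

namespace paddle {
namespace lite {

// Illustrative: after the (now first) precision cast, precision converges
// to the subgraph's while the layout stays with the head.
const Type* AfterPrecisionCast(const Type* head_type, const Type* inst_type) {
  return LiteType::GetTensorTy(
      head_type->target(), inst_type->precision(), head_type->layout());
}

// Illustrative: the subsequent layout cast then converges the layout too.
const Type* AfterLayoutCast(const Type* head_type, const Type* inst_type) {
  return LiteType::GetTensorTy(
      head_type->target(), inst_type->precision(), inst_type->layout());
}

}  // namespace lite
}  // namespace paddle
```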
diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h
index 34b449cca664e28a24a02deb6e214f5e81386767..688dd06fb5fbec0c8e1c53acfe4215456ddb4192 100644
--- a/lite/core/mir/mlu_postprocess_pass.h
+++ b/lite/core/mir/mlu_postprocess_pass.h
@@ -109,7 +109,7 @@ class MLUPostprocessPass : public ProgramPass {
 
   void RecreateOp(Node* inst_node, SSAGraph* graph);
 
-  void GatherFirstConvNodes(SSAGraph* graph);
+  void GatherAndModifyFirstConvNodes(SSAGraph* graph);
 
   bool IsFirstConvNode(Node* arg_node);
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index 2af8274e07713300277f7280f12e6d1fcb47c3c2..fa8fb1597c0fb068a855928dd20057d48ecd5eaf 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -84,7 +84,7 @@ struct FPTypeTraits {
 
 template <>
 struct FPTypeTraits<PRECISION(kFP16)> {
-  typedef ::paddle::lite::fluid::float16 T;
+  typedef paddle::lite::fluid::float16 T;
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc
index 97d8202553ec8f11e35f6d11c9a9c12c580463c6..d4e16734d6d2dae6f5c119194008bce114a2e918 100644
--- a/lite/kernels/mlu/layout_compute.cc
+++ b/lite/kernels/mlu/layout_compute.cc
@@ -48,11 +48,11 @@ REGISTER_LITE_KERNEL(
     def_layout_nhwc2nchw_fp16)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
                                        DATALAYOUT(kNCHW))})
     .Finalize();
 
@@ -82,10 +82,27 @@ REGISTER_LITE_KERNEL(
     def_layout_nchw2nhwc_fp16)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kFloat),
+                                      PRECISION(kFP16),
                                       DATALAYOUT(kNCHW))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kFloat),
+                                       PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    layout,
+    kMLU,
+    kInt8,
+    kNHWC,
+    paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
+    def_layout_nchw2nhwc_fp32_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kInt8),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
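Two fixes land in the kernel registrations: the fp16 layout kernels now declare their tensors as `kFP16` rather than `kFloat`, so type inference can actually select them, and a new int8 NCHW-to-NHWC kernel covers the quantized first-conv input path. A hedged sketch of how such a kernel would be resolved at runtime, assuming the usual `KernelRegistry::Create` interface; the function name and header path below are assumptions, not part of this patch:

```cpp
#include "lite/core/op_registry.h"  // assumed header for KernelRegistry

// Illustrative: once def_layout_nchw2nhwc_fp32_int8 is registered, a
// "layout" kernel becomes resolvable for int8/NHWC on kMLU, which the
// postprocess pass needs for first-conv inputs.
void LookupInt8LayoutKernel() {
  auto kernels = paddle::lite::KernelRegistry::Global().Create(
      "layout", TARGET(kMLU), PRECISION(kInt8), DATALAYOUT(kNHWC));
  CHECK(!kernels.empty()) << "no int8 NCHW->NHWC layout kernel registered";
}
```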
diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h
index 5e87e3526417573f2e0f01280b1d86ccb5691093..edacdf8a98a2ffde6e538f61d4dd8259e3211b22 100644
--- a/lite/kernels/mlu/layout_compute.h
+++ b/lite/kernels/mlu/layout_compute.h
@@ -29,6 +29,24 @@ namespace lite {
 namespace kernels {
 namespace mlu {
 
+template <lite_api::PrecisionType>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<PRECISION(kFloat)> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kFP16)> {
+  typedef paddle::lite::fluid::float16 T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kInt8)> {
+  typedef int8_t T;
+};
+
 template <lite::TargetType Target, typename T>
 inline void LayoutTransCompute(const int dim,
                                const lite::Context<Target>& context,
@@ -63,7 +81,7 @@ class LayoutNchwToNhwcCompute
     auto& param = this->template Param<param_t>();
     auto* x = param.x;
     auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
@@ -88,7 +106,8 @@ class LayoutNchwToNhwcCompute
       CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
     }
 
-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
         x_dims, context, *x, out, axis);
 
     if (x_dims > 2) {
@@ -111,7 +130,7 @@ class LayoutNhwcToNchwCompute
     auto& param = this->template Param<param_t>();
     auto* x = param.x;
     auto* out = param.y;
-    out->template mutable_data<float>();
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
     auto x_dims = param.x->dims().size();
     auto& context = this->ctx_->template As<X86Context>();
 
@@ -136,7 +155,8 @@ class LayoutNhwcToNchwCompute
       CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
     }
 
-    LayoutTransCompute<lite::TargetType::kX86, float>(
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
         x_dims, context, *x, out, axis);
 
     if (x_dims > 2) {
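`FPTypeTraits` maps the kernel's `Precision` template parameter to the element type used by `mutable_data` and the transpose, so the fp16 and int8 layout kernels stop allocating `float` buffers. A small compile-time sketch of what the mapping guarantees (the kFP16 case, which maps to `paddle::lite::fluid::float16`, is omitted to keep the sketch short):

```cpp
#include <cstdint>
#include <type_traits>

#include "lite/kernels/mlu/layout_compute.h"

using paddle::lite::kernels::mlu::FPTypeTraits;

// Each kernel instantiation transposes buffers of the element type that
// matches the precision it was registered with.
static_assert(
    std::is_same<FPTypeTraits<PRECISION(kFloat)>::T, float>::value,
    "kFloat tensors are transposed as float");
static_assert(
    std::is_same<FPTypeTraits<PRECISION(kInt8)>::T, int8_t>::value,
    "kInt8 tensors are transposed as int8_t");
```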