Unverified commit cc927184, authored by M MaxwellDing, committed by GitHub

[MLU] add cast on MLU as default, test=develop (#3776)

Parent 11cbd50e
......@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......
......@@ -13,6 +13,9 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
......@@ -22,6 +25,10 @@
#include "lite/backends/cuda/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
namespace paddle {
namespace lite_api {
......@@ -97,6 +104,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
......@@ -117,6 +131,13 @@ void Tensor::CopyToCpu(T *data) const {
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif
} else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
......@@ -138,6 +159,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
......@@ -228,13 +254,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
const std::vector<float> &std) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
......@@ -242,12 +264,9 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
std::pair<std::vector<float>, std::vector<float>>
CxxConfig::mlu_firstconv_param() const {
return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
#endif
......
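With the MLU branches above in place, host-to-device and device-to-host copies go through lite::TargetWrapperMlu::MemcpySync, and the explicit template instantiations expose TargetType::kMLU through Tensor::CopyFromCpu. A minimal usage sketch, assuming a predictor whose input tensor is placed on kMLU (the predictor, shape and buffer names below are illustrative, not part of this patch):

// Hedged sketch: feed an MLU input tensor and read the result back to host.
#include <cstdint>
#include <memory>
#include <vector>
#include "lite/api/paddle_api.h"

void FeedAndFetch(
    const std::shared_ptr<paddle::lite_api::PaddlePredictor>& predictor,
    const std::vector<float>& host_in,
    std::vector<float>* host_out) {
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});  // example shape
  // HtoD copy is handled by TargetWrapperMlu::MemcpySync inside CopyFromCpu
  input->CopyFromCpu<float, paddle::lite_api::TargetType::kMLU>(host_in.data());

  predictor->Run();

  auto output = predictor->GetOutput(0);
  int64_t numel = 1;
  for (auto d : output->shape()) numel *= d;
  host_out->resize(numel);
  output->CopyToCpu(host_out->data());  // DtoH copy
}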
......@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle_place.h" // NOLINT
......@@ -160,9 +161,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_std_{};
#endif
public:
......@@ -210,24 +210,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
// set the 3-dimensional mean vector and 3-dimensional std vector used by
// MLU's first conv
void set_mlu_firstconv_param(const std::vector<float>& mean,
const std::vector<float>& std);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
// std::pair<mean, std>
std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
......
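The three first-conv setters above are folded into a single call that takes the mean and std vectors together, which keeps the two vectors in sync; as the backend change below shows, leaving both vectors empty disables first conv. A minimal configuration sketch, assuming a quantized model with a 3-channel input (the model path, place list and numeric values are placeholders, not part of this patch):

// Hedged sketch: configuring the MLU target through CxxConfig.
#include <memory>
#include "lite/api/paddle_api.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildMluPredictor() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./model_dir");  // placeholder path
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)},
       paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // non-empty mean/std enable the MLU first-conv kernel; pass empty vectors to skip it
  config.set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                 {0.229f, 0.224f, 0.225f});
  return paddle::lite_api::CreatePaddlePredictor(config);
}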
......@@ -15,6 +15,7 @@
#include "lite/backends/mlu/target_wrapper.h"
#include <memory>
#include <utility>
#include "lite/backends/mlu/mlu_utils.h"
......@@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
} // namespace mlu
thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270};
thread_local int TargetWrapperMlu::mlu_core_number_{1};
thread_local bool TargetWrapperMlu::use_first_conv_{false};
thread_local std::vector<float> TargetWrapperMlu::mean_vec_;
thread_local std::vector<float> TargetWrapperMlu::std_vec_;
thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)};
size_t TargetWrapperMlu::num_devices() {
uint32_t dev_count = 0;
CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
......@@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst,
LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
}
}
void TargetWrapperMlu::SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
mean_vec_ = firstconv_param.first;
std_vec_ = firstconv_param.second;
use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty());
input_layout_ = input_layout;
}
cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() {
return mlu_core_version_;
}
int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; }
bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& TargetWrapperMlu::MeanVec() { return mean_vec_; }
const std::vector<float>& TargetWrapperMlu::StdVec() { return std_vec_; }
// void TargetWrapperMlu::MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const stream_t& stream) {
// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
// MemcpySync(dst, src, size, dir);
// }
DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; }
} // namespace lite
} // namespace paddle
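Note that the first-conv switch is no longer set directly: SetMLURunMode derives use_first_conv_ from the pair, so it turns on only when both the mean and the std vector are non-empty. A short sketch of that behaviour (DATALAYOUT comes from paddle_place.h; the numeric values are placeholders):

// Hedged sketch: an empty mean/std pair disables first conv.
void FirstConvToggleSketch() {
  using paddle::lite::TargetWrapperMlu;
  namespace api = paddle::lite_api;

  TargetWrapperMlu::SetMLURunMode(api::MLUCoreVersion::MLU_270,
                                  /*core_number=*/4,
                                  DATALAYOUT(kNHWC),
                                  {{0.485f, 0.456f, 0.406f},
                                   {0.229f, 0.224f, 0.225f}});
  CHECK(TargetWrapperMlu::UseFirstConv());  // both vectors non-empty

  TargetWrapperMlu::SetMLURunMode(api::MLUCoreVersion::MLU_270,
                                  /*core_number=*/4,
                                  DATALAYOUT(kNHWC),
                                  {{}, {}});  // empty pair: first conv stays off
  CHECK(!TargetWrapperMlu::UseFirstConv());
}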
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/target_wrapper.h"
......@@ -43,11 +45,25 @@ class TargetWrapper<TARGET(kMLU)> {
const void* src,
size_t size,
IoDirection dir);
// static void MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const queue_t& queue);
static void SetMLURunMode(
lite_api::MLUCoreVersion core_version,
int core_number,
DataLayoutType input_layout,
std::pair<std::vector<float>, std::vector<float>> firstconv_param);
static cnmlCoreVersion_t MLUCoreVersion();
static int MLUCoreNumber();
static bool UseFirstConv();
static const std::vector<float>& MeanVec();
static const std::vector<float>& StdVec();
static DataLayoutType InputLayout();
private:
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
};
} // namespace lite
......
......@@ -27,5 +27,11 @@ thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif
#ifdef LITE_WITH_MLU
int Context<TargetType::kMLU>::next_queue_id_{0};
std::map<int, int> Context<TargetType::kMLU>::queue_id_map_;
std::mutex Context<TargetType::kMLU>::map_mutex_;
#endif
} // namespace lite
} // namespace paddle
......@@ -25,6 +25,7 @@
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include <mutex> // NOLINT
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
......@@ -249,11 +250,11 @@ class Context<TargetType::kMLU> {
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
this->Init(ctx.device_id_, ctx.exec_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
void Init(int dev_id, int exec_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
......@@ -264,21 +265,19 @@ class Context<TargetType::kMLU> {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
// get queue id from map
std::unique_lock<std::mutex> lk(map_mutex_);
if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) {
queue_id_map_[exec_queue_id] =
next_queue_id_++ % devs[dev_id].max_queue();
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = queue_id_map_[exec_queue_id];
VLOG(4) << "pick mlu queue id: " << exec_queue_id_;
lk.unlock();
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
io_queue_ = devs[dev_id].io_queues()[exec_queue_id_];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_];
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
......@@ -290,10 +289,12 @@ class Context<TargetType::kMLU> {
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
return paddle::lite::TargetWrapperMlu::MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
int MLUCoreNumber() {
return paddle::lite::TargetWrapperMlu::MLUCoreNumber();
}
u32_t affinity() { return affinity_; }
......@@ -304,10 +305,12 @@ class Context<TargetType::kMLU> {
std::string name() const { return "MLUContext"; }
private:
static int next_queue_id_;
static std::map<int, int> queue_id_map_;
static std::mutex map_mutex_;
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
......@@ -455,7 +458,7 @@ class ContextScheduler {
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
context.Init(dev_id, exec_stream_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
......
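The per-context io/exec queue pair is gone: runtime_context_assign_pass now hands each MLU subgraph an arbitrary integer derived from the graph pointer, and MLUContext::Init folds it through a process-wide map into a valid queue slot, round-robin over devs[dev_id].max_queue(), so repeated lookups for the same subgraph stay stable while distinct subgraphs spread across queues. A standalone sketch of that mapping, with max_queue standing in for devs[dev_id].max_queue():

// Hedged sketch: stable round-robin assignment of arbitrary ids to queue slots.
#include <map>
#include <mutex>

class QueueIdAllocator {
 public:
  explicit QueueIdAllocator(int max_queue) : max_queue_(max_queue) {}

  // The same external id always maps to the same slot;
  // previously unseen ids rotate over [0, max_queue).
  int Pick(int external_id) {
    std::lock_guard<std::mutex> lk(mutex_);
    auto it = id_map_.find(external_id);
    if (it == id_map_.end()) {
      it = id_map_.emplace(external_id, next_id_++ % max_queue_).first;
    }
    return it->second;
  }

 private:
  int max_queue_;
  int next_id_{0};
  std::map<int, int> id_map_;
  std::mutex mutex_;
};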
......@@ -66,15 +66,6 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
......@@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() {
return 0;
}
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
......
......@@ -55,20 +55,6 @@ class DeviceInfo {
int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
......@@ -120,15 +106,6 @@ class DeviceInfo {
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...);
......
......@@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass,
paddle::lite::mir::ConvActivationFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("conv2d");
......@@ -24,8 +24,13 @@ namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_X86
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
#else
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
#endif
fusion::FcFuser fuser2(false);
......@@ -38,6 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU), TARGET(kX86)})
.ExcludeTargets({TARGET(kXPU)})
#ifndef LITE_WITH_MLU
.ExcludeTargets({TARGET(kX86)})
#endif
.ExcludeTargets({TARGET(kBM)})
.BindKernel("fc");
......@@ -314,4 +314,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
TARGET(kAPU),
TARGET(kMLU)});
......@@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass {
const Type** arg_type,
SSAGraph* graph);
void ModifyInputOutputDataType(SSAGraph* graph);
void ModifyLayout(SSAGraph* graph);
bool NeedInsert(Node* node, const Type* inst_type);
......@@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass {
void InsertBefore(SSAGraph* graph,
Node* head_node,
Node* inst_node,
const Type* type);
const Type* type,
bool use_mlu_cast);
void InsertAfter(SSAGraph* graph,
Node* tail_node,
Node* inst_node,
const Type* type);
const Type* type,
bool use_mlu_cast);
Node* InsertCastBefore(const std::string& op_type,
const std::string& cast_arg_name,
......@@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass {
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
void AdjustSubgraph(Node* subgraph_node, const Type* op_type);
private:
std::set<std::string> first_conv_nodes_;
};
......
......@@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass {
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target()));
}
#elif LITE_WITH_MLU
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target(),
static_cast<int>(reinterpret_cast<int64_t>(graph.get()))));
#else
int stream_id = inst.stream_id_;
......
......@@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply(
REGISTER_MIR_PASS(type_layout_cast_pass,
paddle::lite::mir::TypeLayoutTransformPass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("layout_once")
.BindKernel("layout");
REGISTER_MIR_PASS(type_layout_cast_preprocess_pass,
paddle::lite::mir::OpenCLTypeLayoutTransformPass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kMLU)})
.BindKernel("layout_once")
.BindKernel("layout");
......@@ -108,9 +108,13 @@ class Optimizer {
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
"variable_place_inference_pass", // inference arg/var's
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
......@@ -140,13 +144,9 @@ class Optimizer {
"variable_place_inference_pass", //
"argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}};
if (passes.size() == 1) {
......
......@@ -4,6 +4,7 @@ endif()
add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
# depend on transpose function in backend/x86/math/math_function
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function})
......@@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU)
endif()
lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu)
lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
......@@ -18,6 +18,8 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d
lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_mlu
......@@ -32,6 +34,8 @@ set(mlu_subgraph_bridges
subgraph_bridge_scale_op_mlu
subgraph_bridge_interp_op_mlu
subgraph_bridge_concat_op_mlu
subgraph_bridge_cast_op_mlu
subgraph_bridge_layout_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges")
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
......@@ -45,4 +49,6 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe
lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
......@@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor()));
}
graph->FuseOp(activation_op);
CNML_CALL(cnmlDestroyBaseOp(&activation_op));
return SUCCESS;
}
......@@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu6,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kMLU,
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
......@@ -116,7 +118,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
opdesc.SetAttr("offset", 0.5f);
}
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
act_ref(op);
......@@ -134,7 +136,8 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
std::vector<std::string> types{
"sigmoid", "relu", "relu6", "tanh", "leaky_relu"};
for (auto x_shape : shapes) {
for (auto op_type : types) {
test_act(x_shape, op_type);
......@@ -149,5 +152,6 @@ TEST(MLUBridges, activation) {
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(relu6, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
......@@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
auto mean_dims = mean->dims().Vectorize();
if (mean_dims.size() < 4) {
mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1);
}
auto mean_tensor = graph->AddNode(
mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType());
mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType());
auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
auto variance_dims = variance->dims().Vectorize();
if (variance_dims.size() < 4) {
variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1);
}
auto variance_tensor = graph->AddNode(
variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType());
variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType());
auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
int co = static_cast<int>(mean_dims[0]);
int co = static_cast<int>(mean_dims[3]);
std::vector<float> variance_trans(co);
std::vector<float> mean_trans(co);
for (int i = 0; i < co; ++i) {
variance->mutable_data<float>()[i] =
variance_trans[i] =
scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
mean->mutable_data<float>()[i] =
mean->data<float>()[i] -
bias->data<float>()[i] / variance->data<float>()[i];
mean_trans[i] =
mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
}
auto input_tensor = graph->GetNode(x_var_name);
......@@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
mean_tensor->mlu_tensor(),
variance_tensor->mlu_tensor()));
graph->BindConstData(variance_var_name, variance);
graph->BindConstData(mean_var_name, mean);
graph->BindConstRawData(
variance_var_name, variance_trans.data(), variance_trans.size(), true);
graph->BindConstRawData(
mean_var_name, mean_trans.data(), mean_trans.size(), true);
graph->FuseOp(bn_op);
CNML_CALL(cnmlDestroyBaseOp(&bn_op));
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto in_dtype = op_info->GetAttr<int>("in_dtype");
auto out_dtype = op_info->GetAttr<int>("out_dtype");
CHECK(graph->HasNode(x_var_name));
auto x_tensor = graph->GetNode(x_var_name);
cnmlDataType_t out_type;
cnmlCastType_t cast_type;
if (in_dtype == 4 && out_dtype == 5) {
cast_type = CNML_CAST_FLOAT16_TO_FLOAT32;
out_type = CNML_DATA_FLOAT32;
} else if (in_dtype == 5 && out_dtype == 4) {
cast_type = CNML_CAST_FLOAT32_TO_FLOAT16;
out_type = CNML_DATA_FLOAT16;
} else {
CHECK(0) << "Unsupported cast type";
}
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type);
cnmlBaseOp_t cast_op;
CNML_CALL(cnmlCreateCastOp(&cast_op,
cast_type,
x_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(cast_op);
CNML_CALL(cnmlDestroyBaseOp(&cast_op));
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(cast,
kMLU,
paddle::lite::subgraph::mlu::CastConverter);
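The cast bridge reads Paddle's VarType codes from the in_dtype/out_dtype attributes, where 4 is FP16 and 5 is FP32, and only maps the two float conversions onto CNML cast types; every other pair aborts. A compact restatement of that mapping (a sketch that assumes cnml.h for the enum types, not the converter itself):

// Hedged sketch: dtype pairs the MLU cast bridge accepts (4 == FP16, 5 == FP32).
struct CastChoice {
  cnmlCastType_t cast_type;
  cnmlDataType_t out_type;
};

inline CastChoice PickCast(int in_dtype, int out_dtype) {
  if (in_dtype == 4 && out_dtype == 5) {
    return {CNML_CAST_FLOAT16_TO_FLOAT32, CNML_DATA_FLOAT32};
  }
  if (in_dtype == 5 && out_dtype == 4) {
    return {CNML_CAST_FLOAT32_TO_FLOAT16, CNML_DATA_FLOAT16};
  }
  CHECK(0) << "Unsupported cast type: " << in_dtype << " -> " << out_dtype;
  return {};
}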
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/cast_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void test_cast_FP16_to_FP32(std::vector<int64_t> shape) {
// prepare input&output variables
std::string x_var_name = "x";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(shape));
auto* x_data = x->mutable_data<paddle::lite::fluid::float16>();
// initialize input&output data
for (int i = 0; i < x->dims().production(); i++) {
x_data[i] = static_cast<paddle::lite::fluid::float16>(i);
}
// initialize op desc
int in_dtype = 4, out_dtype = 5;
cpp::OpDesc opdesc;
opdesc.SetType("cast");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("in_dtype", in_dtype);
opdesc.SetAttr("out_dtype", out_dtype);
auto op = CreateOp<operators::CastOp>(opdesc, &scope);
Tensor data;
data.Resize(DDim(shape));
auto* copy_data = data.mutable_data<paddle::lite::fluid::float16>();
data.CopyDataFrom(*x);
x->set_precision(paddle::lite_api::PrecisionType::kFP16);
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], static_cast<double>(copy_data[i]), 5e-4);
}
}
void test_cast_FP32_to_FP16(std::vector<int64_t> shape) {
// prepare input&output variables
std::string x_var_name = "x";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(shape));
auto* x_data = x->mutable_data<float>();
// initialize input&output data
for (int i = 0; i < x->dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
// initialize op desc
int in_dtype = 5, out_dtype = 4;
cpp::OpDesc opdesc;
opdesc.SetType("cast");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("in_dtype", in_dtype);
opdesc.SetAttr("out_dtype", out_dtype);
auto op = CreateOp<operators::CastOp>(opdesc, &scope);
Tensor data;
data.Resize(DDim(shape));
auto* copy_data = data.mutable_data<float>();
data.CopyDataFrom(*x);
x->set_precision(paddle::lite_api::PrecisionType::kFloat);
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<paddle::lite::fluid::float16>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(static_cast<double>(out_data[i]), copy_data[i], 5e-4);
}
}
TEST(MLUBridges, cast) {
test_cast_FP16_to_FP32({2, 3, 4, 5});
test_cast_FP16_to_FP32({6, 3, 2, 5});
test_cast_FP32_to_FP16({2, 3, 4, 5});
test_cast_FP32_to_FP16({6, 3, 2, 5});
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(cast, kMLU);
......@@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto dims = output_dims.size();
int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
CHECK_LE(axis, 4) << "Unsupport dims in mlu concat";
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
CHECK_LT(axis, dims) << "Unsupport dims in mlu concat";
// value of nhwc2nchw_axis is index of nhwc
// order of nhwc2nchw_axis is nchw
int nhwc_axis = GetAxisNHWC2NCHW<int>(dims)[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
......@@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
&outputs,
1));
graph->FuseOp(concat_op);
CNML_CALL(cnmlDestroyBaseOp(&concat_op));
return SUCCESS;
}
......
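Since MLU kernels compute in NHWC while the model describes the concat axis in NCHW, the axis is remapped before the op is built; for 4-D tensors this is the {0, 3, 1, 2} table the removed lines spelled out, which GetAxisNHWC2NCHW generalizes to other ranks. A tiny worked example covering only the 4-D case:

// Hedged sketch: remap an NCHW concat axis to its NHWC position (4-D only).
inline int NchwAxisToNhwc4D(int nchw_axis) {
  // index: NCHW axis, value: where that axis lives in NHWC
  static const int kMap[4] = {0, 3, 1, 2};
  return kMap[nchw_axis];
}
// Concatenating along channels: axis 1 (C) in NCHW is axis 3 in NHWC;
// axis 2 (H) maps to 1 and axis 3 (W) maps to 2.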
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <algorithm>
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto* op_info = op->op_info();
const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
CHECK(!op_info->HasAttr("act_type"));
// get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front();
......@@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto output_shape = output->dims().Vectorize();
const auto bs = input_dims[0];
const auto oc = filter_dims[0];
const auto groups = op_info->GetAttr<int>("groups");
CHECK_EQ(input_dims.size(), 4u);
CHECK_EQ(filter_dims.size(), 4u);
CHECK(!(op_info->HasAttr("fuse_relu") &&
(op_info->GetAttr<bool>("fuse_relu") == true)))
<< "UnSupported param fuse_relu is true!";
const auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
......@@ -70,13 +78,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
padding_algorithm,
input_dims,
filter_dims);
bool is_group_mode = groups > 1;
bool is_depthwise_mode = false;
if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw}
is_depthwise_mode = true;
is_group_mode = false;
}
auto input_tensor = graph->GetNode(input_var_name);
const auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
std::vector<int64_t> cnml_filter_shape = {
filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
if (is_depthwise_mode) {
/* Paddle's depthwise filter shape is {oc, ic / groups == 1, kh, kw}, while
   CNML's depthwise conv expects {oc / groups == 1, ic, kh, kw},
   so the filter shape is swapped accordingly.
*/
cnml_filter_shape = {
filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]};
}
// Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name,
filter_dims.Vectorize(),
cnml_filter_shape,
CNML_FILTER,
CNML_NCHW,
graph->FPType());
......@@ -89,15 +116,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dequant(filter_dequant.data(),
filter->mutable_data<int8_t>(),
1,
filter_dims[0],
filter_dims[1] * filter_dims[2] * filter_dims[3],
cnml_filter_shape[0],
cnml_filter_shape[1] * cnml_filter_shape[2] * cnml_filter_shape[3],
weight_scale);
transpose(filter_dequant.data(),
filter->mutable_data<float>(),
{static_cast<int>(filter_dims[0]),
static_cast<int>(filter_dims[1]),
static_cast<int>(filter_dims[2]),
static_cast<int>(filter_dims[3])},
{static_cast<int>(cnml_filter_shape[0]),
static_cast<int>(cnml_filter_shape[1]),
static_cast<int>(cnml_filter_shape[2]),
static_cast<int>(cnml_filter_shape[3])},
{0, 2, 3, 1});
filter->set_precision(PrecisionType::kFloat);
} else if (filter->precision() != PrecisionType::kFloat) {
......@@ -116,7 +143,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
bias_shape = {1, 1, 1, oc};
} else if (bias_data_size == output_data_size / bs) {
LOG(FATAL) << "Unsupported ... ...";
// 1: {1, oc, oh, ow}
......@@ -130,18 +157,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
}
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
bias_tensor = graph->AddNode(
bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
const auto input_scale = op_info->GetAttr<float>("input_scale");
bool use_first_conv = false;
if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) {
use_first_conv = true;
}
......@@ -158,38 +182,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
paddings[0],
paddings[0]));
const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
std::vector<int64_t>{3},
std::vector<int64_t>{1, 1, 1, 3},
CNML_CONST,
CNML_CNHW,
CNML_NHWC,
graph->FPType());
const auto std_tensor = graph->AddNode("first_conv_std_tensor",
std::vector<int64_t>{3},
std::vector<int64_t>{1, 1, 1, 3},
CNML_CONST,
CNML_CNHW,
CNML_NHWC,
graph->FPType());
graph->BindConstRawData("first_conv_mean_tensor",
lite::DeviceInfo::Global().MeanVec().data(),
lite::TargetWrapperMlu::MeanVec().data(),
3,
false);
graph->BindConstRawData("first_conv_std_tensor",
lite::DeviceInfo::Global().StdVec().data(),
lite::TargetWrapperMlu::StdVec().data(),
3,
false);
graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
input_tensor->set_mlu_dtype(CNML_DATA_UINT8);
CNML_CALL(cnmlCreateConvFirstOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
input_tensor->mlu_tensor(),
mean_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
std_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
} else if (is_depthwise_mode) {
cnmlConvDepthwiseOpParam_t conv_depthwise_param;
cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param,
strides[0],
strides[1],
paddings[0] * 2,
paddings[2] * 2);
CNML_CALL(cnmlCreateConvDepthwiseOpForward(
&conv_op,
conv_depthwise_param,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param));
} else if (is_group_mode) {
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
CNML_CALL(cnmlCreateConvGroupOpForward(
&conv_op,
conv_param,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
groups));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
} else {
cnmlConvOpParam_t conv_param;
VLOG(5) << "conv param (" << input_var_name << ")"
<< "stride: " << strides[0] << ',' << strides[1] << '\t'
<< "dilations: " << dilations[0] << ',' << dilations[1] << '\t'
<< "paddings: " << paddings[0] << ',' << paddings[2] << std::endl;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
......@@ -200,19 +261,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CNML_CALL(cnmlCreateConvOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
}
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
graph->SetComputingDataType(
conv_op,
filter_tensor->mlu_tensor(),
1 / *min_element(weight_scale.begin(), weight_scale.end()));
if (!is_depthwise_mode) {
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
graph->SetComputingDataType(
conv_op,
filter_tensor->mlu_tensor(),
1 / *max_element(weight_scale.begin(), weight_scale.end()));
}
CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC));
if (HasInputArg(op_info, scope, "Bias")) {
auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
......@@ -220,6 +283,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyBaseOp(&conv_op));
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
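The conv bridge now dispatches between three CNML conv flavours: depthwise when the filter carries groups output channels with a single input channel each and unit dilation, group convolution for any other groups > 1, and the plain conv otherwise. A minimal sketch of that dispatch predicate, restating the checks above on plain vectors:

// Hedged sketch: how the conv bridge classifies the convolution it lowers.
#include <cstdint>
#include <vector>

enum class MluConvKind { kDepthwise, kGroup, kPlain };

inline MluConvKind ClassifyConv(
    const std::vector<int64_t>& filter_dims,  // {oc, ic / groups, kh, kw}
    const std::vector<int>& dilations,
    int groups) {
  const bool depthwise = filter_dims[0] == groups && filter_dims[1] == 1 &&
                         dilations[0] == 1 && dilations[1] == 1;
  if (depthwise) return MluConvKind::kDepthwise;  // cnmlCreateConvDepthwiseOpForward
  if (groups > 1) return MluConvKind::kGroup;     // cnmlCreateConvGroupOpForward
  return MluConvKind::kPlain;                     // cnmlCreateConvOpForward
}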
......@@ -13,8 +13,11 @@
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
......@@ -331,6 +334,10 @@ TEST(MLUBridges, conv) {
#endif
}
TEST(MLUBridges, depthwise_conv2d) {
test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
......
......@@ -23,7 +23,7 @@ namespace mlu {
std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
auto x_dims = x.dims();
CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
// CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
auto y_dims = y->dims();
CHECK_GE(x_dims.size(), y_dims.size());
......@@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
graph->FuseOp(elementwise_op);
CNML_CALL(cnmlDestroyBaseOp(&elementwise_op));
cnmlBaseOp_t act_op;
if (op_type == "fusion_elementwise_add_activation") {
auto mid_tensor = graph->GetNode(out_var_name + "_mid");
......@@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
mid_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(act_op);
CNML_CALL(cnmlDestroyBaseOp(&act_op));
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector<int64_t>& x_shape,
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
......
......@@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto w_var_name = op_info->Input("W").front();
auto output_var_name = op_info->Output("Out").front();
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
CHECK(!op_info->HasAttr("activation_type"));
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
......@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK_EQ(w_dims.size(), 2UL);
// Create w node
std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
std::vector<int64_t> cnml_w_shape;
if (x_dims.size() == 4) {
if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
cnml_w_shape = {
static_cast<int>(w_dims[1]),
static_cast<int>(x_dims[1]), // input_c
static_cast<int>(x_dims[2]), // input_h
static_cast<int>(x_dims[3]), // input_w
};
} else {
LOG(FATAL)
<< "in fc op, we expect input_h * input_w * input_c == filter_c"
<< " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2]
<< " input_w = " << x_dims[3] << " filter_c = " << w_dims[0]
<< std::endl;
}
} else {
cnml_w_shape = {w_dims[1], w_dims[0]};
}
auto w_tensor = graph->AddNode(
w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
auto input_scale = op_info->GetAttr<float>("input_scale");
......@@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (HasInputArg(op_info, scope, "Bias")) {
bias_var_name = op_info->Input("Bias").front();
auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
auto bias_dims = bias->dims().Vectorize();
CHECK(!graph->HasNode(bias_var_name));
if (bias_dims.size() < 4u) {
bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1);
}
// CHECK_EQ(bias_dims.production(), n);
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
bias_tensor = graph->AddNode(
bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
cnmlBaseOp_t fc_op;
......@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (w->precision() == PrecisionType::kUnk ||
w->precision() == PrecisionType::kInt8) {
std::vector<float> w_dequant(w->data_size());
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
w_dims[1],
w_dims[0],
weight_scale);
for (int i = 0; i < w_dims[1]; i++) {
for (int j = 0; j < w_dims[0]; j++) {
w->mutable_data<float>()[i * w_dims[0] + j] =
w_dequant[i + j * w_dims[1]];
}
if (cnml_w_shape.size() == 2) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1],
weight_scale);
transpose2d(w_dequant.data(),
w->mutable_data<float>(),
{static_cast<int>(cnml_w_shape[0]),
static_cast<int>(cnml_w_shape[1])});
} else if (cnml_w_shape.size() == 4) {
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
cnml_w_shape[0],
cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
weight_scale);
int c_o_num = cnml_w_shape[0];
int c_i_num = cnml_w_shape[1];
int h_i_num = cnml_w_shape[2];
int w_i_num = cnml_w_shape[3];
// chw == ci * hi * wi == w_dim[0]
// first trans [chw, co] -> [co,chw]
std::vector<float> first_trans_output(w_dequant.size());
int chw = c_i_num * h_i_num * w_i_num;
transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
// second trans [co,ci,hi,wi] -> [co,hi,wi,ci]
transpose(first_trans_output.data(),
w->mutable_data<float>(),
{c_o_num, c_i_num, h_i_num, w_i_num},
{0, 2, 3, 1});
} else {
LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
<< cnml_w_shape.size() << std::endl;
}
w->set_precision(PrecisionType::kFloat);
} else if (w->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
......@@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->SetComputingDataType(
fc_op,
w_tensor->mlu_tensor(),
1 / *min_element(weight_scale.begin(), weight_scale.end()));
1 / *max_element(weight_scale.begin(), weight_scale.end()));
graph->FuseOp(fc_op);
CNML_CALL(cnmlDestroyBaseOp(&fc_op));
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
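For a 4-D input, the fc weight that Paddle stores as a {ci*hi*wi, co} matrix has to reach CNML as a {co, hi, wi, ci} filter; the code above does this in two steps, a 2-D transpose to {co, ci*hi*wi} followed by an NCHW-to-NHWC permutation of the implied {co, ci, hi, wi} view. A standalone sketch of the same permutation on a plain float buffer (the helper name is illustrative, not the transpose in utility.h):

// Hedged sketch: permute fc weights {ci*hi*wi, co} -> {co, hi, wi, ci}.
#include <vector>

void FcWeightToCnmlLayout(const float* src, float* dst,
                          int co, int ci, int hi, int wi) {
  const int chw = ci * hi * wi;
  // step 1: [chw, co] -> [co, chw]
  std::vector<float> co_chw(static_cast<size_t>(co) * chw);
  for (int r = 0; r < chw; ++r)
    for (int c = 0; c < co; ++c)
      co_chw[c * chw + r] = src[r * co + c];
  // step 2: view [co, chw] as [co, ci, hi, wi] and permute to [co, hi, wi, ci]
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          dst[((o * hi + h) * wi + w) * ci + c] =
              co_chw[((o * ci + c) * hi + h) * wi + w];
}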
......@@ -175,9 +175,9 @@ void test_fc(const std::vector<int64_t>& input_shape,
TEST(MLUBridges, fc) {
for (bool use_bias : {true, false}) {
// test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
// test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
// test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias);
}
}
......
......@@ -27,10 +27,14 @@ std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t data_order,
void* raw_ptr) {
CHECK(!HasNode(name));
VLOG(5) << "add mlu node: " << name << "\t data type "
<< static_cast<int>(mlu_dtype) << "\t data order "
<< static_cast<int>(data_order);
auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order));
node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node));
return node;
......
......@@ -15,13 +15,15 @@
#pragma once
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/utils/env.h"
#define PRINT_HW_TIME false
......@@ -45,32 +47,30 @@ class Graph {
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op));
}
#if PRINT_HW_TIME
CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
double total_time = 0;
for (auto& f : time_log_) {
total_time += f;
if (!time_log_.empty()) {
for (auto& f : time_log_) {
total_time += f;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
#endif
}
// Data node
std::shared_ptr<MLUTensor> AddNode(
const std::string& name,
std::vector<int64_t> shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataOrder_t shape_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
cnmlDataOrder_t data_order = CNML_NHWC,
void* raw_ptr = nullptr);
std::shared_ptr<MLUTensor> GetNode(const std::string& name) {
......@@ -82,9 +82,16 @@ class Graph {
return nodes_.find(name) != nodes_.end();
}
void AddInput(std::shared_ptr<MLUTensor> tensor) {
void AddInput(std::shared_ptr<MLUTensor> tensor,
bool disable_batch_size_changeable = true) {
inputs_.push_back(tensor->mlu_tensor());
input_tensors_.push_back(tensor);
if (!disable_batch_size_changeable) {
constexpr int input_dimNb = 4;
bool input_dim_mutable[4] = {true, false, false, false};
CNML_CALL(cnmlSetTensorDimMutable(
tensor->mlu_tensor(), input_dim_mutable, input_dimNb));
}
}
void AddOutput(std::shared_ptr<MLUTensor> tensor) {
......@@ -92,6 +99,22 @@ class Graph {
output_tensors_.push_back(tensor);
}
std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
return &input_tensors_;
}
std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
return &output_tensors_;
}
void GenOfflineModel(const std::string& name) {
cnmlModel_t model;
const std::string& symbol = "subnet0";
const auto& filename = name + ".offline.cambricon";
CNML_CALL(cnmlCreateModel(&model, filename.c_str()));
CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str()));
CNML_CALL(cnmlSaveModel(model, filename.c_str()));
CNML_CALL(cnmlDestroyModel(model));
}
void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
void Compile(cnmlCoreVersion_t core_version, int core_number) {
......@@ -103,18 +126,37 @@ class Graph {
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
for (auto in : input_tensors_) {
input_addrs_.push_back(in->mlu_data());
}
for (auto out : output_tensors_) {
output_addrs_.push_back(out->mlu_data());
}
}
#define MEASURE_HWTIME_START(que) \
do { \
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \
} while (0)
#define MEASURE_HWTIME_END(que) \
do { \
thread_local float hw_time; \
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \
CNRT_CALL(cnrtSyncQueue(que)); \
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \
hw_time /= 1000.0f; \
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \
std::lock_guard<std::mutex> lk(time_mut_); \
time_log_.push_back(hw_time); \
} while (0)
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
input_addrs_.resize(input_tensors_.size());
output_addrs_.resize(output_tensors_.size());
for (size_t i = 0; i < input_addrs_.size(); ++i) {
input_addrs_[i] = input_tensors_[i]->mlu_data();
}
for (size_t i = 0; i < output_addrs_.size(); ++i) {
output_addrs_[i] = output_tensors_[i]->mlu_data();
}
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
MEASURE_HWTIME_START(que);
#endif
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(),
......@@ -124,18 +166,46 @@ class Graph {
&forward_param,
que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
MEASURE_HWTIME_END(que);
#endif
}
CNRT_CALL(cnrtSyncQueue(que));
void Compute(cnrtQueue_t que,
const std::vector<std::shared_ptr<MLUTensor>>& in,
const std::vector<std::shared_ptr<MLUTensor>>& out) {
std::vector<cnmlTensor_t> in_tensor;
std::vector<cnmlTensor_t> out_tensor;
input_addrs_.resize(in.size());
output_addrs_.resize(out.size());
for (size_t i = 0; i < input_addrs_.size(); ++i) {
input_addrs_[i] = in[i]->mlu_data();
in_tensor.push_back(in[i]->mlu_tensor());
}
for (size_t i = 0; i < output_addrs_.size(); ++i) {
output_addrs_[i] = out[i]->mlu_data();
out_tensor.push_back(out[i]->mlu_tensor());
}
#if PRINT_HW_TIME
MEASURE_HWTIME_START(que);
#endif
/* Because cnmlSetTensorDimMutable is used, call cnmlComputeFusionOpForward_V4
 * instead of cnmlComputeFusionOpForward_V3 */
CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_,
&in_tensor[0],
input_addrs_.data(),
input_addrs_.size(),
&out_tensor[0],
output_addrs_.data(),
output_addrs_.size(),
que,
NULL));
#if PRINT_HW_TIME
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
MEASURE_HWTIME_END(que);
#endif
}
#undef MEASURE_HWTIME_START
#undef MEASURE_HWTIME_END
template <typename T>
void* RegisterConstData(size_t len) {
......@@ -165,7 +235,7 @@ class Graph {
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
......@@ -180,7 +250,7 @@ class Graph {
}
}
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
const float* data = tensor->data<float>();
size_t len = tensor->data_size();
if (fp_type_ == CNML_DATA_FLOAT32) {
......@@ -189,10 +259,14 @@ class Graph {
const_cast<void*>(static_cast<const void*>(data)),
false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
for (size_t i = 0; i < len; ++i) {
data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
}
void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
data_fp16,
CNRT_FLOAT16,
len,
nullptr));
CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
static_cast<void*>(data_fp16),
false));
......@@ -206,19 +280,23 @@ class Graph {
float scale,
cnmlDataType_t data_type = CNML_DATA_INT8) {
cnmlQuantizedParam_t quant_param;
CNML_CALL(
cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0));
int pos = scale2position(scale);
auto cnml_scale = pow(2, pos) * scale;
VLOG(5) << "[cnml quantized param] pos: " << pos
<< "\tscale: " << cnml_scale << std::endl;
CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0));
CNML_CALL(
cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param));
CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
}
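As a side note on the position/scale split above: scale2position returns floor(-log2(scale)), and the multiplier passed to cnmlCreateQuantizedParam is 2^pos * scale, so the original scale is recovered as cnml_scale * 2^-pos. A minimal standalone sketch of that arithmetic (plain C++, no CNML calls; 0.013f is just an illustrative scale):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float scale = 0.013f;  // hypothetical quantization scale
  // mirrors scale2position() in bridges/utility.h
  int pos = static_cast<int>(std::floor(-std::log2(scale)));
  // residual multiplier handed to cnmlCreateQuantizedParam, always in (0.5, 1]
  float cnml_scale = std::pow(2.0f, pos) * scale;
  std::printf("pos=%d cnml_scale=%f recovered_scale=%f\n",
              pos, cnml_scale, cnml_scale / std::pow(2.0f, pos));
  return 0;
}
```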
void SetFPType(::paddle::lite_api::PrecisionType type) {
void SetFPType(paddle::lite_api::PrecisionType type) {
origin_fp_type_ = type;
switch (type) {
case ::paddle::lite_api::PrecisionType::kFP16:
case paddle::lite_api::PrecisionType::kFP16:
fp_type_ = CNML_DATA_FLOAT16;
break;
case ::paddle::lite_api::PrecisionType::kFloat:
case paddle::lite_api::PrecisionType::kFloat:
fp_type_ = CNML_DATA_FLOAT32;
break;
default:
......@@ -230,14 +308,14 @@ class Graph {
private:
cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
std::map<std::string, std::shared_ptr<MLUTensor>> nodes_;
paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
std::vector<cnmlTensor_t> inputs_;
std::vector<cnmlTensor_t> outputs_;
std::vector<void*> input_addrs_;
std::vector<void*> output_addrs_;
std::vector<std::shared_ptr<MLUTensor>> input_tensors_;
std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_;
std::vector<void*> const_data_storage_;
#if PRINT_HW_TIME
......
......@@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
nn_param));
CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
graph->FuseOp(interp_op);
CNML_CALL(cnmlDestroyBaseOp(&interp_op));
return SUCCESS;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("Input").front();
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
std::shared_ptr<MLUTensor> output_tensor;
CHECK(graph->HasNode(x_var_name));
std::vector<int> axis;
auto x_tensor = graph->GetNode(x_var_name);
auto x_data_order = x_tensor->dorder();
auto x_dims = x->dims().Vectorize();
if (x_data_order == CNML_NCHW) {
switch (x_dims.size()) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
break;
case 4:
axis = {0, 2, 3, 1};
break;
case 5:
axis = {0, 2, 3, 4, 1};
break;
default:
CHECK(0) << "Unsupport shape";
}
output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype());
VLOG(3) << "layout transpose nchw to nhwc" << std::endl;
} else {
switch (x_dims.size()) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
break;
case 4:
axis = {0, 3, 1, 2};
break;
case 5:
axis = {0, 4, 1, 2, 3};
break;
default:
CHECK(0) << "Unsupport shpae";
}
VLOG(3) << "layout transpose nhwc to nchw" << std::endl;
output_tensor = graph->AddNode(out_var_name,
output_dims,
CNML_TENSOR,
CNML_NCHW,
x_tensor->dtype(),
CNML_NCHW);
}
cnmlBaseOp_t layout_op;
cnmlNdTransposeOpParam_t transpose_param;
CNML_CALL(
cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size()));
CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op,
x_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
transpose_param));
graph->FuseOp(layout_op);
CNML_CALL(cnmlDestroyBaseOp(&layout_op));
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(layout,
kMLU,
paddle::lite::subgraph::mlu::LayoutConverter);
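The permutation tables hard-coded in LayoutConverter above follow a simple pattern: NCHW to NHWC moves the channel axis to the end, and NHWC to NCHW moves it back behind the batch axis. A self-contained sketch (the helper names are illustrative, not part of the bridge) that reproduces the switch cases for ranks 2 through 5:

```cpp
#include <cstdio>
#include <vector>

// NCHW -> NHWC: keep batch, keep spatial order, move channel last.
std::vector<int> NchwToNhwcAxis(int rank) {
  std::vector<int> axis{0};
  for (int i = 2; i < rank; ++i) axis.push_back(i);
  if (rank > 1) axis.push_back(1);
  return axis;
}

// NHWC -> NCHW: keep batch, bring channel forward, then the spatial dims.
std::vector<int> NhwcToNchwAxis(int rank) {
  std::vector<int> axis{0};
  if (rank > 1) axis.push_back(rank - 1);
  for (int i = 1; i < rank - 1; ++i) axis.push_back(i);
  return axis;
}

int main() {
  for (int rank = 2; rank <= 5; ++rank) {
    std::printf("rank %d: nchw->nhwc {", rank);
    for (int a : NchwToNhwcAxis(rank)) std::printf(" %d", a);
    std::printf(" }  nhwc->nchw {");
    for (int a : NhwcToNchwAxis(rank)) std::printf(" %d", a);
    std::printf(" }\n");
  }
  return 0;
}
```

For rank 4 this prints {0, 2, 3, 1} and {0, 3, 1, 2}, matching the two branches above.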
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/layout_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void test_layout_NHWC2NCHW(std::vector<int64_t> input_shape) {
// prepare input&output variables
std::string x_var_name = "input";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input_shape));
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("layout");
opdesc.SetInput("Input", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
Tensor input;
input.Resize(DDim(input_shape));
switch (input_shape.size()) {
case 2:
transpose<float>(
x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]), static_cast<int>(input_shape[1])},
{0, 1});
break;
case 3:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[1])},
{0, 2, 1});
break;
case 4:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[1])},
{0, 3, 1, 2});
break;
case 5:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[4]),
static_cast<int>(input_shape[1])},
{0, 4, 1, 2, 3});
break;
default:
CHECK(0) << "Unsupport";
}
auto* x_data = input.mutable_data<float>();
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
}
}
void test_layout_NCHW2NHWC(std::vector<int64_t> input_shape) {
// prepare input&output variables
std::string x_var_name = "input";
std::string out_var_name = "out";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input_shape));
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("layout");
opdesc.SetInput("Input", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
Tensor input;
input.Resize(DDim(input_shape));
switch (input_shape.size()) {
case 2:
transpose<float>(
x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]), static_cast<int>(input_shape[1])},
{0, 1});
break;
case 3:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2])},
{0, 2, 1});
break;
case 4:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3])},
{0, 2, 3, 1});
break;
case 5:
transpose<float>(x->mutable_data<float>(),
input.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[4])},
{0, 2, 3, 4, 1});
break;
default:
CHECK(0) << "Unsupport";
}
auto* x_data = input.mutable_data<float>();
LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW);
// compare results
auto* out_data = out->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
}
}
TEST(MLUBridges, layout) {
test_layout_NHWC2NCHW({12, 32, 4});
test_layout_NHWC2NCHW({12, 32, 44, 3});
test_layout_NHWC2NCHW({12, 32, 44, 3, 6});
test_layout_NCHW2NHWC({12, 32, 55});
test_layout_NCHW2NHWC({12, 32, 44, 3});
test_layout_NCHW2NHWC({12, 32, 44, 3, 8});
test_layout_NHWC2NCHW({12, 32});
test_layout_NCHW2NHWC({12, 32});
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(layout, kMLU);
......@@ -15,6 +15,7 @@
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kMLU);
USE_SUBGRAPH_BRIDGE(relu6, kMLU)
USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
......@@ -26,3 +27,7 @@ USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
USE_SUBGRAPH_BRIDGE(concat, kMLU);
USE_SUBGRAPH_BRIDGE(scale, kMLU);
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU);
USE_SUBGRAPH_BRIDGE(cast, kMLU);
USE_SUBGRAPH_BRIDGE(layout, kMLU);
......@@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK(!(op_info->HasAttr("exclusive") &&
op_info->GetAttr<bool>("exclusive") == false))
<< "Unsupport param exclusive is false!";
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
......@@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
int pad_height = paddings[0];
int pad_width = paddings[2];
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
......@@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto input_dims = x->dims();
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
......@@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
strides,
ksize);
// std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
// for (size_t i = 0; i < 2; i++) {
// output_shape.push_back(
// (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
// ksize[0]) /
// strides[i] +
// 1);
// }
if (global_pooling) {
ksize.resize(static_cast<size_t>(input_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i) {
ksize[i] = static_cast<int>(input_dims[i + 2]);
}
}
auto output_tensor = graph->AddNode(
output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlPoolOpParam_t pool_param;
CNML_CALL(
cnmlCreatePoolOpParam_V2(&pool_param,
cnmlCreatePoolOpParam_V3(&pool_param,
ksize[0],
ksize[1],
strides[0],
strides[1],
pad_height,
pad_width,
1, // dilation
1,
paddings[0],
paddings[1],
paddings[2],
paddings[3],
1, // dilation h
1, // dilation w
ToCnmlPoolMode(pooling_type),
ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL,
ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID,
true, /* real */
1 /* blend factor */));
cnmlBaseOp_t pool_op;
......@@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyPoolOpParam(&pool_param));
graph->FuseOp(pool_op);
CNML_CALL(cnmlDestroyBaseOp(&pool_op));
return SUCCESS;
}
......
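For reference, the attribute normalization done by PoolConverter above can be exercised in isolation. This sketch (plain C++, no CNML, made-up dims) shows how a 2-element [pad_h, pad_w] attribute is duplicated into the 4-element [top, bottom, left, right] list passed to cnmlCreatePoolOpParam_V3 (before UpdatePadding applies the padding algorithm), and how global pooling overrides ksize with the spatial input sizes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> paddings{1, 2};  // [pad_h, pad_w] from the op attribute
  if (paddings.size() == 2) {
    for (size_t i = 0; i < 2; ++i) {
      int copy_pad = paddings[2 * i];
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  }
  std::vector<int64_t> input_dims{1, 16, 32, 32};  // hypothetical NCHW input
  std::vector<int> ksize{3, 3};
  bool global_pooling = true;
  if (global_pooling) {
    ksize.resize(input_dims.size() - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
      ksize[i] = static_cast<int>(input_dims[i + 2]);
    }
  }
  std::printf("paddings: %d %d %d %d  ksize: %d %d\n",
              paddings[0], paddings[1], paddings[2], paddings[3],
              ksize[0], ksize[1]);  // prints: 1 1 2 2 / 32 32
  return 0;
}
```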
......@@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
bool global_pooling = op_info->GetAttr<bool>("global_pooling");
if (pooling_type == "max") {
for (int i = 0; i < out_dims.production(); ++i) {
dst_ptr[i] = -65504.f;
}
}
int in_n = in_dims[0];
int in_c = in_dims[1];
int in_h = in_dims[2];
......@@ -203,62 +209,46 @@ void test_pool(int bs,
}
TEST(MLUBridges, pool) {
// for (auto pooling_type : {"max", "avg"}) {
// for (auto ceil_mode : {true, false}) {
// for (auto global_pooling : {/*true, */ false}) {
// for (auto exclusive : {true /*, false*/}) {
// for (auto ksize : {2, 3}) {
// for (auto stride : {1, 2}) {
// for (auto padding : {0, 1}) {
// for (auto bs : {1, 3}) {
// for (auto ic : {1, 3}) {
// for (auto ih : {3, 7}) {
// for (auto iw : {3, 7}) {
// test_pool(bs,
// ic,
// ih,
// iw,
// pooling_type,
// ceil_mode,
// global_pooling,
// exclusive,
// ksize,
// stride,
// padding);
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
for (auto pooling_type : {"max", "avg"}) {
for (auto ceil_mode : {true, false}) {
bool global_pooling = false;
bool exclusive = true;
int ksize = 2;
int stride = 1;
int padding = 0;
int bs = 6;
int ic = 6;
int ih = 6;
int iw = 6;
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
global_pooling,
exclusive,
ksize,
stride,
padding);
for (auto global_pooling : {true, false}) {
for (auto exclusive : {true /*, false*/}) {
for (auto ksize : {2, 3}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1}) {
for (auto bs : {1, 3}) {
for (auto ic : {1, 3}) {
for (auto ih : {3, 7}) {
for (auto iw : {3, 7}) {
LOG(INFO)
<< "shape: " << bs << ',' << ic << ',' << ih << ','
<< iw << '\t' << "pooling type: " << pooling_type
<< '\t' << "ceil model: " << ceil_mode << '\t'
<< "global_pooling: " << global_pooling << '\t'
<< "exclusive: " << exclusive << '\t'
<< "ksize: " << ksize << '\t'
<< "stride: " << stride << '\t'
<< "padding: " << padding;
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
global_pooling,
exclusive,
ksize,
stride,
padding);
}
}
}
}
}
}
}
}
}
}
}
}
......
......@@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
alpha_tensor->mlu_tensor(),
beta_tensor->mlu_tensor()));
graph->FuseOp(scale_op);
CNML_CALL(cnmlDestroyBaseOp(&scale_op));
return SUCCESS;
}
......
......@@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto x_shape =
scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims().Vectorize();
// nchw axis to nhwc aixs
int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2};
// nchw axis to nhwc axis
int axis = 1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
......@@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis = output_dims.size() + axis;
}
}
int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
// the values of nhwc2nchw_axis are indices in nhwc order,
// while its positions are ordered as nchw
int nhwc_axis = GetAxisNHWC2NCHW<int>(x_shape.size())[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
......@@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->GetNode(x_var_name)->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(softmax_op);
CNML_CALL(cnmlDestroyBaseOp(&softmax_op));
return SUCCESS;
}
......
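The axis remapping used by SoftmaxConverter above can be illustrated without CNML: a possibly negative NCHW axis is first normalized, then looked up in the NHWC-to-NCHW index table (the same rule as GetAxisNHWC2NCHW in bridges/utility.h). A minimal sketch, with a made-up rank and axis:

```cpp
#include <cstdio>
#include <vector>

// Same rule as GetAxisNHWC2NCHW: indexed by the NCHW axis, the value is the
// corresponding axis in an NHWC-ordered tensor.
std::vector<int> AxisNHWC2NCHW(int rank) {
  std::vector<int> m(rank);
  m[0] = 0;
  if (rank > 1) m[1] = rank - 1;
  for (int i = 2; i < rank; ++i) m[i] = i - 1;
  return m;
}

int main() {
  int rank = 4;
  int axis = -1;                      // hypothetical "axis" attribute
  if (axis < 0) axis = rank + axis;   // normalize negative axis -> 3
  int nhwc_axis = AxisNHWC2NCHW(rank)[axis];
  std::printf("nchw axis %d maps to nhwc axis %d\n", axis, nhwc_axis);
  return 0;
}
```

For rank 4 the table is {0, 3, 1, 2}, so softmax over the last NCHW axis (W) runs over NHWC axis 2.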
......@@ -93,7 +93,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to NPU model, then run it on NPU
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
softmax_ref<float>(op);
......
......@@ -16,6 +16,9 @@
#include <glog/logging.h>
#include <algorithm>
#include <climits>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
namespace paddle {
......@@ -25,8 +28,9 @@ namespace mlu {
MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order,
cnmlDataType_t mlu_dtype)
cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t data_order)
: mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) {
std::vector<int> int_shape;
for (auto i : shape) {
......@@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!";
}
}
remember(int_shape, tensor_type, mlu_dtype, data_order);
remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order);
}
void MLUTensor::remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order) {
cnmlDataOrder_t shape_order,
cnmlDataOrder_t data_order) {
tensor_type_ = tensor_type;
mlu_dtype_ = mlu_dtype;
data_order_ = data_order;
origin_shape_.assign(shape.begin(), shape.end());
int size = 4;
if (shape.size() > 4 || shape_order == CNML_ARRAY) {
......@@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector<int>& shape,
break;
}
}
dim_ = shape_.size();
auto shape_NCHW = DimNHWC2NCHW(shape_);
shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end());
dim_ = shape_NCHW.size();
shape_ = DimNCHW2NHWC(shape_NCHW);
}
void MLUTensor::Create() {
if (mlu_tensor_ == nullptr) {
CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_));
std::vector<int> dim_shape(shape_);
if (data_order_ == CNML_NCHW) {
std::transform(origin_shape_.cbegin(),
origin_shape_.cend(),
dim_shape.begin(),
[](DDim::value_type in) { return static_cast<int>(in); });
}
int* dim_strides = nullptr;
CNML_CALL(cnmlSetTensorShape_V2(
mlu_tensor_, dim_, dim_shape.data(), dim_strides));
......@@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() {
return mlu_tensor_;
}
void MLUTensor::ToFile(std::string file_name) {
if (mlu_ptr_) {
VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name;
int count = 1;
for (size_t i = 0; i < shape_.size(); i++) {
count *= shape_[i];
}
VLOG(6) << " dump count: " << count;
VLOG(6) << " dump shape: ";
for (size_t i = 0; i < shape_.size(); i++) {
VLOG(6) << shape_[i] << " ";
}
std::vector<float> cpu_data_fp32(count);
// fp16 to fp32
if (mlu_dtype_ == CNML_DATA_FLOAT16) {
VLOG(6) << " convert fp16 to fp32 ";
std::vector<uint16_t> cpu_data_fp16(count);
cnrtMemcpy(cpu_data_fp16.data(),
mlu_ptr_,
count * sizeof(uint16_t),
CNRT_MEM_TRANS_DIR_DEV2HOST);
for (int i = 0; i < count; i++) {
cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
}
} else {
cnrtMemcpy(cpu_data_fp32.data(),
mlu_ptr_,
count * sizeof(float),
CNRT_MEM_TRANS_DIR_DEV2HOST);
}
// trans to nchw
std::vector<float> cpu_data_trans(count);
if (data_order_ != CNML_NCHW) {
switch (shape_.size()) {
case 4:
transpose(cpu_data_fp32.data(),
cpu_data_trans.data(),
shape_,
{0, 3, 1, 2});
break;
case 3:
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1});
break;
case 2:
transpose(
cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1});
break;
case 1:
transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0});
break;
default:
CHECK(0) << "ToFile only support dim <=4";
break;
}
}
// to file
std::ostringstream outs;
for (int i = 0; i < count; i++) {
if (data_order_ == CNML_NCHW) {
outs << cpu_data_fp32[i] << std::endl;
} else {
outs << cpu_data_trans[i] << std::endl;
}
}
std::ofstream of;
of.open(file_name, std::ios::out);
of << outs.str();
of.close();
} else {
LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : "
<< file_name;
}
}
MLUTensor::~MLUTensor() {
if (mlu_tensor_ != nullptr) {
CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
......
......@@ -14,6 +14,8 @@
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
......@@ -33,13 +35,15 @@ class MLUTensor {
MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32);
cnmlDataOrder_t shape_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
cnmlDataOrder_t data_order = CNML_NHWC);
void remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order);
cnmlDataOrder_t shape_order,
cnmlDataOrder_t data_order);
void Create();
cnmlTensor_t mlu_tensor();
void* mlu_data() {
......@@ -47,14 +51,21 @@ class MLUTensor {
return mlu_ptr_;
}
cnmlDataType_t dtype() { return mlu_dtype_; }
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
~MLUTensor();
void ToFile(std::string file_name);
cnmlDataOrder_t dorder() { return data_order_; }
private:
cnmlTensor_t mlu_tensor_;
std::vector<int> shape_;
std::vector<int64_t> origin_shape_;
cnmlTensorType_t tensor_type_;
cnmlDataType_t mlu_dtype_;
int dim_{0};
......
......@@ -24,18 +24,38 @@ namespace lite {
namespace subgraph {
namespace mlu {
template <lite_api::PrecisionType Dtype>
void PrepareInput(Graph* graph,
const std::string& input_name,
Tensor* input_tensor,
cnmlDataOrder_t order) {
thread_local Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
using data_type = typename MLUTypeTraits<Dtype>::type;
auto input_node = graph->AddNode(
input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
MLUTypeTraits<Dtype>::cnml_type,
order,
reinterpret_cast<void*>(
input_tensor->template mutable_data<data_type>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data<data_type>(),
temp_input.mutable_data<data_type>(),
sizeof(data_type) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) {
const std::vector<std::string>& output_var_names,
cnmlDataOrder_t order) {
CNRT_CALL(cnrtInit(0));
::paddle::lite::SetMluDevice(0);
lite::SetMluDevice(0);
cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1;
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
CNRT_CALL(cnrtCreateQueue(&queue_));
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
......@@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
// Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor);
Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph.FPType(),
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
temp_input.mutable_data<float>(),
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
auto data_type = input_tensor->precision();
switch (data_type) {
#define PREPARE_INPUT(type__) \
case PRECISION(type__): \
PrepareInput<PRECISION(type__)>(&graph, input_name, input_tensor, order); \
break;
PREPARE_INPUT(kFP16)
PREPARE_INPUT(kFloat)
PREPARE_INPUT(kInt8)
PREPARE_INPUT(kInt32)
#undef PREPARE_INPUT
default:
CHECK(0);
}
}
op->CheckShape();
op->InferShape();
......@@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
}
graph.Compile(CNML_MLU270, 1);
graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs()));
CNRT_CALL(cnrtSyncQueue(queue_));
graph.Compute(forward_param, queue_);
for (auto& output_name : output_var_names) {
auto output_tensor = scope->FindMutableTensor(output_name);
Tensor temp_out;
......
......@@ -58,7 +58,8 @@ void FillTensor(Tensor* x,
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names);
const std::vector<std::string>& output_var_names,
cnmlDataOrder_t order = CNML_NHWC);
} // namespace mlu
} // namespace subgraph
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/kernels/mlu/bridges/utility.h"
#include <utility>
namespace paddle {
......@@ -20,33 +21,21 @@ namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
std::vector<int> axis) {
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape) {
CHECK_EQ(input_shape.size(), 2);
int old_index = -1;
int new_index = -1;
int dim[4] = {0};
std::vector<int> shape = input_shape;
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] +
dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3];
new_index =
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
output_data[new_index] = input_data[old_index];
}
}
for (int i = 0; i < input_shape[0]; i++) {
for (int j = 0; j < input_shape[1]; j++) {
old_index = i * input_shape[1] + j;
new_index = j * input_shape[0] + i;
output_data[new_index] = input_data[old_index];
}
}
}
int scale2position(float scale) { return static_cast<int>(-std::log2(scale)); }
void dequant(float* dst, int8_t* src, size_t size, float scale) {
for (size_t i = 0; i < size; ++i) {
dst[i] = static_cast<float>(src[i]) * scale;
......
......@@ -16,24 +16,76 @@
#include <cnml.h>
#include <cnrt.h>
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
#include "lite/fluid/float16.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
void transpose2d(float* input_data,
float* output_data,
std::vector<int> input_shape);
template <typename dtype>
void transpose(dtype* input_data,
dtype* output_data,
std::vector<int> input_shape,
std::vector<int> axis);
int scale2position(float scale);
std::vector<int> axis) {
int old_index = -1;
int new_index = -1;
std::vector<int> shape;
std::vector<int> expand_axis;
if (input_shape.size() < 5u) {
for (size_t i = 0; i < 5 - input_shape.size(); i++) {
shape.push_back(1);
expand_axis.push_back(i);
}
for (size_t i = 0; i < input_shape.size(); i++) {
shape.push_back(input_shape[i]);
expand_axis.push_back(axis[i] + 5 - input_shape.size());
}
} else {
shape = input_shape;
expand_axis = axis;
}
int dim[5] = {0};
for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) {
for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] +
dim[1] * shape[2] * shape[3] * shape[4] +
dim[2] * shape[3] * shape[4] + dim[3] * shape[4] +
dim[4];
new_index = dim[expand_axis[0]] * shape[expand_axis[1]] *
shape[expand_axis[2]] * shape[expand_axis[3]] *
shape[expand_axis[4]] +
dim[expand_axis[1]] * shape[expand_axis[2]] *
shape[expand_axis[3]] * shape[expand_axis[4]] +
dim[expand_axis[2]] * shape[expand_axis[3]] *
shape[expand_axis[4]] +
dim[expand_axis[3]] * shape[expand_axis[4]] +
dim[expand_axis[4]];
output_data[new_index] = input_data[old_index];
}
}
}
}
}
}
inline int scale2position(float scale) { return std::floor(-std::log2(scale)); }
void dequant(float* dst, int8_t* src, size_t size, float scale);
void dequant(float* dst,
......@@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC(
std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}));
}
inline const std::vector<int64_t> DimNHWC2NCHW(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]});
template <typename data_type>
inline const std::vector<data_type> DimNHWC2NCHW(
const std::vector<data_type>& dim) {
switch (dim.size()) {
case 1:
return dim;
case 2:
return dim;
case 3:
return std::vector<data_type>({dim[0], dim[2], dim[1]});
case 4:
return std::vector<data_type>({dim[0], dim[3], dim[1], dim[2]});
case 5:
return std::vector<data_type>({dim[0], dim[4], dim[1], dim[2], dim[3]});
default:
CHECK(0) << "unsupport dimension";
}
}
template <typename data_type>
inline const std::vector<data_type> DimNCHW2NHWC(
const std::vector<data_type>& dim) {
switch (dim.size()) {
case 1:
return dim;
case 2:
return dim;
case 3:
return std::vector<data_type>({dim[0], dim[2], dim[1]});
case 4:
return std::vector<data_type>({dim[0], dim[2], dim[3], dim[1]});
case 5:
return std::vector<data_type>({dim[0], dim[2], dim[3], dim[4], dim[1]});
default:
CHECK(0) << "unsupport dimension";
}
}
inline const std::vector<int64_t> DimNCHW2NHWC(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]});
template <typename data_type>
inline std::vector<data_type> GetAxisNHWC2NCHW(size_t n_dims) {
std::vector<data_type> nhwc2nchw_axis(n_dims);
nhwc2nchw_axis[0] = 0;
if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1;
for (size_t i = 2; i < n_dims; ++i) {
nhwc2nchw_axis[i] = i - 1;
}
return nhwc2nchw_axis;
}
template <typename data_type>
inline std::vector<data_type> GetAxisNCHW2NHWC(size_t n_dims) {
std::vector<data_type> nchw2nhwc_axis(n_dims);
nchw2nhwc_axis[0] = 0;
for (size_t i = 1; i < n_dims - 1; ++i) {
nchw2nhwc_axis[i] = i + 1;
}
if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1;
return nchw2nhwc_axis;
}
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
struct MLUTypeTraits {
/* using type = void; */
/* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */
};
template <>
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
using type = float;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32;
};
template <>
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
using type = paddle::lite::fluid::float16;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
using type = int8_t;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt32> {
using type = int32_t;
static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32;
};
} // namespace mlu
......
......@@ -41,6 +41,9 @@ class IoCopyHostToMluCompute
auto mem_size = param.x->memory_size();
// LOG(INFO) << "copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kMLU), mem_size);
VLOG(6) << "io_copy host to mlu] memory size: " << mem_size
<< " precision type: " << PrecisionToStr(Precision);
param.y->set_precision(param.x->precision());
CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -79,6 +82,13 @@ class IoCopyMluToHostCompute
CHECK(param.x->target() == TARGET(kMLU));
auto mem_size = param.x->memory_size();
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
VLOG(6) << "io_copy mlu to host] memory size: " << mem_size
<< " precision type: " << PrecisionToStr(Precision);
// sync queue to ensure process done
auto& mlu_context = this->ctx_->template As<MLUContext>();
CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue()));
CopyToHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>,
host_to_device_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>,
host_to_device_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kInt32,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt32)>,
host_to_device_kInt32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kInt32),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>,
device_to_host_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>,
device_to_host_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kInt8,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt8)>,
host_to_device_to_kInt8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kAny))})
.Finalize();
......@@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFloat,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
def_layout_nhwc2nchw_fp32)
.BindInput("Input",
......@@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFP16,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
def_layout_nhwc2nchw_fp16)
.BindInput("Input",
......@@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFloat,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
def_layout_nchw2nhwc_fp32)
.BindInput("Input",
......@@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kFP16,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
def_layout_nchw2nhwc_fp16)
.BindInput("Input",
......@@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL(
REGISTER_LITE_KERNEL(
layout,
kMLU,
kX86,
kInt8,
kNHWC,
kNCHW,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
def_layout_nchw2nhwc_fp32_int8)
def_layout_nchw2nhwc_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
......
......@@ -22,6 +22,7 @@
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/operators/layout_op.h"
namespace paddle {
......@@ -29,24 +30,6 @@ namespace lite {
namespace kernels {
namespace mlu {
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
typedef int8_t T;
};
template <lite::TargetType Target, typename T>
inline void LayoutTransCompute(const int dim,
const lite::Context<Target>& context,
......@@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim,
template <PrecisionType Precision>
class LayoutNchwToNhwcCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
: public KernelLite<TARGET(kX86), Precision, DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
out->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
auto x_ndims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
switch (x_dims) {
switch (x_ndims) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
origin_dims[0], origin_dims[2], origin_dims[1]});
break;
case 4:
axis = {0, 2, 3, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]});
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
x_ndims, context, *x, out, axis);
if (x_dims > 2) {
if (x_ndims > 2) {
out->Resize(origin_dims);
}
}
......@@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
: public KernelLite<TARGET(kX86), Precision, DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
out->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
TensorLite tmp_t;
tmp_t.ShareDataWith(*x);
const auto x_dims = x->dims().Vectorize();
auto x_ndims = param.x->dims().size();
std::vector<int> axis;
switch (x_dims) {
switch (x_ndims) {
case 2:
axis = {0, 1};
break;
case 3:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
tmp_t.Resize(std::vector<int64_t>{x_dims[0], x_dims[2], x_dims[1]});
axis = {0, 2, 1};
break;
case 4:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
tmp_t.Resize(
std::vector<int64_t>{x_dims[0], x_dims[2], x_dims[3], x_dims[1]});
axis = {0, 3, 1, 2};
break;
default:
......@@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
x_ndims, context, tmp_t, out, axis);
}
std::string doc() const override {
......
......@@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFloat)>,
def_kFloat)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(
......@@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL(
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFP16)>,
def_FP16)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kMLU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
......@@ -14,17 +14,24 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/core/types.h"
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/utils/env.h"
namespace paddle {
namespace lite {
......@@ -40,10 +47,19 @@ class SubgraphEngine : public subgraph::Engine {
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope,
::paddle::lite_api::PrecisionType type)
paddle::lite_api::PrecisionType type)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {
graph_.SetFPType(type);
ctx, block_idx, block_desc, input_names, output_names, scope),
fp_type_(type) {
VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is "
<< GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL");
VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is "
<< GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE");
VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is "
<< GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE")) {
disable_batch_size_changeable_ = true;
}
}
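The constructor above reads its feature switches through GetBoolFromEnv (from lite/utils/env.h). As a rough stand-in, since the real helper's exact parsing rules are not shown here, an environment-flag check might look like the following; treat it only as an illustration of the intended behavior:

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical replacement for GetBoolFromEnv: unset or "0"/"false" disables
// the feature, anything else enables it.
bool BoolFromEnv(const char* name, bool default_value = false) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) return default_value;
  std::string v(raw);
  return !(v.empty() || v == "0" || v == "false" || v == "FALSE");
}

int main() {
  std::printf("batch-size changeable disabled: %d\n",
              BoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE"));
  return 0;
}
```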
int Build() {
......@@ -72,24 +88,97 @@ class SubgraphEngine : public subgraph::Engine {
return 0;
}
bool InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
// used in the batch-size changeable situation
std::vector<std::vector<int64_t>> all_shape;
for (auto origin_itensor : origin_itensors_) {
if (!disable_batch_size_changeable_) {
auto iv = origin_itensor->dims().Vectorize();
all_shape.push_back(iv);
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
}
inputs_shape_ = new_shape;
all_inputs_shape_ = all_shape;
if (shape_graph_map_.count(inputs_shape_) > 0) {
return false;
}
VLOG(3) << "MLU graph input shape changed" << std::endl;
return true;
}
inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) {
switch (data_type) {
case paddle::lite_api::PrecisionType::kFP16:
return CNML_DATA_FLOAT16;
case paddle::lite_api::PrecisionType::kFloat:
return CNML_DATA_FLOAT32;
case paddle::lite_api::PrecisionType::kInt32:
return CNML_DATA_INT32;
case paddle::lite_api::PrecisionType::kInt8:
return CNML_DATA_UINT8;
default:
return PrecisionToDatatype(fp_type_);
}
}
protected:
int BuildDeviceProgram() override {
if (!error_compile_batch_size_changeable_ &&
!disable_batch_size_changeable_) {
int status = BuildDeviceProgramImpl();
if (subgraph::CHECK_SUCCESS(status)) {
return status;
}
LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, "
"changed to input_shape changeable";
}
error_compile_batch_size_changeable_ = true;
disable_batch_size_changeable_ = true;
return BuildDeviceProgramImpl();
}
int BuildDeviceProgramImpl() {
int status = 0;
auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
graph->SetFPType(fp_type_);
std::vector<std::vector<int64_t>> new_shape;
origin_itensors_.clear();
origin_otensors_.clear();
auto data_order = block_desc_->GetOp<cpp::OpDesc>(0)->Type() == "layout"
? CNML_NCHW
: CNML_NHWC;
// Convert all input data vars and add them into the MLU IR graph
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
auto data_type = input_tensor->precision();
cnmlDataType_t fp_type = PrecisionToDatatype(data_type);
origin_itensors_.push_back(input_tensor);
if (!disable_batch_size_changeable_) {
auto iv = input_tensor->dims().Vectorize();
iv.erase(iv.begin());
new_shape.push_back(iv);
} else {
new_shape.push_back(input_tensor->dims().Vectorize());
}
CHECK(input_tensor);
auto input_node =
graph_.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph_.FPType(),
const_cast<void*>(input_tensor->raw_data()));
VLOG(4) << "subgraph input tensor " << input_name << std::endl;
auto input_node = graph->AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
fp_type,
data_order);
CHECK(input_node);
// MLU doesn't support dynamic dimensions/shapes, so need to rebuild
// the program when the shape of any input tensor is changed.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
LOG(INFO) << "START TO CONVERT ";
// Convert all ops and their weights and add them into the MLU IR graph
......@@ -98,6 +187,18 @@ class SubgraphEngine : public subgraph::Engine {
auto op = inst.op();
CHECK(op);
std::string op_type = op->op_info()->Type();
// since cnml's compile api does not report errors for now, we simply check
// the op's type
if (!disable_batch_size_changeable_ &&
std::find(unsupport_batch_size_changeable_op_type_.begin(),
unsupport_batch_size_changeable_op_type_.end(),
op_type) !=
unsupport_batch_size_changeable_op_type_.end()) {
status |= subgraph::FAILED;
VLOG(4) << "[MLU] found unsupported batch_size changeable op type: "
<< op_type;
return status;
}
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) {
......@@ -106,7 +207,7 @@ class SubgraphEngine : public subgraph::Engine {
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph_),
reinterpret_cast<void*>(graph.get()),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
......@@ -115,46 +216,272 @@ class SubgraphEngine : public subgraph::Engine {
}
// Obtain the output nodes of the MLU IR graph and build the graph to MLU
// runtime
std::vector<std::string> valid_output_names;
for (auto& output_name : output_names_) {
if (graph_.HasNode(output_name)) {
graph_.AddOutput(graph_.GetNode(output_name));
if (graph->HasNode(output_name)) {
graph->AddOutput(graph->GetNode(output_name));
auto output_tensor = scope_->FindMutableTensor(output_name);
void* p_data = static_cast<void*>(
output_tensor->mutable_data<typename ::paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(
TARGET(kMLU)));
auto node = graph_.GetNode(output_name);
CHECK(p_data);
node->set_mlu_ptr(p_data);
valid_output_names.push_back(output_name);
origin_otensors_.push_back(output_tensor);
VLOG(4) << "subgraph output tensor " << output_name << std::endl;
// auto node = graph->GetNode(output_name);
// CHECK(p_data);
// node->set_mlu_ptr(p_data);
}
}
for (auto& input_name : input_names_) {
graph_.AddInput(graph_.GetNode(input_name));
graph->AddInput(graph->GetNode(input_name),
disable_batch_size_changeable_);
}
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto core_version = mlu_context.MLUCoreVersion();
auto core_number = mlu_context.MLUCoreNumber();
graph_.Compile(core_version, core_number);
graph->Compile(core_version, core_number);
shape_graph_map_[new_shape] = graph;
if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) {
graph->GenOfflineModel(GetOfflineModName());
}
return status;
}
std::string TrimStrings(const std::string& origin_str) {
std::string str = origin_str;
std::size_t found = str.find("0x");
std::size_t found_end = 0;
const std::vector<std::string> del_strs = {
"/trans_io_copy", "/trans_cast", "/trans_layout"};
for (const auto& iterm : del_strs) {
found_end = str.find(iterm);
// trim the pointer address and one of the del_strs
if (found != std::string::npos && found_end != std::string::npos) {
str.replace(found, found_end - found, "");
found_end = str.find(iterm);
str.replace(found_end, iterm.size(), "");
break;
}
}
return str;
}
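To see what TrimStrings removes, here is the same logic as a standalone program run on a hypothetical variable name (the name, the hexadecimal suffix, and the "/trans_cast" infix are all made up for illustration): the pointer-address substring starting at "0x" is stripped first, then the matching del_str itself.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Standalone copy of the TrimStrings logic above, for demonstration only.
std::string TrimStrings(const std::string& origin_str) {
  std::string str = origin_str;
  std::size_t found = str.find("0x");
  std::size_t found_end = 0;
  const std::vector<std::string> del_strs = {
      "/trans_io_copy", "/trans_cast", "/trans_layout"};
  for (const auto& iterm : del_strs) {
    found_end = str.find(iterm);
    if (found != std::string::npos && found_end != std::string::npos) {
      str.replace(found, found_end - found, "");  // drop the pointer address
      found_end = str.find(iterm);
      str.replace(found_end, iterm.size(), "");   // drop the del_str itself
      break;
    }
  }
  return str;
}

int main() {
  std::printf("%s\n",
              TrimStrings("conv2d_0.tmp_0x7f2a1c3d/trans_cast").c_str());
  // prints: conv2d_0.tmp_
  return 0;
}
```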
std::string GetOfflineModName() {
sort(input_names_.begin(), input_names_.end());
sort(output_names_.begin(), output_names_.end());
const auto& delimiter = "__";
const auto& delimiter_num = "_";
const auto& input_shape_str = "input_shape_";
const auto& output_shape_str = "output_shape_";
std::string name = "";
std::string tmp = "";
for (const auto& input_name : input_names_) {
tmp = input_name;
name += TrimStrings(tmp) + delimiter + input_shape_str;
auto input_tensor = scope_->FindMutableTensor(input_name);
for (const auto& iterm : input_tensor->dims().Vectorize()) {
name += std::to_string(iterm) + delimiter_num;
}
name += delimiter;
}
for (const auto& output_name : output_names_) {
tmp = output_name;
name += TrimStrings(tmp) + delimiter + output_shape_str;
auto output_tensor = scope_->FindMutableTensor(output_name);
for (const auto& iterm : output_tensor->dims().Vectorize()) {
name += std::to_string(iterm) + delimiter_num;
}
name += delimiter;
}
std::replace(name.begin(), name.end(), '/', '-');
return name;
}
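With the delimiters defined above ("__", "_", "input_shape_", "output_shape_"), the offline model name is simply the trimmed tensor names concatenated with their shapes, with '/' replaced by '-'. A sketch that builds the same kind of string for a hypothetical single-input, single-output model:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical (already trimmed) tensor names and shapes.
  std::vector<std::pair<std::string, std::vector<int64_t>>> inputs{
      {"image", {1, 3, 224, 224}}};
  std::vector<std::pair<std::string, std::vector<int64_t>>> outputs{
      {"fc_out", {1, 1000}}};
  const std::string delimiter = "__";
  const std::string delimiter_num = "_";
  std::string name;
  for (const auto& in : inputs) {
    name += in.first + delimiter + "input_shape_";
    for (auto d : in.second) name += std::to_string(d) + delimiter_num;
    name += delimiter;
  }
  for (const auto& out : outputs) {
    name += out.first + delimiter + "output_shape_";
    for (auto d : out.second) name += std::to_string(d) + delimiter_num;
    name += delimiter;
  }
  std::replace(name.begin(), name.end(), '/', '-');
  // e.g. image__input_shape_1_3_224_224___fc_out__output_shape_1_1000___
  std::printf("%s\n", name.c_str());
  return 0;
}
```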
void InferOutputsShapeOnly() {
// infer output shapes when BATCH_SIZE_CHANGEABLE is enabled
const auto iter = in_out_shape_map_.find(all_inputs_shape_);
if (iter != in_out_shape_map_.end()) {
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
origin_otensors_[i]->Resize(iter->second[i]);
}
} else {
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
}
std::vector<std::vector<int64_t>> outs_shape;
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
}
in_out_shape_map_[all_inputs_shape_] = outs_shape;
}
}
inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) {
if (use_mlu_cast) {
// output is float, since cast fused in subgraph
return static_cast<void*>(tensor->mutable_data<float>(TARGET(kMLU)));
} else {
return static_cast<void*>(
tensor->template mutable_data<
typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
TARGET(kMLU)));
}
}
int LaunchDeviceProgram() override {
// prepare input and output memory
auto& mlu_context = this->ctx_->template As<MLUContext>();
auto exec_queue = mlu_context.exec_queue();
u32_t affinity = mlu_context.affinity();
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph_.Compute(forward_param, exec_queue);
auto graph = shape_graph_map_[inputs_shape_];
auto* graph_input = graph->MutableInputs();
auto* graph_output = graph->MutableOutputs();
CHECK_EQ(graph_input->size(), origin_itensors_.size());
CHECK_EQ(graph_output->size(), origin_otensors_.size());
bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
if (!disable_batch_size_changeable_) {
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_in;
if (shape_tensor_map_in_.find(all_inputs_shape_) !=
shape_tensor_map_in_.end()) {
graph_in = shape_tensor_map_in_[all_inputs_shape_];
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_in[i]->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
}
} else {
graph_in.reserve(origin_itensors_.size());
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_itensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_input->at(i)->dtype());
tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
graph_in.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_in_[all_inputs_shape_] = graph_in;
}
// TODO(zhangmingwei): we just call every op's infer_shape to get outputs'
// shape; maybe it's better to use cnml's api to get the output shape. This
// can be done once cnml's tensor dimension is totally equal to lite's
// tensor shape.
InferOutputsShapeOnly();
// const std::vector<std::vector<int64_t>> new_output_size =
// graph->InferOutputsShape(graph_in);
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
graph_out;
if (shape_tensor_map_out_.find(all_inputs_shape_) !=
shape_tensor_map_out_.end()) {
graph_out = shape_tensor_map_out_[all_inputs_shape_];
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
graph_out[i]->set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
}
} else {
graph_out.reserve(origin_otensors_.size());
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
// origin_otensors_[i]->Resize(new_output_size.at(i));
paddle::lite::subgraph::mlu::MLUTensor tmp(
origin_otensors_[i]->dims().Vectorize());
tmp.set_mlu_dtype(graph_output->at(i)->dtype());
tmp.set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
graph_out.push_back(
std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
}
shape_tensor_map_out_[all_inputs_shape_] = graph_out;
}
graph->Compute(exec_queue, graph_in, graph_out);
} else {
for (size_t i = 0; i < origin_itensors_.size(); ++i) {
graph_input->at(i)->set_mlu_ptr(
const_cast<void*>(origin_itensors_[i]->raw_data()));
}
for (size_t i = 0; i < origin_otensors_.size(); ++i) {
origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
graph_output->at(i)->set_mlu_ptr(
GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
}
// only cnmlComputeFusionOpForward_V3 needs cnrtInvokeFuncParam_t
cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
int data_param = 1;
forward_param.data_parallelism = &data_param;
u32_t affinity = mlu_context.affinity();
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
graph->Compute(forward_param, exec_queue);
#ifdef MLU_DUMP_SUBGRAPH_IO
// Graph nodes store compile-time tensors while batch-size mutable is set.
// Only when batch-size mutable is disabled does runtime data exist in the
// graph nodes.
// =========== DUMP ===================
for (auto input_name : input_names_) {
auto input_tensor =
shape_graph_map_[inputs_shape_]->GetNode(input_name);
auto dump_name = input_name;
while (dump_name.find("/") != std::string::npos) {
dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
}
VLOG(6) << "dump_name: " << dump_name;
input_tensor->ToFile(dump_name);
}
for (auto output_name : output_names_) {
if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
auto output_tensor =
shape_graph_map_[inputs_shape_]->GetNode(output_name);
auto dump_name = output_name;
while (dump_name.find("/") != std::string::npos) {
dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
}
VLOG(6) << "dump_name: " << dump_name;
output_tensor->ToFile(dump_name);
} else {
VLOG(6) << "graph does not have " << output_name << " as output"
<< std::endl;
}
}
#endif
// =========== DUMP END ================
}
return 0;
}
paddle::lite::subgraph::mlu::Graph graph_;
paddle::lite_api::PrecisionType fp_type_;
std::vector<std::vector<int64_t>> inputs_shape_{};
std::vector<std::vector<int64_t>> all_inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>,
std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
shape_graph_map_{};
// batch size changeable is enabled by default; this could be changed by the
// environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and by
// whether the op can be compiled in a batch-size changeable way
bool disable_batch_size_changeable_{false};
bool error_compile_batch_size_changeable_{false};
std::vector<std::string> unsupport_batch_size_changeable_op_type_{"concat"};
// look up the output runtime MLUTensor for a certain output shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_out_{};
// look up the input runtime MLUTensor for a certain input shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>,
std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
shape_tensor_map_in_{};
// look up the output shape for a certain input shape when
// BATCH_SIZE_CHANGEABLE is enabled
std::map<std::vector<std::vector<int64_t>>, std::vector<std::vector<int64_t>>>
in_out_shape_map_{};
};
template <PrecisionType Precision>
......
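The shape-keyed caches declared in the engine above (shape_graph_map_, shape_tensor_map_in_/out_, in_out_shape_map_) rely on std::vector providing lexicographic operator<, so nested shape vectors can be used directly as std::map keys. A minimal sketch of that pattern (the cached "graph" is just an int here):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  using ShapeKey = std::vector<std::vector<int64_t>>;
  std::map<ShapeKey, int> shape_graph_map;
  ShapeKey shapes{{1, 3, 224, 224}, {1, 1}};  // hypothetical input shapes
  shape_graph_map[shapes] = 42;               // cache a compiled-graph handle
  std::printf("cached: %d, hit: %zu\n",
              shape_graph_map[shapes], shape_graph_map.count(shapes));
  return 0;
}
```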
......@@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SoftsignCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
......@@ -4,7 +4,7 @@ set -ex
# global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
......@@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
if [ ! -d $workspace/third-party ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xvf third-party-05b862.tar.gz
}
# for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
......