Commit 5a886794 authored by Y yanghaitao

Merge branch 'master' of gitee.com:mindspore/mindspore

akg @ df57a6cf
Subproject commit c460176523d039c8995f1d71089753725ebc0792
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
......@@ -277,10 +277,11 @@ endif ()
if (USE_GLOG)
target_link_libraries(inference PRIVATE mindspore::glog)
else()
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_options(inference PRIVATE -Wl,-init,mindspore_log_init)
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
set_target_properties(inference PROPERTIES MACOSX_RPATH ON)
endif ()
endif()
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_options(inference PRIVATE -Wl,-init,common_log_init)
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
set_target_properties(inference PROPERTIES MACOSX_RPATH ON)
endif ()
......@@ -30,8 +30,7 @@ Status RandomCropAndResizeWithBBoxOp::Compute(const TensorRow &input, TensorRow
BOUNDING_BOX_CHECK(input);
CHECK_FAIL_RETURN_UNEXPECTED(input[0]->shape().Size() >= 2, "The shape of input is abnormal");
(*output).push_back(nullptr); // init memory for return vector
(*output).push_back(nullptr);
output->resize(2);
(*output)[1] = std::move(input[1]); // move boxes over to output
size_t bboxCount = input[1]->shape()[0]; // number of rows in bbox tensor
......
......@@ -36,8 +36,7 @@ Status RandomCropWithBBoxOp::Compute(const TensorRow &input, TensorRow *output)
int32_t padded_image_h;
int32_t padded_image_w;
(*output).push_back(nullptr);
(*output).push_back(nullptr);
output->resize(2);
(*output)[1] = std::move(input[1]); // since some boxes may be removed
bool crop_further = true; // Whether further cropping will be required or not, true unless required size matches
......
......@@ -45,8 +45,7 @@ Status RandomVerticalFlipWithBBoxOp::Compute(const TensorRow &input, TensorRow *
RETURN_IF_NOT_OK(input[1]->SetItemAt({i, 1}, newBoxCorner_y));
}
(*output).push_back(nullptr);
(*output).push_back(nullptr);
output->resize(2);
(*output)[1] = std::move(input[1]);
return VerticalFlip(input[0], &(*output)[0]);
......
......@@ -2,6 +2,8 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc"
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(utils OBJECT
arena.cc
buddy.cc
cache_pool.cc
circular_pool.cc
memory_pool.cc
cond_var.cc
......@@ -11,7 +13,11 @@ add_library(utils OBJECT
service.cc
services.cc
lock.cc
semaphore.cc
status.cc
storage_container.cc
storage_manager.cc
slice.cc
path.cc
wait_post.cc
sig_handler.cc)
......@@ -17,8 +17,10 @@
#define DATASET_UTIL_ALLOCATOR_H_
#include <cstdlib>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
#include "dataset/util/memory_pool.h"
namespace mindspore {
......@@ -84,6 +86,91 @@ class Allocator {
private:
std::shared_ptr<MemoryPool> pool_;
};
/// \brief A wrapper around unique_ptr with a custom allocator. Like std::lock_guard, the memory is released when
/// the object goes out of scope.
/// \tparam T The type of object to be allocated
/// \tparam C Allocator. Defaults to std::allocator
template <typename T, typename C = std::allocator<T>>
class MemGuard {
public:
using allocator = C;
MemGuard() : n_(0) {}
explicit MemGuard(allocator a) : n_(0), alloc_(a) {}
// There is no copy constructor or assignment operator because the memory is solely owned by this object.
MemGuard(const MemGuard &) = delete;
MemGuard &operator=(const MemGuard &) = delete;
// On the other hand, we can support a move constructor.
MemGuard(MemGuard &&lhs) noexcept : alloc_(std::move(lhs.alloc_)), ptr_(std::move(lhs.ptr_)), n_(lhs.n_) {}
MemGuard &operator=(MemGuard &&lhs) noexcept {
if (this != &lhs) {
this->deallocate();
n_ = lhs.n_;
alloc_ = std::move(lhs.alloc_);
ptr_ = std::move(lhs.ptr_);
}
return *this;
}
/// \brief Explicitly deallocate the memory if allocated
void deallocate() {
if (ptr_) {
auto *p = ptr_.release();
if (!std::is_arithmetic<T>::value && std::is_destructible<T>::value) {
for (auto i = 0; i < n_; ++i) {
p[i].~T();
}
}
alloc_.deallocate(p, n_);
n_ = 0;
}
}
/// \brief Allocate memory (with emplace semantics). Any previously allocated memory is released first. If n is 0,
/// no new memory is allocated.
/// \param n Number of objects of type T to be allocated
/// \tparam Args Extra arguments passed to the constructor of T
template <typename... Args>
Status allocate(size_t n, Args &&... args) noexcept {
try {
deallocate();
if (n > 0) {
T *data = alloc_.allocate(n);
if (!std::is_arithmetic<T>::value) {
for (auto i = 0; i < n; i++) {
std::allocator_traits<C>::construct(alloc_, &(data[i]), std::forward<Args>(args)...);
}
}
ptr_ = std::unique_ptr<T[]>(data);
n_ = n;
}
} catch (const std::bad_alloc &e) {
return Status(StatusCode::kOutOfMemory);
} catch (std::exception &e) {
RETURN_STATUS_UNEXPECTED(e.what());
}
return Status::OK();
}
~MemGuard() noexcept { deallocate(); }
/// \brief Getter function
/// \return The pointer to the memory allocated
T *GetPointer() const { return ptr_.get(); }
/// \brief Getter function
/// \return The pointer to the memory allocated
T *GetMutablePointer() { return ptr_.get(); }
/// \brief Overload [] operator to access a particular element
/// \param x Index of the element. Must be less than the number of elements allocated.
/// \return pointer to the x-th element
T *operator[](size_t x) { return GetMutablePointer() + x; }
/// \brief Overload [] operator to access a particular element
/// \param x Index of the element. Must be less than the number of elements allocated.
/// \return pointer to the x-th element
T *operator[](size_t x) const { return GetPointer() + x; }
/// \brief Return how many bytes are allocated in total
/// \return Number of bytes allocated in total
size_t GetSizeInBytes() const { return n_ * sizeof(T); }
private:
allocator alloc_;
std::unique_ptr<T[], std::function<void(T *)>> ptr_;
size_t n_;
};
} // namespace dataset
} // namespace mindspore
......
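A minimal usage sketch of the MemGuard class above (the element type, count, and values are illustrative):

// Allocate 128 ints guarded by RAII; the memory is released when buf goes out of scope.
MemGuard<int> buf;
Status rc = buf.allocate(128);
if (rc.IsOk()) {
  *(buf[0]) = 42;                        // operator[] returns a pointer to the x-th element
  size_t nbytes = buf.GetSizeInBytes();  // 128 * sizeof(int)
}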
......@@ -91,7 +91,7 @@ class AutoIndexObj : public BPlusTree<int64_t, T, A> {
}
private:
static constexpr key_type kMinKey = 1;
static constexpr key_type kMinKey = 0;
std::atomic<key_type> inx_;
};
} // namespace dataset
......
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/util/buddy.h"
#include <iomanip>
#include <stdexcept>
#include "dataset/util/de_error.h"
#include "dataset/util/memory_pool.h"
#include "dataset/util/system_pool.h"
#include "./securec.h"
inline uint64_t BitLeftShift(uint64_t v, uint64_t n) { return (v << n); }
inline uint64_t BitRightShift(uint64_t v, uint64_t n) { return (v >> n); }
inline uint64_t BitOr(uint64_t rhs, uint64_t lhs) { return rhs | lhs; }
inline uint64_t BitEx(uint64_t rhs, uint64_t lhs) { return rhs ^ lhs; }
inline uint64_t BitAnd(uint64_t rhs, uint64_t lhs) { return rhs & lhs; }
namespace mindspore {
namespace dataset {
Status BuddySpace::Init() {
if (log_min_ < 0) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
"log_min must be positive : " + std::to_string(log_min_));
}
if (num_lvl_ < 3 || num_lvl_ > 18) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
"num_lvl must be between 3 and 18 : " + std::to_string(num_lvl_));
}
min_ = BitLeftShift(1, log_min_);
max_ = BitLeftShift(1, log_min_ + num_lvl_ - 1);
size_t offset_1 = sizeof(rel_addr_t) * num_lvl_;
size_t offset_2 = sizeof(int) * num_lvl_ + offset_1;
size_t offset_3 = sizeof(char) * BitLeftShift(1, num_lvl_ - 3) + offset_2;
RETURN_IF_NOT_OK(DeMalloc(offset_3, &ptr_, true));
hint_ = reinterpret_cast<rel_addr_t *>(ptr_);
count_ = reinterpret_cast<int *>((reinterpret_cast<char *>(ptr_) + offset_1));
map_ = reinterpret_cast<char *>(ptr_) + offset_2;
count_[num_lvl_ - 1] = 1;
map_[0] = BitOr(MORE_BIT, num_lvl_ - 3);
return Status::OK();
}
Status BuddySpace::Alloc(const uint64_t sz, BSpaceDescriptor *desc, addr_t *p) noexcept {
std::lock_guard<std::mutex> lock(mutex_);
addr_t addr = AllocNoLock(sz, desc);
if (addr != NOSPACE) {
*p = addr;
return Status::OK();
} else {
return Status(StatusCode::kNoSpace, "BuddySpace full. Not an error. Please ignore.");
}
}
addr_t BuddySpace::AllocNoLock(const uint64_t sz, BSpaceDescriptor *desc) noexcept {
DS_ASSERT(sz <= max_);
uint32_t reqSize = SizeToBlock(sz);
rel_addr_t rel_addr = AllocBuddySeg(reqSize);
if (rel_addr != static_cast<rel_addr_t>(NOSPACE)) {
(void)memset_s(desc, sizeof(BSpaceDescriptor), 0, sizeof(BSpaceDescriptor));
desc->sig = static_cast<int>(0xDEADBEEF);
desc->addr = rel_addr;
desc->req_size = reqSize;
desc->blk_size = NextPowerOf2(reqSize);
return static_cast<addr_t>(rel_addr * min_);
} else {
return NOSPACE;
}
}
void BuddySpace::FreeNoLock(const BSpaceDescriptor *desc) {
DS_ASSERT(desc->sig == 0XDEADBEEF);
rel_addr_t rel_addr = desc->addr;
size_t blk_size = desc->blk_size;
size_t req_size = desc->req_size;
FreeBuddySeg(rel_addr, blk_size, req_size);
}
void BuddySpace::Free(const BSpaceDescriptor *desc) {
std::lock_guard<std::mutex> lock(mutex_);
return FreeNoLock(desc);
}
std::ostream &operator<<(std::ostream &os, const BuddySpace &s) {
os << "1 unit = " << s.GetMinSize() << "\n"
<< "Size of buddy space = " << s.GetMaxSize() << "\n"
<< "Number of levels = " << s.num_lvl_ << "\n\n"
<< "Percent free = " << s.PercentFree() << "\n"
<< "Dumping count array : "
<< "\n";
for (int i = 0; i < s.num_lvl_; i++) {
os << "[" << i << "] = " << s.count_[i] << " ";
if (((i + 1) % 4) == 0) {
os << "\n";
}
}
os << "\n";
os << "Dumping allocation info:"
<< "\n";
auto max_addr = static_cast<rel_addr_t>(BitLeftShift(1, s.num_lvl_ - 1));
rel_addr_t addr = 0;
while (addr < max_addr) {
size_t sz = 0;
BuddySpace::STATE st;
s.GetBuddySegState(addr, &sz, &st);
os << "Address : " << std::left << std::setw(8) << addr << " Size : " << std::setw(8) << sz << " State : "
<< ((st == BuddySpace::STATE::kAlloc) ? "ALLOC" : ((st == BuddySpace::STATE::kFree) ? "FREE" : "Unknown"))
<< "\n";
addr += sz;
}
return os;
}
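// Note on the map_ encoding, reconstructed from Get/SetBuddySegState below (an explanatory aid, not
// authoritative documentation): every minimal block owns a 2-bit slot in map_, so one byte covers four
// consecutive relative addresses. A size-1 segment is described by its 2-bit slot (ALLOC_BIT + ONE_BIT),
// a size-2 segment by a 4-bit nibble (TWO_BIT), and a segment of size 4 or more starts on a 4-block
// boundary and uses a whole byte: MORE_BIT plus a 4-bit log field lg, with the size recovered as 1 << (lg + 2).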
void BuddySpace::GetBuddySegState(const rel_addr_t rel_addr, size_t *rel_sz, STATE *st) const {
char byte;
int pos;
int offset;
uint64_t val = 0;
int shift;
pos = BitRightShift(rel_addr, 2);
offset = rel_addr % 4;
shift = offset * 2;
byte = map_[pos];
switch (offset) {
case 0:
val = byte;
break;
case 1:
case 3:
if (offset == 1) {
val = BitLeftShift(BitAnd(byte, 0x30), shift);
} else {
val = BitLeftShift(BitAnd(byte, 0x03), shift);
}
break;
case 2:
val = BitLeftShift(BitAnd(byte, 0x0F), shift);
break;
}
if (BitAnd(val, ONE_BIT)) {
*rel_sz = 1;
} else if (BitAnd(val, TWO_BIT)) {
*rel_sz = 2;
} else if (BitAnd(val, MORE_BIT)) {
log_t lg = BitAnd(val, 0x0F);
*rel_sz = BitLeftShift(1, lg + 2);
} else {
*st = STATE::kEmpty;
return;
}
*st = BitAnd(val, ALLOC_BIT) ? STATE::kAlloc : STATE::kFree;
}
void BuddySpace::SetBuddySegState(rel_addr_t rel_addr, size_t rel_sz, STATE st) {
int clr;
int mask;
int pos;
int offset;
int val = 0;
int shift;
auto log_sz = static_cast<log_t>(Log2(rel_sz));
pos = BitRightShift(rel_addr, 2);
offset = rel_addr % 4;
shift = offset * 2;
if (rel_sz == 1) {
val = ONE_BIT;
mask = 0xC0;
} else if (rel_sz == 2) {
val = TWO_BIT;
mask = 0xF0;
} else {
val = BitOr(log_sz - 2, MORE_BIT);
mask = 0xFF;
}
if (st == STATE::kAlloc) {
val = BitOr(val, ALLOC_BIT);
} else if (st == STATE::kFree) {
val = BitAnd(val, ~(static_cast<uint64_t>(ALLOC_BIT)));
} else if (st == STATE::kEmpty) {
val = 0;
}
clr = static_cast<int>(~(BitRightShift(mask, shift)));
map_[pos] = static_cast<char>(BitAnd(map_[pos], clr));
map_[pos] = static_cast<char>(BitOr(map_[pos], BitRightShift(val, shift)));
if (st == STATE::kAlloc) {
count_[log_sz]--;
} else if (st == STATE::kFree) {
count_[log_sz]++;
if (rel_addr < hint_[log_sz]) {
hint_[log_sz] = rel_addr;
}
}
}
void BuddySpace::JoinBuddySeg(rel_addr_t addr, size_t blk_sz) {
while (blk_sz < BitLeftShift(1, num_lvl_)) {
rel_addr_t buddy = BitEx(addr, blk_sz);
size_t sz = 0;
STATE st;
GetBuddySegState(buddy, &sz, &st);
if (st == STATE::kFree && sz == blk_sz) {
auto log_sz = static_cast<log_t>(Log2(blk_sz));
rel_addr_t left = (buddy < addr) ? buddy : addr;
rel_addr_t right = left + blk_sz;
DS_ASSERT(count_[log_sz] >= 2);
count_[log_sz] -= 2;
SetBuddySegState(right, blk_sz, STATE::kEmpty);
SetBuddySegState(left, BitLeftShift(blk_sz, 1), STATE::kFree);
for (int i = 0; i < log_sz; i++) {
if (hint_[i] == right) {
hint_[i] = left;
}
}
addr = left;
blk_sz <<= 1u;
} else {
break;
}
}
}
void BuddySpace::TrimBuddySeg(rel_addr_t addr, size_t blk_sz, size_t ask_sz) {
DS_ASSERT(ask_sz < blk_sz);
uint32_t inx = Log2(blk_sz);
size_t remaining_sz = ask_sz;
for (int i = inx; i > 0; i--) {
size_t b_size = BitLeftShift(1, i);
size_t half_sz = BitRightShift(b_size, 1);
count_[i]--;
SetBuddySegState(addr, half_sz, STATE::kFree);
SetBuddySegState(addr + half_sz, half_sz, STATE::kFree);
if (remaining_sz >= half_sz) {
SetBuddySegState(addr, half_sz, STATE::kAlloc);
remaining_sz -= half_sz;
if (remaining_sz == 0) {
break;
}
addr += half_sz;
}
}
}
void BuddySpace::UnTrimBuddySeg(rel_addr_t addr, size_t blk_sz, size_t ask_sz) {
DS_ASSERT(ask_sz < blk_sz);
uint32_t inx = Log2(blk_sz);
size_t remaining_sz = ask_sz;
for (int i = inx; i > 0; i--) {
size_t b_size = BitLeftShift(1, i);
size_t half_sz = BitRightShift(b_size, 1);
if (remaining_sz >= half_sz) {
#ifdef DEBUG
{
size_t sz = 0;
STATE st;
GetBuddySegState(addr, &sz, &st);
DS_ASSERT(sz == half_sz && st == STATE::kAlloc);
}
#endif
SetBuddySegState(addr, half_sz, STATE::kFree);
remaining_sz -= half_sz;
if (remaining_sz == 0) {
JoinBuddySeg(addr, half_sz);
break;
}
addr += half_sz;
}
}
}
rel_addr_t BuddySpace::AllocBuddySeg(uint32_t req_size) noexcept {
uint32_t blk_size = NextPowerOf2(req_size);
int start_inx = static_cast<int>(Log2(blk_size));
bool found = false;
rel_addr_t ask_addr = 0;
auto max_addr = static_cast<rel_addr_t>(BitLeftShift(1, num_lvl_ - 1));
STATE st;
size_t sz = 0;
for (int i = start_inx; !found && i < num_lvl_; i++) {
DS_ASSERT(count_[i] >= 0);
if (count_[i] == 0) {
continue;
}
auto blk_sz = static_cast<size_t>(BitLeftShift(1, i));
ask_addr = hint_[i];
while (ask_addr < max_addr && !found) {
GetBuddySegState(ask_addr, &sz, &st);
if (st == STATE::kFree && sz == blk_sz) {
found = true;
} else {
DS_ASSERT(st != STATE::kEmpty);
ask_addr += ((sz > blk_sz) ? sz : blk_sz);
}
}
}
if (found) {
if (sz > req_size) {
TrimBuddySeg(ask_addr, sz, req_size);
} else {
SetBuddySegState(ask_addr, sz, STATE::kAlloc);
hint_[start_inx] = ask_addr;
}
return ask_addr;
} else {
return static_cast<rel_addr_t>(NOSPACE);
}
}
void BuddySpace::FreeBuddySeg(rel_addr_t addr, size_t blk_size, size_t req_size) {
if (req_size == blk_size) {
#ifdef DEBUG
{
size_t sz = 0;
STATE st;
GetBuddySegState(addr, &sz, &st);
}
#endif
SetBuddySegState(addr, blk_size, STATE::kFree);
JoinBuddySeg(addr, blk_size);
} else {
UnTrimBuddySeg(addr, blk_size, req_size);
}
}
int BuddySpace::PercentFree() const {
uint64_t total_free_sz = 0;
uint64_t max_sz_in_unit = BitLeftShift(1, num_lvl_ - 1);
// Go through the count array without lock
for (int i = 0; i < num_lvl_; i++) {
int cnt = count_[i];
if (cnt == 0) {
continue;
}
uint64_t blk_sz = BitLeftShift(1, i);
total_free_sz += (blk_sz * cnt);
}
return static_cast<int>(static_cast<float>(total_free_sz) / static_cast<float>(max_sz_in_unit) * 100);
}
BuddySpace::BuddySpace(int log_min, int num_lvl)
: hint_(nullptr),
count_(nullptr),
map_(nullptr),
log_min_(log_min),
num_lvl_(num_lvl),
min_(0),
max_(0),
ptr_(nullptr) {}
BuddySpace::~BuddySpace() {
if (ptr_ != nullptr) {
free(ptr_);
}
hint_ = nullptr;
count_ = nullptr;
map_ = nullptr;
}
Status BuddySpace::CreateBuddySpace(std::unique_ptr<BuddySpace> *out_bs, int log_min, int num_lvl) {
Status rc;
auto bs = new (std::nothrow) BuddySpace(log_min, num_lvl);
if (bs == nullptr) {
return Status(StatusCode::kOutOfMemory);
}
rc = bs->Init();
if (rc.IsOk()) {
(*out_bs).reset(bs);
} else {
delete bs;
}
return rc;
}
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_BUDDY_H_
#define DATASET_UTIL_BUDDY_H_
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include "dataset/util/status.h"
using addr_t = int64_t;
using rel_addr_t = int32_t;
using log_t = int;
#define ALLOC_BIT 0x80
#define ONE_BIT 0x40
#define TWO_BIT 0x20
#define MORE_BIT 0x10
#define NOSPACE ((addr_t)(-1))
namespace mindspore {
namespace dataset {
struct BSpaceDescriptor {
int32_t sig;
rel_addr_t addr;
size_t req_size;
size_t blk_size;
};
class BuddySpace {
public:
// C++11 feature: the 'class' keyword makes STATE a type-safe scoped enum.
// Don't take out the keyword 'class'.
enum class STATE { kFree, kAlloc, kEmpty };
BuddySpace(const BuddySpace &) = delete;
BuddySpace &operator=(const BuddySpace &) = delete;
virtual ~BuddySpace();
Status Alloc(uint64_t sz, BSpaceDescriptor *desc, addr_t *) noexcept;
void Free(const BSpaceDescriptor *desc);
uint64_t GetMinSize() const { return min_; }
uint64_t GetMaxSize() const { return max_; }
int PercentFree() const;
friend std::ostream &operator<<(std::ostream &os, const BuddySpace &s);
static uint64_t NextPowerOf2(uint64_t n) {
if (n <= 1) {
return 1;
}
n = n - 1;
while (n & (n - 1)) {
n = n & (n - 1);
}
return n << 1;
}
static uint32_t Log2(uint64_t n) {
uint32_t cnt = 0;
while (n >>= 1) {
cnt++;
}
return cnt;
}
static Status CreateBuddySpace(std::unique_ptr<BuddySpace> *out_bs, int log_min = 15, int num_lvl = 18);
private:
rel_addr_t *hint_;
int *count_;
char *map_;
int log_min_;
int num_lvl_;
uint64_t min_;
uint64_t max_;
void *ptr_;
std::mutex mutex_;
explicit BuddySpace(int log_min = 15, int num_lvl = 18);
Status Init();
addr_t AllocNoLock(const uint64_t sz, BSpaceDescriptor *desc) noexcept;
void FreeNoLock(const BSpaceDescriptor *desc);
uint32_t SizeToBlock(const uint64_t sz) const {
uint32_t reqSize = (sz / min_);
if (sz % min_) {
reqSize++;
}
return reqSize;
}
void GetBuddySegState(const rel_addr_t rel_addr, size_t *rel_sz, STATE *st) const;
void SetBuddySegState(rel_addr_t rel_addr, size_t rel_sz, STATE st);
void JoinBuddySeg(rel_addr_t addr, size_t blk_sz);
void TrimBuddySeg(rel_addr_t addr, size_t blk_sz, size_t ask_sz);
void UnTrimBuddySeg(rel_addr_t addr, size_t blk_sz, size_t ask_sz);
rel_addr_t AllocBuddySeg(uint32_t req_size) noexcept;
void FreeBuddySeg(rel_addr_t addr, size_t blk_size, size_t req_size);
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_UTIL_BUDDY_H_
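For reference, NextPowerOf2 above repeatedly clears the lowest set bit via n & (n - 1) until a single bit remains, then doubles it (e.g. NextPowerOf2(13) == 16), and Log2 counts right shifts (Log2(13) == 3). A minimal usage sketch of the public interface, inside a function returning Status (the 1 MB request size is illustrative):

std::unique_ptr<BuddySpace> bs;
RETURN_IF_NOT_OK(BuddySpace::CreateBuddySpace(&bs));  // defaults: log_min = 15, num_lvl = 18
BSpaceDescriptor bspd{};                              // filled in by Alloc; identifies the segment to Free
addr_t addr = 0;
RETURN_IF_NOT_OK(bs->Alloc(1 << 20, &bspd, &addr));   // returns kNoSpace when the space is full
bs->Free(&bspd);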
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include "common/utils.h"
#include "dataset/util/cache_pool.h"
#include "dataset/util/services.h"
namespace mindspore {
namespace dataset {
CachePool::CachePool(const value_allocator &alloc, const std::string &root)
: alloc_(alloc), root_(root), subfolder_(Services::GetUniqueID()), sm_(nullptr), tree_(nullptr) {}
Status CachePool::DoServiceStart() {
tree_ = std::make_shared<data_index>();
// If we are given a disk path, set up the StorageManager
if (!root_.toString().empty()) {
Path spill = GetSpillPath();
RETURN_IF_NOT_OK(spill.CreateDirectories());
sm_ = std::make_shared<StorageManager>(spill);
RETURN_IF_NOT_OK(sm_->ServiceStart());
MS_LOG(INFO) << "CachePool will use disk folder: " << common::SafeCStr(spill.toString());
}
return Status::OK();
}
Status CachePool::DoServiceStop() {
Status rc;
Status rc2;
if (sm_ != nullptr) {
rc = sm_->ServiceStop();
if (rc.IsError()) {
rc2 = rc;
}
}
sm_.reset();
for (auto &bl : *tree_) {
if (bl.ptr != nullptr) {
alloc_.deallocate(bl.ptr, bl.sz);
}
}
tree_.reset();
if (!root_.toString().empty()) {
Path spill = GetSpillPath();
auto it = Path::DirIterator::OpenDirectory(&spill);
while (it->hasNext()) {
rc = it->next().Remove();
if (rc.IsError() && rc2.IsOk()) {
rc2 = rc;
}
}
rc = spill.Remove();
if (rc.IsError() && rc2.IsOk()) {
rc2 = rc;
}
}
return rc2;
}
CachePool::~CachePool() noexcept { (void)ServiceStop(); }
Status CachePool::Insert(const std::vector<ReadableSlice> &buf, CachePool::key_type *key) {
DataLocator bl;
Status rc;
size_t sz = 0;
// We will consolidate all the slices into one piece.
for (auto &v : buf) {
sz += v.GetSize();
}
bl.sz = sz;
try {
bl.ptr = alloc_.allocate(sz);
// We will do a piecewise copy.
WritableSlice dest(bl.ptr, bl.sz);
size_t pos = 0;
for (auto &v : buf) {
WritableSlice out(dest, pos);
rc = WritableSlice::Copy(&out, v);
if (rc.IsError()) {
break;
}
pos += v.GetSize();
}
if (rc.IsError()) {
alloc_.deallocate(bl.ptr, sz);
bl.ptr = nullptr;
return rc;
}
} catch (std::bad_alloc &e) {
if (sm_ != nullptr) {
RETURN_IF_NOT_OK(sm_->Write(&bl.storage_key, buf));
// By the design of AutoIndexObj, 0 is never a valid key, and the code below relies on that assumption.
// Make sure it is not 0.
if (bl.storage_key == 0) {
RETURN_STATUS_UNEXPECTED("Key 0 is returned which is unexpected");
}
} else {
return Status(StatusCode::kOutOfMemory, __LINE__, __FILE__);
}
}
rc = tree_->insert(bl, key);
if (rc.IsError() && bl.ptr != nullptr) {
alloc_.deallocate(bl.ptr, sz);
}
return rc;
}
Status CachePool::Read(CachePool::key_type key, WritableSlice *dest, size_t *bytesRead) const {
RETURN_UNEXPECTED_IF_NULL(dest);
auto r = tree_->Search(key);
if (r.second) {
auto &it = r.first;
if (it->ptr != nullptr) {
ReadableSlice src(it->ptr, it->sz);
RETURN_IF_NOT_OK(WritableSlice::Copy(dest, src));
} else if (sm_ != nullptr) {
size_t expectedLength = 0;
RETURN_IF_NOT_OK(sm_->Read(it->storage_key, dest, &expectedLength));
if (expectedLength != it->sz) {
MS_LOG(ERROR) << "Unexpected length. Read " << expectedLength << ". Expected " << it->sz << "."
<< " Internal key: " << key << "\n";
RETURN_STATUS_UNEXPECTED("Length mismatch. See log file for details.");
}
}
if (bytesRead != nullptr) {
*bytesRead = it->sz;
}
} else {
RETURN_STATUS_UNEXPECTED("Key not found");
}
return Status::OK();
}
const CachePool::value_allocator &CachePool::get_allocator() const { return alloc_; }
Path CachePool::GetSpillPath() const {
auto spill = Path(root_) / subfolder_;
return spill;
}
CachePool::CacheStat CachePool::GetStat() const {
CacheStat cs{0};
for (auto &it : *tree_) {
if (it.ptr != nullptr) {
++cs.num_mem_cached;
} else {
++cs.num_disk_cached;
}
}
return cs;
}
Status CachePool::Spill(CachePool::DataLocator *dl) {
if (sm_ == nullptr) {
RETURN_STATUS_UNEXPECTED("No disk storage to spill");
}
RETURN_UNEXPECTED_IF_NULL(dl);
RETURN_UNEXPECTED_IF_NULL(dl->ptr);
if (dl->storage_key == 0) {
ReadableSlice data(dl->ptr, dl->sz);
RETURN_IF_NOT_OK(sm_->Write(&dl->storage_key, {data}));
}
alloc_.deallocate(dl->ptr, dl->sz);
dl->ptr = nullptr;
return Status::OK();
}
Status CachePool::Locate(CachePool::DataLocator *dl) {
RETURN_UNEXPECTED_IF_NULL(dl);
if (dl->ptr == nullptr) {
if (sm_ == nullptr) {
RETURN_STATUS_UNEXPECTED("No disk storage to locate the data");
}
try {
dl->ptr = alloc_.allocate(dl->sz);
WritableSlice dest(dl->ptr, dl->sz);
Status rc = Read(dl->storage_key, &dest);
if (rc.IsError()) {
alloc_.deallocate(dl->ptr, dl->sz);
dl->ptr = nullptr;
return rc;
}
} catch (const std::bad_alloc &e) {
return Status(StatusCode::kOutOfMemory, __LINE__, __FILE__);
}
}
return Status::OK();
}
size_t CachePool::GetSize(CachePool::key_type key) const {
auto r = tree_->Search(key);
if (r.second) {
auto &it = r.first;
return it->sz;
} else {
return 0;
}
}
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_CACHE_POOL_H_
#define DATASET_UTIL_CACHE_POOL_H_
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "dataset/util/allocator.h"
#include "dataset/util/service.h"
#include "dataset/util/slice.h"
#include "dataset/util/storage_manager.h"
#include "dataset/util/auto_index.h"
namespace mindspore {
namespace dataset {
/// \brief A CachePool provides a service to back up and restore a buffer. A buffer is represented as a vector of
/// ReadableSlice whose memory blocks are all copied into one contiguous block, kept either in memory or spilled
/// to disk (if a disk directory is provided). Every buffer insert returns a generated key which can later be
/// used to restore the buffer.
/// \see ReadableSlice
class CachePool : public Service {
public:
using base_type = uint8_t;
using pointer = base_type *;
using const_pointer = const base_type *;
using reference = base_type &;
using const_reference = const base_type &;
using value_allocator = Allocator<base_type>;
// An internal class to locate the whereabouts of a backed-up buffer, which can be either in memory or on disk.
class DataLocator {
public:
DataLocator() : ptr(nullptr), sz(0), storage_key(0) {}
~DataLocator() = default;
DataLocator(const DataLocator &other) = default;
DataLocator &operator=(const DataLocator &other) = default;
DataLocator(DataLocator &&other) noexcept {
ptr = other.ptr;
sz = other.sz;
storage_key = other.storage_key;
other.ptr = nullptr;
other.sz = 0;
other.storage_key = 0;
}
DataLocator &operator=(DataLocator &&other) noexcept {
if (&other != this) {
ptr = other.ptr;
sz = other.sz;
storage_key = other.storage_key;
other.ptr = nullptr;
other.sz = 0;
other.storage_key = 0;
}
return *this;
}
pointer ptr;
size_t sz;
StorageManager::key_type storage_key;
};
using data_index = AutoIndexObj<DataLocator>;
using key_type = data_index::key_type;
using bl_alloc_type = typename value_allocator::template rebind<DataLocator>::other;
/// \brief Simple statistics returned from CachePool like how many elements are cached in memory and
/// how many elements are spilled to disk.
struct CacheStat {
int64_t num_mem_cached;
int64_t num_disk_cached;
};
/// \brief Constructor
/// \param alloc Allocator to allocate memory from
/// \param root Optional disk folder to spill
explicit CachePool(const value_allocator &alloc, const std::string &root = "");
CachePool(const CachePool &) = delete;
CachePool(CachePool &&) = delete;
CachePool &operator=(const CachePool &) = delete;
CachePool &operator=(CachePool &&) = delete;
~CachePool() noexcept;
Status DoServiceStart() override;
Status DoServiceStop() override;
Path GetSpillPath() const;
/// \brief Insert a sequence of ReadableSlice objects into the pool.
/// All memory blocks will be consolidated into one contiguous block and be cached in either memory or on disk.
/// \param[in] buf A sequence of ReadableSlice objects.
/// \param[out] key Generated key
/// \return Error code
Status Insert(const std::vector<ReadableSlice> &buf, key_type *key);
/// \brief Restore a cached buffer (from memory or disk)
/// \param[in] key A previous key returned from Insert
/// \param[out] dest The cached buffer will be copied to this destination represented by a WritableSlice
/// \param[out] bytesRead Optional. Number of bytes read.
/// \return Error code
Status Read(key_type key, WritableSlice *dest, size_t *bytesRead = nullptr) const;
Status Spill(DataLocator *dl);
Status Locate(DataLocator *dl);
size_t GetSize(key_type key) const;
/// \brief Get statistics.
/// \return CacheStat object
CacheStat GetStat() const;
const value_allocator &get_allocator() const;
std::string MyName() const { return subfolder_; }
private:
value_allocator alloc_;
Path root_;
const std::string subfolder_;
std::shared_ptr<StorageManager> sm_;
std::shared_ptr<data_index> tree_;
};
} // namespace dataset
} // namespace mindspore
#endif  // DATASET_UTIL_CACHE_POOL_H_
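A minimal usage sketch of the interface above, inside a function returning Status and assuming ServiceStart/ServiceStop from the Service base class (the spill directory and payload are illustrative):

CachePool pool(SystemPool::GetAllocator<uint8_t>(), "/tmp/spill");
RETURN_IF_NOT_OK(pool.ServiceStart());
std::vector<uint8_t> payload(1024, 0xAB);
CachePool::key_type key;
RETURN_IF_NOT_OK(pool.Insert({ReadableSlice(payload.data(), payload.size())}, &key));
std::vector<uint8_t> out(pool.GetSize(key));
WritableSlice dest(out.data(), out.size());
RETURN_IF_NOT_OK(pool.Read(key, &dest));  // restores from memory, or from disk if spilled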
......@@ -106,6 +106,24 @@ struct List {
++count;
}
// Insert elem2 before elem1 in the list.
virtual void InsertBefore(pointer elem1, pointer elem2) {
DS_ASSERT(elem1 != elem2);
Node<T> &elem1_node = elem1->*node;
Node<T> &elem2_node = elem2->*node;
elem2_node.next = elem1;
elem2_node.prev = elem1_node.prev;
if (elem1_node.prev != nullptr) {
Node<T> &prev_node = elem1_node.prev->*node;
prev_node.next = elem2;
}
elem1_node.prev = elem2;
if (head == elem1) {
head = elem2;
}
++count;
}
// Remove an element in the list
virtual void Remove(pointer elem) noexcept {
Node<T> &elem_node = elem->*node;
......
......@@ -44,20 +44,6 @@ class MemoryPool {
virtual ~MemoryPool() {}
};
// Used by unique_ptr
template <typename T>
class Deleter {
public:
explicit Deleter(std::shared_ptr<MemoryPool> &mp) : mp_(mp) {}
~Deleter() = default;
void operator()(T *ptr) const { mp_->Deallocate(ptr); }
private:
std::shared_ptr<MemoryPool> mp_;
};
Status DeMalloc(std::size_t s, void **p, bool);
} // namespace dataset
} // namespace mindspore
......
......@@ -16,6 +16,8 @@
#include "dataset/util/path.h"
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <new>
#include <sstream>
#include <utility>
......@@ -26,7 +28,7 @@
namespace mindspore {
namespace dataset {
#ifdef _WIN32
#if defined(_WIN32) || defined(_WIN64)
char Path::separator_ = '\\';
#else
char Path::separator_ = '/';
......@@ -132,7 +134,7 @@ Status Path::CreateDirectory() {
#if defined(_WIN32) || defined(_WIN64)
int rc = mkdir(common::SafeCStr(path_));
#else
int rc = mkdir(common::SafeCStr(path_), 0700);
int rc = mkdir(common::SafeCStr(path_), S_IRUSR | S_IWUSR | S_IXUSR);
#endif
if (rc) {
std::ostringstream oss;
......@@ -182,6 +184,111 @@ Status Path::CreateDirectories() {
return Status::OK();
}
Status Path::Remove() {
if (Exists()) {
if (IsDirectory()) {
errno_t err = rmdir(common::SafeCStr(path_));
if (err == -1) {
std::ostringstream oss;
oss << "Unable to delete directory " << path_ << ". Errno = " << errno;
RETURN_STATUS_UNEXPECTED(oss.str());
}
} else {
errno_t err = unlink(common::SafeCStr(path_));
if (err == -1) {
std::ostringstream oss;
oss << "Unable to delete file " << path_ << ". Errno = " << errno;
RETURN_STATUS_UNEXPECTED(oss.str());
}
}
}
return Status::OK();
}
Status Path::CreateFile(int *file_descriptor) { return OpenFile(file_descriptor, true); }
Status Path::OpenFile(int *file_descriptor, bool create) {
int fd;
if (file_descriptor == nullptr) {
RETURN_STATUS_UNEXPECTED("null pointer");
}
if (IsDirectory()) {
std::ostringstream oss;
oss << "Unable to create file " << path_ << " which is a directory.";
RETURN_STATUS_UNEXPECTED(oss.str());
}
// Convert to canonical form.
if (strlen(common::SafeCStr(path_)) > PATH_MAX) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
char canonical_path[PATH_MAX + 1] = {0x00};
#if defined(_WIN32) || defined(_WIN64)
if (_fullpath(canonical_path, common::SafeCStr(path_), PATH_MAX) == nullptr) {
#else
if (realpath(common::SafeCStr(path_), canonical_path) == nullptr) {
#endif
if (errno == ENOENT && create) {
// File doesn't exist and we are to create it. Let's break it down.
auto file_part = Basename();
auto parent_part = ParentPath();
#if defined(_WIN32) || defined(_WIN64)
if (_fullpath(canonical_path, common::SafeCStr(parent_part), PATH_MAX) == nullptr) {
#else
if (realpath(common::SafeCStr(parent_part), canonical_path) == nullptr) {
#endif
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
auto cur_inx = strlen(canonical_path);
if ((cur_inx + file_part.length() + 1) > PATH_MAX) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
canonical_path[cur_inx++] = separator_;
if (strncpy_s(canonical_path + cur_inx, PATH_MAX - cur_inx, common::SafeCStr(file_part), file_part.length()) !=
EOK) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
} else {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
}
if (create) {
fd = open(canonical_path, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP);
} else {
fd = open(canonical_path, O_RDWR);
}
if (fd == -1) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
*file_descriptor = fd;
return Status::OK();
}
Status Path::CloseFile(int fd) const {
if (close(fd) < 0) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
return Status::OK();
}
Status Path::TruncateFile(int fd) const {
int rc;
rc = ftruncate(fd, 0);
if (rc == 0) {
return Status::OK();
} else {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
}
std::string Path::Basename() {
std::size_t found = path_.find_last_of(separator_);
if (found != std::string::npos) {
return path_.substr(found + 1);
} else {
return path_;
}
}
std::shared_ptr<Path::DirIterator> Path::DirIterator::OpenDirectory(Path *f) {
auto it = new (std::nothrow) DirIterator(f);
......@@ -208,7 +315,7 @@ Path::DirIterator::~DirIterator() {
Path::DirIterator::DirIterator(Path *f) : dir_(f), dp_(nullptr), entry_(nullptr) {
MS_LOG(DEBUG) << "Open directory " << f->toString() << ".";
dp_ = opendir(common::SafeCStr(f->toString()));
dp_ = opendir(f->toString().c_str());
}
bool Path::DirIterator::hasNext() {
......@@ -225,5 +332,10 @@ bool Path::DirIterator::hasNext() {
}
Path Path::DirIterator::next() { return (*(this->dir_) / Path(entry_->d_name)); }
std::ostream &operator<<(std::ostream &os, const Path &s) {
os << s.path_;
return os;
}
} // namespace dataset
} // namespace mindspore
......@@ -90,6 +90,20 @@ class Path {
std::string ParentPath();
Status Remove();
Status CreateFile(int *fd);
Status OpenFile(int *fd, bool create = false);
Status CloseFile(int fd) const;
Status TruncateFile(int fd) const;
std::string Basename();
friend std::ostream &operator<<(std::ostream &os, const Path &s);
private:
static char separator_;
std::string path_;
......
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/util/semaphore.h"
#include "dataset/util/task_manager.h"
namespace mindspore {
namespace dataset {
Status Semaphore::P() {
std::unique_lock<std::mutex> lck(mutex_);
RETURN_IF_NOT_OK(wait_cond_.Wait(&lck, [this]() { return value_ > 0; }));
--value_;
return Status::OK();
}
void Semaphore::V() {
std::unique_lock<std::mutex> lck(mutex_);
++value_;
wait_cond_.NotifyOne();
}
int Semaphore::Peek() {
std::unique_lock<std::mutex> lck(mutex_);
return value_;
}
Status Semaphore::Register(TaskGroup *vg) { return wait_cond_.Register(vg->GetIntrpService()); }
Status Semaphore::Deregister() { return (wait_cond_.Deregister()); }
void Semaphore::ResetIntrpState() { wait_cond_.ResetIntrpState(); }
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_SEMAPHORE_H_
#define DATASET_UTIL_SEMAPHORE_H_
#include "dataset/util/cond_var.h"
namespace mindspore {
namespace dataset {
class TaskGroup;
/// \brief A counting semaphore. There are two external functions P and V. P decrements the internal count and will be
/// blocked if the count is 0 (zero). V increments the internal count and wakes up one of the waiters.
class Semaphore {
public:
/// \brief Constructor
/// \param init Initial value of the internal counter.
explicit Semaphore(int init) : value_(init) {}
virtual ~Semaphore() {}
/// \brief Decrement the internal counter. Will be blocked if the value is 0.
/// \return Error code. Can be interrupted.
Status P();
/// \brief Increment the internal counter. Wake up one of the waiters, if any.
void V();
/// \brief Peek the internal value
/// \return The internal value
int Peek();
Status Register(TaskGroup *vg);
Status Deregister();
void ResetIntrpState();
private:
int value_;
std::mutex mutex_;
CondVar wait_cond_;
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_UTIL_SEMAPHORE_H_
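A minimal sketch of the P/V protocol, with an initial count of 0 (illustrative):

Semaphore sem(0);                 // nothing available initially
// Consumer side, inside a function returning Status:
//   RETURN_IF_NOT_OK(sem.P());   // blocks until the count is > 0, then decrements
// Producer side:
//   sem.V();                     // increments the count and wakes one waiter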
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/util/slice.h"
namespace mindspore {
namespace dataset {
WritableSlice::WritableSlice(const WritableSlice &src, off64_t offset, size_t len) : ReadableSlice(src, offset, len) {
mutable_data_ = static_cast<char *>(src.mutable_data_) + offset;
}
WritableSlice::WritableSlice(const WritableSlice &src, off64_t offset)
: WritableSlice(src, offset, src.GetSize() - offset) {}
Status WritableSlice::Copy(WritableSlice *dest, const ReadableSlice &src) {
RETURN_UNEXPECTED_IF_NULL(dest);
RETURN_UNEXPECTED_IF_NULL(dest->GetMutablePointer());
if (dest->GetSize() <= 0) {
RETURN_STATUS_UNEXPECTED("Destination length is non-positive");
}
auto err = memcpy_s(dest->GetMutablePointer(), dest->GetSize(), src.GetPointer(), src.GetSize());
if (err) {
RETURN_STATUS_UNEXPECTED(std::to_string(err));
}
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_SLICE_H_
#define DATASET_UTIL_SLICE_H_
#include <unistd.h>
#include <cstddef>
#include <utility>
#include "./securec.h"
#include "dataset/util/allocator.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
/// \brief A ReadableSlice wraps a const pointer in memory and its size.
/// \see WritableSlice for a non-const version
///
class ReadableSlice {
public:
ReadableSlice() : ptr_(nullptr), sz_(0) {}
ReadableSlice(const void *ptr, size_t sz) : ptr_(ptr), sz_(sz) {}
ReadableSlice(const ReadableSlice &src, off64_t offset, size_t len) {
ptr_ = static_cast<const char *>(src.GetPointer()) + offset;
sz_ = len;
}
ReadableSlice(const ReadableSlice &src, off64_t offset) : ReadableSlice(src, offset, src.sz_ - offset) {}
ReadableSlice(const ReadableSlice &lhs) {
ptr_ = lhs.ptr_;
sz_ = lhs.sz_;
}
ReadableSlice &operator=(const ReadableSlice &lhs) {
if (this != &lhs) {
ptr_ = lhs.ptr_;
sz_ = lhs.sz_;
}
return *this;
}
ReadableSlice(ReadableSlice &&lhs) noexcept {
if (this != &lhs) {
ptr_ = lhs.ptr_;
sz_ = lhs.sz_;
lhs.ptr_ = nullptr;
lhs.sz_ = 0;
}
}
ReadableSlice &operator=(ReadableSlice &&lhs) noexcept {
if (this != &lhs) {
ptr_ = lhs.ptr_;
sz_ = lhs.sz_;
lhs.ptr_ = nullptr;
lhs.sz_ = 0;
}
return *this;
}
/// \brief Getter function
/// \return Const version of the pointer
const void *GetPointer() const { return ptr_; }
/// \brief Getter function
/// \return Size of the slice
size_t GetSize() const { return sz_; }
bool empty() const { return ptr_ == nullptr; }
private:
const void *ptr_;
size_t sz_;
};
/// \brief A WritableSlice inherits from ReadableSlice to allow
/// one to write to the address pointed to by the pointer.
///
class WritableSlice : public ReadableSlice {
public:
friend class StorageContainer;
/// \brief Default constructor
WritableSlice() : ReadableSlice(), mutable_data_(nullptr) {}
/// \brief This form of a constructor takes a pointer and its size.
WritableSlice(void *ptr, size_t sz) : ReadableSlice(ptr, sz), mutable_data_(ptr) {}
WritableSlice(const WritableSlice &src, off64_t offset, size_t len);
WritableSlice(const WritableSlice &src, off64_t offset);
WritableSlice(const WritableSlice &lhs) : ReadableSlice(lhs) { mutable_data_ = lhs.mutable_data_; }
WritableSlice &operator=(const WritableSlice &lhs) {
if (this != &lhs) {
mutable_data_ = lhs.mutable_data_;
ReadableSlice::operator=(lhs);
}
return *this;
}
WritableSlice(WritableSlice &&lhs) noexcept : ReadableSlice(std::move(lhs)) {
if (this != &lhs) {
mutable_data_ = lhs.mutable_data_;
lhs.mutable_data_ = nullptr;
}
}
WritableSlice &operator=(WritableSlice &&lhs) noexcept {
if (this != &lhs) {
mutable_data_ = lhs.mutable_data_;
lhs.mutable_data_ = nullptr;
ReadableSlice::operator=(std::move(lhs));
}
return *this;
}
/// \brief Copy the content from one slice onto another.
static Status Copy(WritableSlice *dest, const ReadableSlice &src);
private:
void *mutable_data_;
void *GetMutablePointer() { return mutable_data_; }
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_UTIL_SLICE_H_
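A minimal sketch of WritableSlice::Copy (buffer contents and sizes are illustrative):

char src_buf[16] = "hello";
char dst_buf[16] = {0};
ReadableSlice src(src_buf, sizeof(src_buf));
WritableSlice dest(dst_buf, sizeof(dst_buf));
Status rc = WritableSlice::Copy(&dest, src);  // memcpy_s underneath; fails if dest is smaller than src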
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/util/storage_container.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <vector>
#include "common/utils.h"
#include "dataset/util/de_error.h"
#include "dataset/util/path.h"
#include "dataset/util/status.h"
#include "utils/log_adapter.h"
namespace mindspore {
namespace dataset {
Status StorageContainer::Create() {
RETURN_IF_NOT_OK(BuddySpace::CreateBuddySpace(&bs_));
RETURN_IF_NOT_OK(cont_.CreateFile(&fd_));
is_open_ = true;
MS_LOG(INFO) << "Container " << cont_ << " created";
return Status::OK();
}
Status StorageContainer::Open() noexcept {
std::lock_guard<std::mutex> lck(mutex_);
// Check again
if (!is_open_) {
RETURN_IF_NOT_OK(cont_.OpenFile(&fd_));
is_open_ = true;
}
return Status::OK();
}
Status StorageContainer::Close() noexcept {
if (is_open_) {
std::lock_guard<std::mutex> lck(mutex_);
// Check again
if (is_open_) {
RETURN_IF_NOT_OK(cont_.CloseFile(fd_));
is_open_ = false;
fd_ = -1;
}
}
return Status::OK();
}
Status StorageContainer::Read(WritableSlice *dest, off64_t offset) const noexcept {
DS_ASSERT(is_open_);
RETURN_UNEXPECTED_IF_NULL(dest);
auto sz = dest->GetSize();
#if defined(_WIN32) || defined(_WIN64)
// There doesn't seem to be a pread64 on MinGW,
// so we do a seek and then a read under
// the protection of a mutex.
std::lock_guard<std::mutex> lck(mutex_);
auto seek_err = lseek(fd_, offset, SEEK_SET);
if (seek_err < 0) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
auto r_sz = read(fd_, dest->GetMutablePointer(), sz);
#else
auto r_sz = pread64(fd_, dest->GetMutablePointer(), sz, offset);
#endif
if (r_sz != sz) {
errno_t err = (r_sz == 0) ? EOF : errno;
RETURN_STATUS_UNEXPECTED(strerror(err));
}
return Status::OK();
}
Status StorageContainer::Write(const ReadableSlice &dest, off64_t offset) const noexcept {
DS_ASSERT(is_open_);
auto sz = dest.GetSize();
#if defined(_WIN32) || defined(_WIN64)
// There doesn't seem to be a pwrite64 on MinGW,
// so we do a seek and then a write under
// the protection of a mutex.
std::lock_guard<std::mutex> lck(mutex_);
auto seek_err = lseek(fd_, offset, SEEK_SET);
if (seek_err < 0) {
RETURN_STATUS_UNEXPECTED(strerror(errno));
}
auto r_sz = write(fd_, dest.GetPointer(), sz);
#else
auto r_sz = pwrite64(fd_, dest.GetPointer(), sz, offset);
#endif
if (r_sz != sz) {
errno_t err = (r_sz == 0) ? EOF : errno;
RETURN_STATUS_UNEXPECTED(strerror(err));
}
return Status::OK();
}
Status StorageContainer::Insert(const std::vector<ReadableSlice> &buf, off64_t *offset) noexcept {
size_t sz = 0;
for (auto &v : buf) {
sz += v.GetSize();
}
if (sz == 0) {
RETURN_STATUS_UNEXPECTED("Unexpected 0 length");
}
if (sz > bs_->GetMaxSize()) {
RETURN_STATUS_UNEXPECTED("Request size too big");
}
BSpaceDescriptor bspd{0};
addr_t addr = 0;
RETURN_IF_NOT_OK(bs_->Alloc(sz, &bspd, &addr));
*offset = static_cast<off64_t>(addr);
// We will do piecewise copy of the data to disk.
for (auto &v : buf) {
RETURN_IF_NOT_OK(Write(v, addr));
addr += v.GetSize();
}
return Status::OK();
}
Status StorageContainer::Truncate() const noexcept {
if (is_open_) {
RETURN_IF_NOT_OK(cont_.TruncateFile(fd_));
MS_LOG(INFO) << "Container " << cont_ << " truncated";
}
return Status::OK();
}
StorageContainer::~StorageContainer() noexcept {
(void)Truncate();
(void)Close();
}
std::ostream &operator<<(std::ostream &os, const StorageContainer &s) {
os << "File path : " << s.cont_ << "\n" << *(s.bs_.get());
return os;
}
Status StorageContainer::CreateStorageContainer(std::shared_ptr<StorageContainer> *out_sc, const std::string &path) {
Status rc;
auto sc = new (std::nothrow) StorageContainer(path);
if (sc == nullptr) {
return Status(StatusCode::kOutOfMemory);
}
rc = sc->Create();
if (rc.IsOk()) {
(*out_sc).reset(sc);
} else {
delete sc;
}
return rc;
}
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_STORAGE_CONTAINER_H_
#define DATASET_UTIL_STORAGE_CONTAINER_H_
#include <limits.h>
#include <unistd.h>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "dataset/util/system_pool.h"
#include "dataset/util/buddy.h"
#include "dataset/util/path.h"
#include "dataset/util/slice.h"
#include "dataset/util/status.h"
namespace mindspore {
namespace dataset {
class StorageManager;
class StorageContainer {
public:
friend class StorageManager;
~StorageContainer() noexcept;
StorageContainer(const StorageContainer &) = delete;
StorageContainer &operator=(const StorageContainer &) = delete;
friend std::ostream &operator<<(std::ostream &os, const StorageContainer &s);
Status Open() noexcept;
Status Close() noexcept;
Status Insert(const std::vector<ReadableSlice> &buf, off64_t *offset) noexcept;
Status Write(const ReadableSlice &dest, off64_t offset) const noexcept;
Status Read(WritableSlice *dest, off64_t offset) const noexcept;
Status Truncate() const noexcept;
bool IsOpen() const { return is_open_; }
static Status CreateStorageContainer(std::shared_ptr<StorageContainer> *out_sc, const std::string &path);
private:
mutable std::mutex mutex_;
Path cont_;
int fd_;
bool is_open_;
std::unique_ptr<BuddySpace> bs_;
// Use the default values of BuddySpace,
// which can map up to 4G of space.
explicit StorageContainer(const std::string &path) : cont_(path), fd_(-1), is_open_(false), bs_(nullptr) {}
Status Create();
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_UTIL_STORAGE_CONTAINER_H_
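A minimal usage sketch, inside a function returning Status (the container path follows the IMG*.LB naming used by StorageManager but is illustrative here):

std::shared_ptr<StorageContainer> sc;
RETURN_IF_NOT_OK(StorageContainer::CreateStorageContainer(&sc, "/tmp/IMG00000.LB"));
char data[64] = {0};
off64_t offset = 0;
RETURN_IF_NOT_OK(sc->Insert({ReadableSlice(data, sizeof(data))}, &offset));  // offset comes from BuddySpace
WritableSlice dest(data, sizeof(data));
RETURN_IF_NOT_OK(sc->Read(&dest, offset));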
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/util/storage_manager.h"
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <utility>
#include "common/utils.h"
#include "dataset/util/path.h"
#include "dataset/util/services.h"
#include "dataset/util//de_error.h"
#include "utils/log_adapter.h"
namespace mindspore {
namespace dataset {
std::string StorageManager::GetBaseName(const std::string &prefix, int32_t file_id) {
std::ostringstream oss;
oss << prefix << std::setfill('0') << std::setw(5) << file_id;
return oss.str();
}
std::string StorageManager::ConstructFileName(const std::string &prefix, int32_t file_id, const std::string &suffix) {
std::string base_name = GetBaseName(prefix, file_id);
return (base_name + "." + suffix);
}
Status StorageManager::AddOneContainer() {
const std::string kPrefix = "IMG";
const std::string kSuffix = "LB";
Path container_name = root_ / ConstructFileName(kPrefix, file_id_, kSuffix);
std::shared_ptr<StorageContainer> sc;
RETURN_IF_NOT_OK(StorageContainer::CreateStorageContainer(&sc, container_name.toString()));
containers_.push_back(sc);
file_id_++;
return Status::OK();
}
Status StorageManager::DoServiceStart() {
containers_.reserve(1000);
if (root_.IsDirectory()) {
RETURN_IF_NOT_OK(AddOneContainer());
} else {
RETURN_STATUS_UNEXPECTED("Not a directory");
}
return Status::OK();
}
Status StorageManager::Write(key_type *key, const std::vector<ReadableSlice> &buf) {
RETURN_UNEXPECTED_IF_NULL(key);
size_t sz = 0;
for (auto &v : buf) {
sz += v.GetSize();
}
if (sz == 0) {
RETURN_STATUS_UNEXPECTED("Unexpected 0 length");
}
std::shared_ptr<StorageContainer> cont;
key_type out_key;
value_type out_value;
bool create_new_container = false;
do {
SharedLock lock_s(&rw_lock_);
size_t num_containers = containers_.size();
if (create_new_container) {
// Upgrade to exclusive lock.
lock_s.Upgrade();
create_new_container = false;
// Check again if someone has already added a
// new container after we acquired the exclusive lock.
if (containers_.size() == num_containers) {
RETURN_IF_NOT_OK(AddOneContainer());
}
// Refresh how many containers there are.
num_containers = containers_.size();
// Downgrade back to shared lock
lock_s.Downgrade();
}
if (num_containers == 0) {
RETURN_STATUS_UNEXPECTED("num_containers is zero");
}
// Go to the last container to insert.
cont = containers_.at(num_containers - 1);
off64_t offset;
Status rc = cont->Insert(buf, &offset);
if (rc.IsNoSpace()) {
create_new_container = true;
} else if (rc.IsOk()) {
out_value = std::make_pair(num_containers - 1, std::make_pair(offset, sz));
RETURN_IF_NOT_OK(index_.insert(out_value, &out_key));
*key = out_key;
break;
} else {
return rc;
}
} while (true);
return Status::OK();
}
Status StorageManager::Read(StorageManager::key_type key, WritableSlice *dest, size_t *bytesRead) const {
RETURN_UNEXPECTED_IF_NULL(dest);
auto r = index_.Search(key);
if (r.second) {
auto &it = r.first;
value_type v = *it;
int container_inx = v.first;
off_t offset = v.second.first;
size_t sz = v.second.second;
if (dest->GetSize() < sz) {
std::string errMsg = "Destination buffer too small. Expect at least " + std::to_string(sz) +
" but length = " + std::to_string(dest->GetSize());
RETURN_STATUS_UNEXPECTED(errMsg);
}
if (bytesRead != nullptr) {
*bytesRead = sz;
}
auto cont = containers_.at(container_inx);
RETURN_IF_NOT_OK(cont->Read(dest, offset));
} else {
RETURN_STATUS_UNEXPECTED("Key not found");
}
return Status::OK();
}
Status StorageManager::DoServiceStop() noexcept {
Status rc;
Status rc1;
for (auto const &p : containers_) {
// The destructor of StorageContainer is only called once the use
// count drops to 0, which is not guaranteed here, so we do it ourselves.
rc = p.get()->Truncate();
if (rc.IsError()) {
rc1 = rc;
}
}
containers_.clear();
file_id_ = 0;
return rc1;
}
StorageManager::StorageManager(const Path &root) : root_(root), file_id_(0), index_() {}
StorageManager::~StorageManager() { (void)StorageManager::DoServiceStop(); }
std::ostream &operator<<(std::ostream &os, const StorageManager &s) {
os << "Dumping all containers ..."
<< "\n";
for (auto const &p : s.containers_) {
os << *(p.get());
}
return os;
}
} // namespace dataset
} // namespace mindspore
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_UTIL_STORAGE_MANAGER_H_
#define DATASET_UTIL_STORAGE_MANAGER_H_
#include <unistd.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "dataset/util/allocator.h"
#include "dataset/util/auto_index.h"
#include "dataset/util/lock.h"
#include "dataset/util/memory_pool.h"
#include "dataset/util/path.h"
#include "dataset/util/service.h"
#include "dataset/util/slice.h"
#include "dataset/util/storage_container.h"
using ListOfContainers = std::vector<std::shared_ptr<mindspore::dataset::StorageContainer>>;
namespace mindspore {
namespace dataset {
class StorageManager : public Service {
public:
using storage_index = AutoIndexObj<std::pair<int, std::pair<off_t, size_t>>>;
using key_type = storage_index::key_type;
using value_type = storage_index::value_type;
explicit StorageManager(const Path &);
~StorageManager() override;
StorageManager(const StorageManager &) = delete;
StorageManager &operator=(const StorageManager &) = delete;
Status Write(key_type *out_key, const std::vector<ReadableSlice> &buf);
Status Read(key_type key, WritableSlice *dest, size_t *bytesRead) const;
Status DoServiceStart() override;
Status DoServiceStop() noexcept override;
friend std::ostream &operator<<(std::ostream &os, const StorageManager &s);
private:
Path root_;
ListOfContainers containers_;
int file_id_;
RWLock rw_lock_;
storage_index index_;
std::string GetBaseName(const std::string &prefix, int32_t file_id);
std::string ConstructFileName(const std::string &prefix, int32_t file_id, const std::string &suffix);
Status AddOneContainer();
};
} // namespace dataset
} // namespace mindspore
#endif // DATASET_UTIL_STORAGE_MANAGER_H_
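A minimal usage sketch, inside a function returning Status (the root directory is illustrative and must already exist, since DoServiceStart rejects non-directories):

StorageManager sm(Path("/tmp/spill"));
RETURN_IF_NOT_OK(sm.ServiceStart());
char data[64] = {0};
StorageManager::key_type key;
RETURN_IF_NOT_OK(sm.Write(&key, {ReadableSlice(data, sizeof(data))}));
size_t bytes_read = 0;
WritableSlice dest(data, sizeof(data));
RETURN_IF_NOT_OK(sm.Read(key, &dest, &bytes_read));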
......@@ -19,8 +19,10 @@
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <memory>
#include <new>
#include "./securec.h"
#include "dataset/util/allocator.h"
#include "dataset/util/memory_pool.h"
namespace mindspore {
......@@ -61,6 +63,11 @@ class SystemPool : public MemoryPool {
uint64_t get_max_size() const override { return std::numeric_limits<uint64_t>::max(); }
int PercentFree() const override { return 100; }
template <typename T>
static Allocator<T> GetAllocator() {
return Allocator<T>(std::make_shared<SystemPool>());
}
};
} // namespace dataset
} // namespace mindspore
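The new GetAllocator helper makes SystemPool pluggable into any standard container through the Allocator adaptor. A small sketch of the intended use (illustrative only; the element type is arbitrary):
// A std::vector whose allocations are routed through SystemPool.
auto alloc = SystemPool::GetAllocator<int>();
std::vector<int, Allocator<int>> values(alloc);
values.push_back(42);  // allocate()/deallocate() forward to the pool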
......
......@@ -30,6 +30,7 @@
#include "kernel/common_utils.h"
#include "kernel/oplib/oplib.h"
#include "ir/value.h"
#include "pre_activate/common/helper.h"
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
......@@ -632,7 +633,7 @@ void KernelRuntime::AssignWorkSpaceMem(int flag, const AnfNodePtr &node) {
}
}
void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
void KernelRuntime::GenLaunchArgs(const session::KernelGraph &graph, const mindspore::AnfNodePtr &kernel,
AddressPtrList *kernel_inputs, AddressPtrList *const kernel_workspaces,
AddressPtrList *kernel_outputs) {
MS_EXCEPTION_IF_NULL(kernel);
......@@ -644,9 +645,15 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
if (AnfAlgo::GetCNodeName(cnode) == kAtomicAddrCleanOpName) {
return GenAddrCleanLaunchArgs(cnode, kernel_inputs);
}
auto is_all_nop_node = opt::IsAllNopNode(&graph);
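// If all nop nodes in this graph are launched, the producer's address is taken without
// looking through nop nodes (third argument false); otherwise nop nodes are skipped to
// reach the real producer.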
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
auto real_input = AnfAlgo::GetRealInputIndex(kernel, i);
auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input);
DeviceAddressPtr device_address;
if (is_all_nop_node) {
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, real_input, false);
} else {
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, real_input, true);
}
MS_EXCEPTION_IF_NULL(device_address);
kernel::AddressPtr input = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(input);
......@@ -656,8 +663,16 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
kernel_inputs->emplace_back(input);
}
for (size_t i = 0; i < kernel_mod.GetOutputSizeList().size(); ++i) {
auto device_address = AnfAlgo::GetOutputAddr(kernel, i);
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
for (size_t i = 0; i < kernel_mod->GetOutputSizeList().size(); ++i) {
DeviceAddressPtr device_address;
if (is_all_nop_node) {
device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
} else {
device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, true);
}
MS_EXCEPTION_IF_NULL(device_address);
kernel::AddressPtr output = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(output);
output->addr = device_address->ptr_;
......@@ -666,7 +681,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
kernel_outputs->emplace_back(output);
}
for (size_t i = 0; i < kernel_mod.GetWorkspaceSizeList().size(); ++i) {
for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) {
auto device_address = AnfAlgo::GetWorkspaceAddr(kernel, i);
kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(workspace);
......@@ -721,7 +736,7 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
AddressPtrList kernel_inputs;
AddressPtrList kernel_workspaces;
AddressPtrList kernel_outputs;
GenLaunchArgs(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
GenLaunchArgs(graph, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs);
auto ret = kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (!ret) {
MS_LOG(ERROR) << "Launch kernel failed.";
......
......@@ -96,8 +96,8 @@ class KernelRuntime {
private:
void AssignStaticMemoryOutput(const session::KernelGraph *graph);
void GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const AnfNodePtr &kernel,
AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs);
void GenLaunchArgs(const session::KernelGraph &graph, const AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs);
bool LaunchKernelMod(const session::KernelGraph &graph);
void GenAddrCleanLaunchArgs(const CNodePtr &cnode, AddressPtrList *kernel_inputs);
size_t CountNodeDeviceMemorySize(const AnfNodePtr &node, size_t output_index);
......
......@@ -17,13 +17,23 @@
#ifndef MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_
#define MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_
#include <memory>
#include "ir/anf.h"
#include "optimizer/opt.h"
namespace mindspore {
namespace opt {
class Optimizer;
using OptimizerPtr = std::shared_ptr<Optimizer>;
using OptimizerWeakPtr = std::weak_ptr<Optimizer>;
using PredicateFuncType = std::function<bool(const AnfNodePtr &)>;
} // namespace opt
class OptimizerCaller {
public:
virtual AnfNodePtr operator()(const opt::OptimizerPtr &, const AnfNodePtr &) { return nullptr; }
};
using OptimizerCallerPtr = std::shared_ptr<OptimizerCaller>;
} // namespace mindspore
#endif // MINDSPORE_CCSRC_IR_OPTIMIZER_CALLER_H_
......@@ -23,6 +23,7 @@
#include "kernel/tbe/tbe_kernel_select/tbe_kernel_select.h"
#include "kernel/akg/akg_kernel_metadata.h"
#include "session/anf_runtime_algorithm.h"
#include "utils/context/ms_context.h"
namespace mindspore {
namespace kernel {
......@@ -97,6 +98,12 @@ void KernelQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel
std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->enable_graph_kernel() && IsPrimitiveCNode(kernel_node, prim::kPrimBatchMatMul)) {
kernel_type = KernelType::AKG_KERNEL;
}
switch (kernel_type) {
case KernelType::AKG_KERNEL:
AkgMetadataInfo(kernel_node, kernel_info_list);
......
......@@ -89,15 +89,28 @@ bool CSE::BuildOrderGroupAndDoReplace(const FuncGraphManagerPtr manager) const {
return changed;
}
// Ops like Print or Summary produce no real output and always appear as an input of a Depend node.
static bool HasSideEffect(const AnfNodePtr &node) {
auto prim = GetCNodePrimitive(node);
if (prim == nullptr) {
return false;
}
auto side_effect_v = prim->GetAttr(GRAPH_FLAG_SIDE_EFFECT);
if (side_effect_v != nullptr && side_effect_v->isa<BoolImm>()) {
return GetValue<bool>(side_effect_v);
}
return false;
}
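HasSideEffect consults only the primitive's GRAPH_FLAG_SIDE_EFFECT attribute, so marking an op is enough to shield its CNodes from CSE. A hypothetical illustration (the op name is made up):
// Any CNode built from this primitive will now be rejected for merging by CheckReplace below.
auto prim = std::make_shared<Primitive>("MyPrintLikeOp");
prim->set_attr(GRAPH_FLAG_SIDE_EFFECT, MakeValue(true));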
// If this returns true, do not merge the node.
bool CSE::CheckRandomEffect(const AnfNodePtr &main, const AnfNodePtr &node) const {
bool has_random_effect = false;
auto prim_main = GetCNodePrimitive(main);
auto prim_node = GetCNodePrimitive(node);
if (prim_main == prim_node) {
return false;
}
// If the nodes have random effect and were generated by different op objects (not the same object), do not merge.
if (prim_main != nullptr) {
if (prim_main == prim_node) {
return false;
}
auto effect_val = prim_main->GetAttr(GRAPH_FLAG_RANDOM_EFFECT);
if (effect_val != nullptr && effect_val->isa<BoolImm>()) {
has_random_effect = GetValue<bool>(effect_val);
......@@ -106,45 +119,58 @@ bool CSE::CheckRandomEffect(const AnfNodePtr &main, const AnfNodePtr &node) cons
return has_random_effect;
}
bool CSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) const {
bool CSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node, bool check_side_effect) const {
MS_EXCEPTION_IF_NULL(main);
MS_EXCEPTION_IF_NULL(node);
bool replace = false;
if (main->isa<ValueNode>() && node->isa<ValueNode>()) {
auto main_value = GetValueNode(main);
auto node_value = GetValueNode(node);
replace = (AbsOf(main) == AbsOf(node)) && (*main_value == *node_value);
return (AbsOf(main) == AbsOf(node)) && (*main_value == *node_value);
} else if (main->isa<CNode>() && node->isa<CNode>()) {
auto c_main = main->cast<CNodePtr>();
auto c_node = node->cast<CNodePtr>();
// Check for side effects up front: nodes with side effects must never be merged.
if (check_side_effect && HasSideEffect(main)) {
return false;
}
const auto &inp1 = c_main->inputs();
const auto &inp2 = c_node->inputs();
if (inp1.size() == inp2.size()) {
bool appsame = true;
for (size_t j = 0; j < inp1.size(); j++) {
MS_EXCEPTION_IF_NULL(inp1[j]);
MS_EXCEPTION_IF_NULL(inp2[j]);
if (!(*inp1[j] == *inp2[j])) {
// Handle the case of two different Tensor, but with the same value
if (IsValueNode<tensor::Tensor>(inp1[j]) && IsValueNode<tensor::Tensor>(inp2[j])) {
auto tensor1 = GetValueNode<tensor::TensorPtr>(inp1[j]);
auto tensor2 = GetValueNode<tensor::TensorPtr>(inp2[j]);
if (tensor1->ValueEqual(*tensor2)) {
continue;
}
if (inp1.size() != inp2.size()) {
return false;
}
for (size_t j = 0; j < inp1.size(); j++) {
auto inp1_j = inp1[j];
auto inp2_j = inp2[j];
MS_EXCEPTION_IF_NULL(inp1_j);
MS_EXCEPTION_IF_NULL(inp2_j);
if (!(*inp1_j == *inp2_j)) {
// Handle the case of two different Tensors that hold the same value
if (IsValueNode<tensor::Tensor>(inp1_j) && IsValueNode<tensor::Tensor>(inp2_j)) {
auto tensor1 = GetValueNode<tensor::TensorPtr>(inp1_j);
auto tensor2 = GetValueNode<tensor::TensorPtr>(inp2_j);
if (tensor1->ValueEqual(*tensor2)) {
continue;
}
} else if (HasSideEffect(inp1_j) && HasSideEffect(inp2_j)) {
// When the same side-effect node appears as an input of both candidate nodes, we can
// still merge them: such a node can only be an input of a Depend, so when the Depend
// itself is duplicated we merge the duplicates.
if (CheckReplace(inp1_j, inp2_j, false)) {
continue;
}
appsame = false;
break;
}
return false;
}
if (CheckRandomEffect(c_main, c_node)) {
appsame = false;
}
replace = appsame;
}
// All inputs match; still refuse to merge if the nodes carry random effect.
if (CheckRandomEffect(c_main, c_node)) {
return false;
}
return true;
}
return replace;
// Anything else (e.g. a Parameter node) is not merged.
return false;
}
bool CSE::DoReplace(const FuncGraphManagerPtr manager, const std::vector<std::size_t> &order_group,
......
......@@ -41,7 +41,7 @@ class CSE {
return chg && report_changes_;
}
virtual bool CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) const;
virtual bool CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node, bool check_side_effect = true) const;
virtual bool CheckRandomEffect(const AnfNodePtr &main, const AnfNodePtr &node) const;
......
......@@ -14,140 +14,154 @@
* limitations under the License.
*/
#include "optimizer/irpass.h"
#include <string>
#include "optimizer/irpass/symbol_resolver.h"
#include "optimizer/irpass.h"
#include "optimizer/irpass/arithmetic_simplify.h"
#include "optimizer/irpass/special_op_eliminate.h"
#include "optimizer/irpass/item_tuple_eliminate.h"
#include "optimizer/irpass/env_item_eliminate.h"
#include "optimizer/irpass/tile_eliminate.h"
#include "optimizer/irpass/cast_eliminate.h"
#include "optimizer/irpass/reshape_eliminate.h"
#include "optimizer/irpass/transpose_eliminate.h"
#include "optimizer/irpass/reduce_eliminate.h"
#include "optimizer/irpass/partial_eliminate.h"
#include "optimizer/irpass/ref_eliminate.h"
#include "optimizer/irpass/merge_addn.h"
#include "optimizer/irpass/branch_culling.h"
#include "optimizer/irpass/cast_eliminate.h"
#include "optimizer/irpass/convert.h"
#include "optimizer/irpass/env_item_eliminate.h"
#include "optimizer/irpass/grad_var_prepare.h"
#include "optimizer/irpass/gradient_eliminate.h"
#include "optimizer/irpass/minmax_grad.h"
#include "optimizer/irpass/inline.h"
#include "optimizer/irpass/convert.h"
#include "optimizer/irpass/specialize_transform.h"
#include "optimizer/irpass/incorporate_getitem.h"
#include "optimizer/irpass/incorporate_call.h"
#include "optimizer/irpass/grad_var_prepare.h"
#include "optimizer/irpass/param_replace.h"
#include "optimizer/irpass/incorporate_getitem.h"
#include "optimizer/irpass/item_tuple_eliminate.h"
#include "optimizer/irpass/mark_interface_fusion.h"
#include "optimizer/irpass/merge_addn.h"
#include "optimizer/irpass/minmax_grad.h"
#include "optimizer/irpass/param_replace.h"
#include "optimizer/irpass/partial_eliminate.h"
#include "optimizer/irpass/reduce_eliminate.h"
#include "optimizer/irpass/ref_eliminate.h"
#include "optimizer/irpass/reshape_eliminate.h"
#include "optimizer/irpass/special_op_eliminate.h"
#include "optimizer/irpass/specialize_transform.h"
#include "optimizer/irpass/symbol_resolver.h"
#include "optimizer/irpass/tile_eliminate.h"
#include "optimizer/irpass/transpose_eliminate.h"
#include "optimizer/opt.h"
namespace mindspore {
namespace opt {
namespace irpass {
OptimizeIRPassLib::OptimizeIRPassLib() {
arithmetic_simplify_ = MakeSubstitution(ArithmeticSimplify(), "arithmetic_simplify",
arithmetic_simplify_ = MakeSubstitution(std::make_shared<ArithmeticSimplify>(), "arithmetic_simplify",
{prim::kPrimScalarAdd, prim::kPrimScalarMul, prim::kPrimTensorAdd,
prim::kPrimIdentity, prim::kPrimMomentum, prim::kPrimMul, prim::kPrimPow});
arithmetic_simplify2_ = MakeSubstitution(ArithmeticSimplify2(), "arithmetic_simplify2", {prim::kPrimMul});
arithmetic_simplify2_ =
MakeSubstitution(std::make_shared<ArithmeticSimplify2>(), "arithmetic_simplify2", {prim::kPrimMul});
special_op_eliminate_ =
MakeSubstitution(SpecialOpEliminater(), "special_op_eliminate",
MakeSubstitution(std::make_shared<SpecialOpEliminater>(), "special_op_eliminate",
{prim::kPrimInsertGradientOf, prim::kPrimStopGradient, prim::kPrimHookBackward,
prim::kPrimPrintShapeType, prim::kPrimGetRefKey, prim::kPrimMirror, prim::kPrimVirtualDiv});
zero_like_fill_zero_ = MakeSubstitution(ZeroLikeFillZero(), "zero_like_fill_zero", prim::kPrimZerosLike);
adjust_all_reduce_mul_add_ = MakeSubstitution(AdjustAllReduceMulAdd(), "adjust_all_reduce_mul_add", prim::kPrimAddN);
zero_like_fill_zero_ =
MakeSubstitution(std::make_shared<ZeroLikeFillZero>(), "zero_like_fill_zero", prim::kPrimZerosLike);
adjust_all_reduce_mul_add_ =
MakeSubstitution(std::make_shared<AdjustAllReduceMulAdd>(), "adjust_all_reduce_mul_add", prim::kPrimAddN);
// ops eliminate
item_tuple_eliminate_ =
MakeSubstitution(ItemTupleEliminater(), "item_tuple_eliminate", {prim::kPrimTupleGetItem, prim::kPrimTupleSetItem});
tile_eliminate_ = MakeSubstitution(TileMultiplyByOne(), "tile_eliminate", prim::kPrimTile);
cast_eliminate_ = MakeSubstitution(CastEliminater(), "cast_eliminate", prim::kPrimCast);
reshape_eliminate_ = MakeSubstitution(ReshapeEliminater(), "reshape_eliminate", prim::kPrimReshape);
transpose_eliminate_ = MakeSubstitution(TransposeSameIOEliminater(), "transpose_eliminate", prim::kPrimTranspose);
item_tuple_eliminate_ = MakeSubstitution(std::make_shared<ItemTupleEliminater>(), "item_tuple_eliminate",
{prim::kPrimTupleGetItem, prim::kPrimTupleSetItem});
tile_eliminate_ = MakeSubstitution(std::make_shared<TileMultiplyByOne>(), "tile_eliminate", prim::kPrimTile);
cast_eliminate_ = MakeSubstitution(std::make_shared<CastEliminater>(), "cast_eliminate", prim::kPrimCast);
reshape_eliminate_ = MakeSubstitution(std::make_shared<ReshapeEliminater>(), "reshape_eliminate", prim::kPrimReshape);
transpose_eliminate_ =
MakeSubstitution(std::make_shared<TransposeSameIOEliminater>(), "transpose_eliminate", prim::kPrimTranspose);
reduce_eliminate_ = MakeSubstitution(
ReduceOneEliminater(), "reduce_eliminate",
std::make_shared<ReduceOneEliminater>(), "reduce_eliminate",
{prim::kPrimReduceMean, prim::kPrimReduceAll, prim::kPrimReduceSum, prim::kPrimReduceMax, prim::kPrimReduceMin});
partial_eliminate_ = MakeSubstitution(PartialEliminater(), "partial_eliminate", IsCNodeDup);
same_eliminate_ = MakeSubstitution(SameEliminater(), "same_eliminate", prim::kPrimSameTypeShape);
check_bprop_eliminate_ = MakeSubstitution(CheckBpropEliminater(), "check_bprop_eliminate", prim::kPrimCheckBprop);
reset_defer_inline_ = MakeSubstitution(ResetDeferInline(), "reset_defer_inline", IsValueNode<FuncGraph>);
depend_value_elim_ = MakeSubstitution(DependValueElim(), "depend_value_elim", prim::kPrimDepend);
partial_eliminate_ = MakeSubstitution(std::make_shared<PartialEliminater>(), "partial_eliminate", IsCNodeDup);
same_eliminate_ = MakeSubstitution(std::make_shared<SameEliminater>(), "same_eliminate", prim::kPrimSameTypeShape);
check_bprop_eliminate_ =
MakeSubstitution(std::make_shared<CheckBpropEliminater>(), "check_bprop_eliminate", prim::kPrimCheckBprop);
reset_defer_inline_ =
MakeSubstitution(std::make_shared<ResetDeferInline>(), "reset_defer_inline", IsValueNode<FuncGraph>);
depend_value_elim_ = MakeSubstitution(std::make_shared<DependValueElim>(), "depend_value_elim", prim::kPrimDepend);
// Env Item Eliminate
env_get_item_eliminate_ = MakeSubstitution(EnvGetItemEliminater(), "env_get_item_eliminate", prim::kPrimEnvGetItem);
new_env_get_item_ = MakeSubstitution(NewEnvGetItem(), "new_env_get_item", prim::kPrimEnvGetItem);
env_get_item_eliminate_ =
MakeSubstitution(std::make_shared<EnvGetItemEliminater>(), "env_get_item_eliminate", prim::kPrimEnvGetItem);
new_env_get_item_ = MakeSubstitution(std::make_shared<NewEnvGetItem>(), "new_env_get_item", prim::kPrimEnvGetItem);
incorporate_env_getitem_ =
MakeSubstitution(IncorporateEnvGetitem(), "incorporate_env_get_item", prim::kPrimEnvGetItem);
incorporate_env_getitem_switch_ =
MakeSubstitution(IncorporateEnvGetitemSwitch(), "incorporate_env_getitem_switch", prim::kPrimEnvGetItem);
MakeSubstitution(std::make_shared<IncorporateEnvGetitem>(), "incorporate_env_get_item", prim::kPrimEnvGetItem);
incorporate_env_getitem_switch_ = MakeSubstitution(std::make_shared<IncorporateEnvGetitemSwitch>(),
"incorporate_env_getitem_switch", prim::kPrimEnvGetItem);
// Ref eliminate
make_ref_eliminate_ = MakeSubstitution(MakeRefEliminater(), "make_ref_eliminate", prim::kPrimMakeRef);
get_ref_param_eliminate_ = MakeSubstitution(GetRefParamEliminater(), "get_ref_param_eliminate",
make_ref_eliminate_ =
MakeSubstitution(std::make_shared<MakeRefEliminater>(), "make_ref_eliminate", prim::kPrimMakeRef);
get_ref_param_eliminate_ = MakeSubstitution(std::make_shared<GetRefParamEliminater>(), "get_ref_param_eliminate",
{prim::kPrimGetRefValue, prim::kPrimGetRefOrigin});
get_make_ref_eliminate_ = MakeSubstitution(GetMakeRefEliminater(), "get_make_ref_eliminate",
get_make_ref_eliminate_ = MakeSubstitution(std::make_shared<GetMakeRefEliminater>(), "get_make_ref_eliminate",
{prim::kPrimGetRefKey, prim::kPrimGetRefValue, prim::kPrimGetRefOrigin});
replace_refkey_by_param_ =
MakeSubstitution(ReplaceRefkeyByParam(), "replace_refkey_by_param", IsValueNode<RefKey>, opt::FORCE_RENORM);
replace_old_param_ = MakeSubstitution(ReplaceOldParam(), "replace_old_param", IsParam);
replace_refkey_by_param_ = MakeSubstitution(std::make_shared<ReplaceRefkeyByParam>(), "replace_refkey_by_param",
IsValueNode<RefKey>, opt::FORCE_RENORM);
replace_old_param_ = MakeSubstitution(std::make_shared<ReplaceOldParam>(), "replace_old_param", IsParam);
// Gradient transforms
expand_jprim_ = MakeSubstitution(ExpandJPrim(), "expand_jprim", prim::kPrimJ);
minmaximum_grad_ = MakeSubstitution(MinMaximumGrad(), "minmaximum_grad", prim::kPrimTupleGetItem);
expand_jprim_ = MakeSubstitution(std::make_shared<ExpandJPrim>(), "expand_jprim", prim::kPrimJ);
minmaximum_grad_ = MakeSubstitution(std::make_shared<MinMaximumGrad>(), "minmaximum_grad", prim::kPrimTupleGetItem);
// branch culling
switch_simplify_ = MakeSubstitution(SwitchSimplify(), "switch_simplify", prim::kPrimSwitch);
float_tuple_getitem_switch_ =
MakeSubstitution(FloatTupleGetItemSwitch(), "float_tuple_getitem_switch", prim::kPrimTupleGetItem);
switch_simplify_ = MakeSubstitution(std::make_shared<SwitchSimplify>(), "switch_simplify", prim::kPrimSwitch);
float_tuple_getitem_switch_ = MakeSubstitution(std::make_shared<FloatTupleGetItemSwitch>(),
"float_tuple_getitem_switch", prim::kPrimTupleGetItem);
float_env_getitem_switch_ =
MakeSubstitution(FloatEnvGetItemSwitch(), "float_env_getitem_switch", prim::kPrimEnvGetItem);
convert_switch_replacement_ = MakeSubstitution(ConvertSwitchReplacement(), "convert_switch_replacement", IsCNodeDup);
MakeSubstitution(std::make_shared<FloatEnvGetItemSwitch>(), "float_env_getitem_switch", prim::kPrimEnvGetItem);
convert_switch_replacement_ =
MakeSubstitution(std::make_shared<ConvertSwitchReplacement>(), "convert_switch_replacement", IsCNodeDup);
// Addn
merge_addn_ = MakeSubstitution(MergeAddN(), "merge_addn", prim::kPrimAddN);
addn_zero_filter_ = MakeSubstitution(AddNZeroFilter(), "addn_zero_filter", prim::kPrimAddN);
merge_addn_ = MakeSubstitution(std::make_shared<MergeAddN>(), "merge_addn", prim::kPrimAddN);
addn_zero_filter_ = MakeSubstitution(std::make_shared<AddNZeroFilter>(), "addn_zero_filter", prim::kPrimAddN);
// inline
inline_ = MakeSubstitution(Inliner(), "inline", IsCNodeGraph);
replace_applicator_ = MakeSubstitution(ReplaceApplicator(), "replace_applicator", IsValueNode<FuncGraph>);
specialize_transform_ = MakeSubstitution(SpecializeOnGraphArguments(), "specialize_transform", IsCNodeGraph);
inline_ = MakeSubstitution(std::make_shared<Inliner>(), "inline", IsCNodeGraph);
replace_applicator_ =
MakeSubstitution(std::make_shared<ReplaceApplicator>(), "replace_applicator", IsValueNode<FuncGraph>);
specialize_transform_ =
MakeSubstitution(std::make_shared<SpecializeOnGraphArguments>(), "specialize_transform", IsCNodeGraph);
// Incorporation
incorporate_getitem_set_ =
MakeSubstitution(IncorporateGetitemSet(), "incorporate_getitem_set", prim::kPrimTupleGetItem);
incorporate_getitem_from_param_ =
MakeSubstitution(IncorporateGetitemFromParam(), "incorporate_getitem_from_param", IsCNodeGraphKernel);
incorporate_call_ = MakeSubstitution(IncorporateCall(), "incorporate_call", IsCNodeDup);
incorporate_call_switch_ = MakeSubstitution(IncorporateCallSwitch(), "incorporate_call_switch", IsCNodeDup);
MakeSubstitution(std::make_shared<IncorporateGetitemSet>(), "incorporate_getitem_set", prim::kPrimTupleGetItem);
incorporate_getitem_from_param_ = MakeSubstitution(std::make_shared<IncorporateGetitemFromParam>(),
"incorporate_getitem_from_param", IsCNodeGraphKernel);
incorporate_call_ = MakeSubstitution(std::make_shared<IncorporateCall>(), "incorporate_call", IsCNodeDup);
incorporate_call_switch_ =
MakeSubstitution(std::make_shared<IncorporateCallSwitch>(), "incorporate_call_switch", IsCNodeDup);
// Virtual Dataset
virtual_dataset_eliminate_ =
MakeSubstitution(VirtualDatasetEliminater(), "virtual_dataset_eliminate", prim::kPrimVirtualDataset);
virtual_dataset_eliminate_ = MakeSubstitution(std::make_shared<VirtualDatasetEliminater>(),
"virtual_dataset_eliminate", prim::kPrimVirtualDataset);
// Convert
print_tuple_wrapper_ = MakeSubstitution(PrintTupleWrapper(), "print_tuple_wrapper", prim::kPrimPrint);
print_tuple_wrapper_ =
MakeSubstitution(std::make_shared<PrintTupleWrapper>(), "print_tuple_wrapper", prim::kPrimPrint);
// Unused parameter eliminate
unused_parameter_eliminate_ =
MakeSubstitution(UnusedParasEliminater(), "unused_parameter_eliminate", IsCNodeGraphKernel);
unused_output_eliminate_ = MakeSubstitution(UnusedOutputEliminater(), "unused_output_eliminate", IsCNodeGraphKernel);
MakeSubstitution(std::make_shared<UnusedParasEliminater>(), "unused_parameter_eliminate", IsCNodeGraphKernel);
unused_output_eliminate_ =
MakeSubstitution(std::make_shared<UnusedOutputEliminater>(), "unused_output_eliminate", IsCNodeGraphKernel);
// AddN eliminate
addn_eliminate_ = MakeSubstitution(AddNEliminater(), "addn_eliminate", IsCNodeGraphKernel);
addn_eliminate_ = MakeSubstitution(std::make_shared<AddNEliminater>(), "addn_eliminate", IsCNodeGraphKernel);
// Mark interface fusion
mark_interface_fusion_ = MakeSubstitution(MarkInterfaceFusion(), "mark_interface_fusion", prim::kPrimSelect);
mark_interface_fusion_ =
MakeSubstitution(std::make_shared<MarkInterfaceFusion>(), "mark_interface_fusion", prim::kPrimSelect);
}
ResolveIRPassLib::ResolveIRPassLib() {
resolver_resolve_ = MakeSubstitution(ResolverResolve(), "resolver_resolve", prim::kPrimResolve);
resolver_getattr_ = MakeSubstitution(ResolverGetattr(), "resolver_getattr", prim::kPrimGetAttr);
resolver_resolve_ = MakeSubstitution(std::make_shared<ResolverResolve>(), "resolver_resolve", prim::kPrimResolve);
resolver_getattr_ = MakeSubstitution(std::make_shared<ResolverGetattr>(), "resolver_getattr", prim::kPrimGetAttr);
}
InferenceOptPrepareLib::InferenceOptPrepareLib() {
grad_var_prepare_ = MakeSubstitution(GradVarPrepare(), "grad_var_prepare", IsCNode);
grad_var_prepare_ = MakeSubstitution(std::make_shared<GradVarPrepare>(), "grad_var_prepare", IsCNode);
}
} // namespace irpass
} // namespace opt
......
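With this change every pass is an OptimizerCaller held by shared_ptr rather than a bare functor, so adding a pass follows one uniform recipe. A hypothetical skeleton (class name, member, and target primitive are made up):
class DummyAddNEliminater : public OptimizerCaller {
 public:
  AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override {
    // Return the replacement node, or nullptr to leave `node` untouched.
    return nullptr;
  }
};
// Registered exactly like the substitutions above:
//   dummy_ = MakeSubstitution(std::make_shared<DummyAddNEliminater>(), "dummy_addn_eliminate", prim::kPrimAddN);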
......@@ -17,15 +17,16 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ARITHMETIC_SIMPLIFY_H_
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ARITHMETIC_SIMPLIFY_H_
#include <vector>
#include <memory>
#include <algorithm>
#include <memory>
#include <vector>
#include "optimizer/optimizer.h"
#include "optimizer/irpass.h"
#include "optimizer/irpass/prim_eliminate.h"
#include "ir/optimizer_caller.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "optimizer/irpass.h"
#include "optimizer/irpass/prim_eliminate.h"
#include "optimizer/optimizer.h"
namespace mindspore {
namespace opt {
......@@ -739,17 +740,17 @@ class AdjustAllReduceMulAdd : public AnfVisitor {
FuncGraphPtr all_reduce_fg_{nullptr};
};
class ArithmeticSimplify {
class ArithmeticSimplify : public OptimizerCaller {
public:
ArithmeticSimplify()
: multiply_by_zero_or_one_(),
tensor_multiply_by_one_(),
add_by_zero_(),
tensor_add_by_zero_(),
identity_(prim::kPrimIdentity),
opt_update_zero_tensor_(),
constant_duplicate_mul_(),
power_one_() {
: multiply_by_zero_or_one_(std::make_shared<MultiplyByZeroOrOne>()),
tensor_multiply_by_one_(std::make_shared<TensorMultiplyByOne>()),
add_by_zero_(std::make_shared<AddByZero>()),
tensor_add_by_zero_(std::make_shared<TensorAddByZero>()),
identity_(std::make_shared<PrimEliminater>(prim::kPrimIdentity)),
opt_update_zero_tensor_(std::make_shared<OptUpdateZeroTensor>()),
constant_duplicate_mul_(std::make_shared<ConstantDuplicateMul>()),
power_one_(std::make_shared<PowerOneEliminate>()) {
eliminaters_.emplace_back(multiply_by_zero_or_one_);
eliminaters_.emplace_back(tensor_multiply_by_one_);
eliminaters_.emplace_back(add_by_zero_);
......@@ -761,10 +762,10 @@ class ArithmeticSimplify {
}
~ArithmeticSimplify() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -773,15 +774,9 @@ class ArithmeticSimplify {
}
private:
MultiplyByZeroOrOne multiply_by_zero_or_one_;
TensorMultiplyByOne tensor_multiply_by_one_;
AddByZero add_by_zero_;
TensorAddByZero tensor_add_by_zero_;
PrimEliminater identity_;
OptUpdateZeroTensor opt_update_zero_tensor_;
ConstantDuplicateMul constant_duplicate_mul_;
PowerOneEliminate power_one_;
std::vector<TransformFuncType> eliminaters_{};
OptimizerCallerPtr multiply_by_zero_or_one_, tensor_multiply_by_one_, add_by_zero_, tensor_add_by_zero_, identity_,
opt_update_zero_tensor_, constant_duplicate_mul_, power_one_;
std::vector<OptimizerCallerPtr> eliminaters_{};
};
// Arithmetic Simplifications should be done after step_parallel.
......@@ -789,15 +784,17 @@ class ArithmeticSimplify {
// with shape(weight), but after step_parallel, shape of weight may be changed, so the
// shape of the constant tensor should also be changed. So this pass is separated from
// ArithmeticSimplify and deferred until step_parallel.
class ArithmeticSimplify2 {
class ArithmeticSimplify2 : public OptimizerCaller {
public:
ArithmeticSimplify2() : tensor_multiply_by_zero_() { eliminaters_.emplace_back(tensor_multiply_by_zero_); }
ArithmeticSimplify2() : tensor_multiply_by_zero_(std::make_shared<TensorMultiplyByZero>()) {
eliminaters_.emplace_back(tensor_multiply_by_zero_);
}
~ArithmeticSimplify2() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -806,8 +803,8 @@ class ArithmeticSimplify2 {
}
private:
TensorMultiplyByZero tensor_multiply_by_zero_;
std::vector<TransformFuncType> eliminaters_{};
OptimizerCallerPtr tensor_multiply_by_zero_;
std::vector<OptimizerCallerPtr> eliminaters_{};
};
} // namespace irpass
} // namespace opt
......
......@@ -17,9 +17,9 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_CAST_ELIMINATE_H_
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_CAST_ELIMINATE_H_
#include "ir/visitor.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "ir/visitor.h"
namespace mindspore {
namespace opt {
......@@ -52,12 +52,12 @@ class TwoCastEliminater : public AnfVisitor {
AnfNodePtr x_{nullptr}, t_{nullptr};
};
class CastEliminater {
class CastEliminater : public OptimizerCaller {
public:
CastEliminater() : cast_same_type_eliminater_(), two_cast_eliminater_() {}
~CastEliminater() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
auto new_node = cast_same_type_eliminater_(optimizer, node);
if (new_node != nullptr) {
return new_node;
......
......@@ -17,18 +17,19 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ENV_ITEM_ELIMINATE_H_
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ENV_ITEM_ELIMINATE_H_
#include <vector>
#include <utility>
#include <algorithm>
#include <unordered_map>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "ir/visitor.h"
#include "ir/func_graph.h"
#include "ir/func_graph_cloner.h"
#include "ir/optimizer_caller.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "utils/symbolic.h"
namespace mindspore {
......@@ -225,19 +226,22 @@ class EnvGetSetItem : public AnfVisitor {
bool is_match_{false};
};
class EnvGetItemEliminater {
class EnvGetItemEliminater : public OptimizerCaller {
public:
EnvGetItemEliminater() : new_env_get_item_(), add_env_get_item_(), env_get_set_item_() {
EnvGetItemEliminater()
: new_env_get_item_(std::make_shared<NewEnvGetItem>()),
add_env_get_item_(std::make_shared<AddEnvGetItem>()),
env_get_set_item_(std::make_shared<EnvGetSetItem>()) {
eliminaters_.emplace_back(new_env_get_item_);
eliminaters_.emplace_back(add_env_get_item_);
eliminaters_.emplace_back(env_get_set_item_);
}
~EnvGetItemEliminater() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -246,10 +250,8 @@ class EnvGetItemEliminater {
}
private:
NewEnvGetItem new_env_get_item_;
AddEnvGetItem add_env_get_item_;
EnvGetSetItem env_get_set_item_;
std::vector<TransformFuncType> eliminaters_{};
OptimizerCallerPtr new_env_get_item_, add_env_get_item_, env_get_set_item_;
std::vector<OptimizerCallerPtr> eliminaters_{};
};
// {prim::kPrimEnvGetItem, {G, Xs}, C, Y}
......
......@@ -17,18 +17,20 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_INCORPORATE_GETITEM_H_
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_INCORPORATE_GETITEM_H_
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "ir/visitor.h"
#include "ir/func_graph.h"
#include "ir/func_graph_cloner.h"
#include "ir/optimizer_caller.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
namespace mindspore {
namespace opt {
namespace irpass {
......@@ -383,18 +385,20 @@ class IncorporateGetitemSwitch : public AnfVisitor {
internal::GetitemTransform getitem_transform_;
};
class IncorporateGetitemSet {
class IncorporateGetitemSet : public OptimizerCaller {
public:
IncorporateGetitemSet() : incorporate_getitem_(), incorporate_getitem_switch_() {
IncorporateGetitemSet()
: incorporate_getitem_(std::make_shared<IncorporateGetitem>()),
incorporate_getitem_switch_(std::make_shared<IncorporateGetitemSwitch>()) {
eliminaters_.emplace_back(incorporate_getitem_);
eliminaters_.emplace_back(incorporate_getitem_switch_);
}
~IncorporateGetitemSet() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -403,9 +407,8 @@ class IncorporateGetitemSet {
}
private:
IncorporateGetitem incorporate_getitem_;
IncorporateGetitemSwitch incorporate_getitem_switch_;
std::vector<TransformFuncType> eliminaters_{};
OptimizerCallerPtr incorporate_getitem_, incorporate_getitem_switch_;
std::vector<OptimizerCallerPtr> eliminaters_{};
};
} // namespace irpass
} // namespace opt
......
......@@ -17,13 +17,15 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ITEM_TUPLE_ELIMINATE_H_
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_ITEM_TUPLE_ELIMINATE_H_
#include <vector>
#include <algorithm>
#include <memory>
#include <vector>
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "ir/optimizer_caller.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
namespace mindspore {
namespace opt {
......@@ -261,14 +263,14 @@ class GetitemDependReorder : public AnfVisitor {
AnfNodePtr x_{nullptr}, y_{nullptr}, c_{nullptr};
};
class ItemTupleEliminater {
class ItemTupleEliminater : public OptimizerCaller {
public:
ItemTupleEliminater()
: get_item_eliminater_(),
get_item_const_eliminater_(),
set_item_eliminater_(),
get_set_item_eliminater_(),
get_item_depend_reorder_() {
: get_item_eliminater_(std::make_shared<GetitemEliminater>()),
get_item_const_eliminater_(std::make_shared<GetitemConstEliminater>()),
set_item_eliminater_(std::make_shared<SetitemEliminater>()),
get_set_item_eliminater_(std::make_shared<GetSetitemEliminater>()),
get_item_depend_reorder_(std::make_shared<GetitemDependReorder>()) {
eliminaters_.emplace_back(get_item_eliminater_);
eliminaters_.emplace_back(get_item_const_eliminater_);
eliminaters_.emplace_back(set_item_eliminater_);
......@@ -277,10 +279,10 @@ class ItemTupleEliminater {
}
~ItemTupleEliminater() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -289,12 +291,9 @@ class ItemTupleEliminater {
}
private:
GetitemEliminater get_item_eliminater_;
GetitemConstEliminater get_item_const_eliminater_;
SetitemEliminater set_item_eliminater_;
GetSetitemEliminater get_set_item_eliminater_;
GetitemDependReorder get_item_depend_reorder_;
std::vector<TransformFuncType> eliminaters_{};
OptimizerCallerPtr get_item_eliminater_, get_item_const_eliminater_, set_item_eliminater_, get_set_item_eliminater_,
get_item_depend_reorder_;
std::vector<OptimizerCallerPtr> eliminaters_{};
};
} // namespace irpass
} // namespace opt
......
......@@ -19,9 +19,9 @@
#include <memory>
#include "optimizer/optimizer.h"
#include "optimizer/irpass.h"
#include "ir/pattern_matcher.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
namespace mindspore {
namespace opt {
......
......@@ -19,11 +19,12 @@
#include <vector>
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "ir/visitor.h"
#include "ir/func_graph.h"
#include "ir/optimizer_caller.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "optimizer/irpass.h"
#include "optimizer/optimizer.h"
#include "pipeline/static_analysis/dshape.h"
namespace mindspore {
......@@ -124,12 +125,12 @@ class TwoReshapeEliminater : public AnfVisitor {
AnfNodePtr x_{nullptr}, shape_{nullptr};
};
class ReshapeEliminater {
class ReshapeEliminater : public OptimizerCaller {
public:
ReshapeEliminater() : reshape_same_shape_eliminater_(), two_reshape_eliminater_() {}
~ReshapeEliminater() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
auto new_node = reshape_same_shape_eliminater_(optimizer, node);
if (new_node != nullptr) {
return new_node;
......
......@@ -18,31 +18,31 @@
#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_SPECIAL_OP_ELIMINATE_H_
#include <securec.h>
#include <vector>
#include <memory>
#include <algorithm>
#include <memory>
#include <vector>
#include "optimizer/optimizer.h"
#include "optimizer/irpass.h"
#include "ir/optimizer_caller.h"
#include "optimizer/irpass/prim_eliminate.h"
#include "ir/pattern_matcher.h"
#include "ir/visitor.h"
#include "operator/ops.h"
#include "ir/pattern_matcher.h"
#include "optimizer/irpass.h"
#include "optimizer/irpass/prim_eliminate.h"
#include "optimizer/optimizer.h"
namespace mindspore {
namespace opt {
namespace irpass {
class SpecialOpEliminater {
class SpecialOpEliminater : public OptimizerCaller {
public:
SpecialOpEliminater()
: insert_gradient_of_(prim::kPrimInsertGradientOf),
stop_gradient_(prim::kPrimStopGradient),
hook_backward_(prim::kPrimHookBackward),
print_shape_type_(prim::kPrimPrintShapeType),
get_ref_value_(prim::kPrimGetRefValue),
mirror_(prim::kPrimMirror),
virtual_div_(prim::kPrimVirtualDiv) {
: insert_gradient_of_(std::make_shared<PrimEliminater>(prim::kPrimInsertGradientOf)),
stop_gradient_(std::make_shared<PrimEliminater>(prim::kPrimStopGradient)),
hook_backward_(std::make_shared<PrimEliminater>(prim::kPrimHookBackward)),
print_shape_type_(std::make_shared<PrimEliminater>(prim::kPrimPrintShapeType)),
get_ref_value_(std::make_shared<PrimEliminater>(prim::kPrimGetRefValue)),
mirror_(std::make_shared<PrimEliminater>(prim::kPrimMirror)),
virtual_div_(std::make_shared<PrimEliminater>(prim::kPrimVirtualDiv)) {
eliminaters_.emplace_back(insert_gradient_of_);
eliminaters_.emplace_back(stop_gradient_);
eliminaters_.emplace_back(hook_backward_);
......@@ -53,10 +53,10 @@ class SpecialOpEliminater {
}
~SpecialOpEliminater() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) override {
AnfNodePtr new_node;
for (auto &eliminater : eliminaters_) {
new_node = eliminater(optimizer, node);
new_node = (*eliminater)(optimizer, node);
if (new_node != nullptr) {
return new_node;
}
......@@ -65,9 +65,9 @@ class SpecialOpEliminater {
}
private:
PrimEliminater insert_gradient_of_, stop_gradient_, hook_backward_, print_shape_type_, get_ref_value_, mirror_,
OptimizerCallerPtr insert_gradient_of_, stop_gradient_, hook_backward_, print_shape_type_, get_ref_value_, mirror_,
virtual_div_;
std::vector<TransformFuncType> eliminaters_{};
std::vector<OptimizerCallerPtr> eliminaters_{};
};
// {PrimVirtualDataset, X} -> X
......
......@@ -16,28 +16,27 @@
#include "optimizer/opt.h"
#include <algorithm>
#include <deque>
#include <memory>
#include <unordered_set>
#include <deque>
#include <algorithm>
#include "ir/anf.h"
#include "ir/manager.h"
#include "utils/ordered_set.h"
#include "utils/log_adapter.h"
#include "optimizer/optimizer.h"
#include "utils/log_adapter.h"
#include "utils/ordered_set.h"
namespace mindspore {
/* namespace to support opt */
namespace opt {
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name, const PrimitivePtr &prim,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name, const PrimitivePtr &prim,
const RenormAction &renorm_action) {
auto fn = [prim](const AnfNodePtr &node) -> bool { return IsPrimitiveCNode(node, prim); };
return std::make_shared<Substitution>(transform, name, fn, renorm_action);
}
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
const std::vector<PrimitivePtr> &prims, const RenormAction &renorm_action) {
auto fn = [prims](const AnfNodePtr &node) -> bool {
if (!node->isa<CNode>()) {
......@@ -64,16 +63,16 @@ SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::
return std::make_shared<Substitution>(transform, name, fn, renorm_action);
}
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
const PredicateFuncType &predicate, const RenormAction &renorm_action) {
return std::make_shared<Substitution>(transform, name, predicate, renorm_action);
}
AnfNodePtr Substitution::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) const {
AnfNodePtr Substitution::operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) {
#ifdef ENABLE_PROFILE
double t = GetTime();
#endif
AnfNodePtr result = transform_(optimizer, node);
AnfNodePtr result = (*transform_)(optimizer, node);
#ifdef ENABLE_PROFILE
if (optimizer != nullptr) {
auto time = GetTime();
......
......@@ -17,24 +17,18 @@
#ifndef MINDSPORE_CCSRC_OPTIMIZER_OPT_H_
#define MINDSPORE_CCSRC_OPTIMIZER_OPT_H_
#include <vector>
#include <string>
#include <memory>
#include <string>
#include <vector>
#include "ir/anf.h"
#include "ir/func_graph.h"
#include "ir/optimizer_caller.h"
#include "operator/ops.h"
namespace mindspore {
/* namespace to support opt */
namespace opt {
class Optimizer;
using OptimizerPtr = std::shared_ptr<Optimizer>;
using OptimizerWeakPtr = std::weak_ptr<Optimizer>;
using PredicateFuncType = std::function<bool(const AnfNodePtr &)>;
using TransformFuncType = std::function<AnfNodePtr(const OptimizerPtr &, const AnfNodePtr &)>;
// Define the interaction mode between an Optimize pass and Renormalize pass
// FORCE_RENORM: if the pass modified the graph then the next Renormalize will be executed
......@@ -43,26 +37,26 @@ enum RenormAction : int { FORCE_RENORM = 0, CHECK_RENORM };
class Substitution {
public:
TransformFuncType transform_{nullptr};
OptimizerCallerPtr transform_;
std::string name_;
PredicateFuncType predicate_{nullptr};
// An enum marking how this Substitution relates to the Renormalize pass.
RenormAction renorm_action_;
Substitution(const TransformFuncType &transform, const std::string &name, const PredicateFuncType &predicate,
Substitution(const OptimizerCallerPtr &transform, const std::string &name, const PredicateFuncType &predicate,
const RenormAction &renorm_action)
: transform_(transform), name_(name), predicate_(predicate), renorm_action_(renorm_action) {}
~Substitution() = default;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node) const;
AnfNodePtr operator()(const OptimizerPtr &optimizer, const AnfNodePtr &node);
};
using SubstitutionPtr = std::shared_ptr<Substitution>;
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name, const PrimitivePtr &prim,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name, const PrimitivePtr &prim,
const RenormAction &action_renorm = CHECK_RENORM);
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
const std::vector<PrimitivePtr> &prims,
const RenormAction &action_renorm = CHECK_RENORM);
SubstitutionPtr MakeSubstitution(const TransformFuncType &transform, const std::string &name,
SubstitutionPtr MakeSubstitution(const OptimizerCallerPtr &transform, const std::string &name,
const PredicateFuncType &predicate, const RenormAction &action_renorm = CHECK_RENORM);
class SubstitutionList {
......
......@@ -465,7 +465,7 @@ double ReshapeCost::GetForwardCommCost(const std::vector<TensorInfo> &inputs, co
CheckGlobalDeviceManager();
MS_EXCEPTION_IF_NULL(g_device_manager);
RankList dev_list = g_device_manager->GetDeviceListByStageId(stage_id);
TensorRedistribution tensor_redistribution;
TensorRedistribution tensor_redistribution(false, true);
if (tensor_redistribution.Init(inputs[0].tensor_layout(), outputs[0].tensor_layout(), dev_list) == FAILED) {
MS_LOG(EXCEPTION) << "Failure: tensor_redistribution init failed.";
}
......@@ -503,7 +503,7 @@ double ReshapeCost::GetForwardComputationCost(const std::vector<TensorInfo> &inp
CheckGlobalDeviceManager();
MS_EXCEPTION_IF_NULL(g_device_manager);
RankList dev_list = g_device_manager->GetDeviceListByStageId(stage_id);
TensorRedistribution tensor_redistribution;
TensorRedistribution tensor_redistribution(false, true);
if (tensor_redistribution.Init(inputs[0].tensor_layout(), outputs[0].tensor_layout(), dev_list) == FAILED) {
MS_LOG(EXCEPTION) << "Failure: tensor_redistribution init failed.";
}
......
......@@ -62,6 +62,7 @@ void ParallelContext::Reset() {
enable_all_reduce_fusion_ = false;
strategy_ckpt_load_file_ = "";
strategy_ckpt_save_file_ = "";
enable_parallel_optimizer_ = false;
}
void ParallelContext::set_device_num(int32_t device_num) {
......
......@@ -100,6 +100,11 @@ class ParallelContext {
void set_strategy_ckpt_save_file(const std::string &strategy_ckpt_save_file);
std::string strategy_ckpt_save_file() const { return strategy_ckpt_save_file_; }
void set_enable_parallel_optimizer(bool enable_parallel_optimizer) {
enable_parallel_optimizer_ = enable_parallel_optimizer;
}
bool enable_parallel_optimizer() const { return enable_parallel_optimizer_; }
void Reset();
private:
......@@ -123,6 +128,7 @@ class ParallelContext {
std::map<std::string, std::vector<uint32_t>> all_reduce_fusion_split_sizes_;
std::string strategy_ckpt_load_file_;
std::string strategy_ckpt_save_file_;
bool enable_parallel_optimizer_;
};
void ParallelParameterContextInit(const FuncGraphPtr &func_graph);
......
......@@ -205,6 +205,10 @@ PYBIND11_MODULE(_c_expression, m) {
.def("get_strategy_ckpt_save_file", &ParallelContext::strategy_ckpt_save_file, "Get strategy checkpoint save file.")
.def("set_full_batch", &ParallelContext::set_full_batch, "Set whether load full batch on each device.")
.def("get_full_batch", &ParallelContext::full_batch, "Get whether load full batch on each device.")
.def("set_enable_parallel_optimizer", &ParallelContext::set_enable_parallel_optimizer,
"Set enable/disable parallel optimizer.")
.def("get_enable_parallel_optimizer", &ParallelContext::enable_parallel_optimizer,
"Get enable/disable parallel optimizer.")
.def("reset", &ParallelContext::Reset, "Reset auto parallel context.");
(void)py::class_<CostModelContext, std::shared_ptr<CostModelContext>>(m, "CostModelContext")
......
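On the C++ side these bindings forward to a plain getter/setter pair on the ParallelContext singleton. A sketch, assuming the usual GetInstance accessor on ParallelContext:
auto parallel_context = parallel::ParallelContext::GetInstance();
parallel_context->set_enable_parallel_optimizer(true);
MS_LOG(INFO) << "parallel optimizer enabled: " << parallel_context->enable_parallel_optimizer();
// Note: Reset() now also restores enable_parallel_optimizer_ to false (see the change above).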
......@@ -35,7 +35,7 @@ bool CheckEqualKernelBuildInfo(const AnfNodePtr &main, const AnfNodePtr &node) {
}
} // namespace
bool BackendCSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) const {
bool BackendCSE::CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node, bool) const {
MS_EXCEPTION_IF_NULL(main);
MS_EXCEPTION_IF_NULL(node);
......
......@@ -31,7 +31,7 @@ class BackendCSE : public CSE {
public:
BackendCSE() = default;
~BackendCSE() override = default;
bool CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node) const override;
bool CheckReplace(const AnfNodePtr &main, const AnfNodePtr &node, bool check_side_effect = true) const override;
};
} // namespace opt
} // namespace mindspore
......
......@@ -33,5 +33,6 @@ const char GRAPH_FLAG_LOOP_CAN_UNROLL[] = "loop_can_unroll";
const char GRAPH_FLAG_HAS_EFFECT[] = "has_effect";
const char GRAPH_FLAG_EFFECT_PATIAL_ORDER[] = "_effect_patial_order";
const char GRAPH_FLAG_RANDOM_EFFECT[] = "_random_effect";
const char GRAPH_FLAG_SIDE_EFFECT[] = "_side_effect";
} // namespace mindspore
......@@ -34,7 +34,7 @@ extern const char GRAPH_FLAG_LOOP_CAN_UNROLL[];
extern const char GRAPH_FLAG_HAS_EFFECT[];
extern const char GRAPH_FLAG_EFFECT_PATIAL_ORDER[];
extern const char GRAPH_FLAG_RANDOM_EFFECT[];
extern const char GRAPH_FLAG_SIDE_EFFECT[];
} // namespace mindspore
#endif // PYBIND_API_EXPORT_FLAGS_H_
......@@ -33,6 +33,21 @@ static constexpr size_t kCNodeSwitchLayerLength = 3;
namespace mindspore {
namespace session {
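// Find the LabelGoto/LabelSwitch node in the parent graph's execution order that jumps to the
// child graph's start label, so that inserted Assign nodes can be forced to run before the jump.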
static CNodePtr GetJumpNode(NotNull<KernelGraphPtr> parent_graph, NotNull<KernelGraphPtr> child_graph) {
auto &nodes = parent_graph->execution_order();
for (auto &node : nodes) {
if (IsPrimitiveCNode(node, prim::kPrimLabelGoto) && child_graph->get_start_label() == node->input(kCNodeCallArg)) {
return node;
} else if (IsPrimitiveCNode(node, prim::kPrimLabelSwitch) &&
(child_graph->get_start_label() == node->input(kCNodeSwitchFalse) ||
child_graph->get_start_label() == node->input(kCNodeSwitchTrue))) {
return node;
}
}
MS_LOG(INFO) << "Cannot find jump node from " << parent_graph->ToString() << " to " << child_graph->ToString();
return nullptr;
}
static void InitUnionFindSet(NotNull<KernelGraphPtr> kg, const NotNull<UnionFindSet<AnfNodePtr> *> union_find_set,
const NotNull<std::set<KernelGraphPtr> *> memo) {
if (memo->find(kg.get()) != memo->end()) {
......@@ -200,7 +215,8 @@ void AscendControlParser::ChildGraphDataAssign(const std::map<uint32_t, KernelGr
if (target_graph_iter == graph_id_map.end()) {
MS_LOG(EXCEPTION) << "Graph id " << AnfAlgo::GetGraphId(arg.get()) << " not found.";
}
InsertMultipleAssignToGraph(NOT_NULL(target_graph_iter->second), NOT_NULL(arg), NOT_NULL(parameter));
InsertMultipleAssignToGraph(NOT_NULL(target_graph_iter->second), NOT_NULL(kg), NOT_NULL(arg),
NOT_NULL(parameter));
}
}
}
......@@ -263,7 +279,7 @@ NotNull<CNodePtr> AscendControlParser::ProcessKernelGraph(NotNull<KernelGraphPtr
RecurseSwitchLayer(kg, NOT_NULL(cnode), GetNextRealKernel(nodes, i + 1), memo);
}
}
kg->SetExecOrderByDefault();
MS_LOG(INFO) << "End KernelGraph process: " << kg->ToString();
return NOT_NULL(start_label);
}
......@@ -433,7 +449,8 @@ std::tuple<CNodePtr, KernelGraphPtr> AscendControlParser::ParsePartial(NotNull<A
return {partial_cnode, branch_kg};
}
void AscendControlParser::InsertMultipleAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from,
void AscendControlParser::InsertMultipleAssignToGraph(NotNull<KernelGraphPtr> from_graph,
NotNull<KernelGraphPtr> to_graph, NotNull<AnfNodePtr> from,
NotNull<AnfNodePtr> to) {
std::vector<AnfNodePtr> from_outputs = AnfAlgo::GetAllOutput(from, {prim::kPrimTupleGetItem});
std::vector<AnfNodePtr> to_outputs = AnfAlgo::GetAllOutput(to, {prim::kPrimTupleGetItem});
......@@ -443,18 +460,24 @@ void AscendControlParser::InsertMultipleAssignToGraph(NotNull<KernelGraphPtr> kg
<< to_outputs.size() << "]";
}
for (size_t i = 0; i < from_outputs.size(); i++) {
InsertAssignToGraph(kg, NOT_NULL(from_outputs[i]), NOT_NULL(to_outputs[i]));
auto assign_node = InsertAssignToGraph(from_graph, NOT_NULL(from_outputs[i]), NOT_NULL(to_outputs[i]));
if (assign_node != nullptr) {
auto jump_node = GetJumpNode(from_graph, to_graph);
if (jump_node != nullptr) {
InsertControlDependToGraph(from_graph, NOT_NULL(assign_node), NOT_NULL(jump_node));
}
}
}
}
void AscendControlParser::InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from,
NotNull<AnfNodePtr> to) {
AnfNodePtr AscendControlParser::InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from,
NotNull<AnfNodePtr> to) {
if (AnfAlgo::OutputAddrExist(from, 0) && AnfAlgo::OutputAddrExist(to, 0) &&
AnfAlgo::GetOutputAddr(from, 0) == AnfAlgo::GetOutputAddr(to, 0)) {
return;
return nullptr;
}
if (from.get() == to.get()) {
return;
return nullptr;
}
MS_LOG(INFO) << "Insert assign to graph " << kg->ToString() << " from " << from->DebugString() << " to "
<< to->DebugString();
......@@ -466,6 +489,7 @@ void AscendControlParser::InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNul
assign_node->set_abstract(to->abstract());
// append the assign at the end of from graph
InsertDependToGraph(kg, NOT_NULL(assign_node));
return assign_node;
}
std::vector<CNodePtr> AscendControlParser::RecurseGraph(NotNull<KernelGraphPtr> graph,
......
......@@ -52,8 +52,9 @@ class AscendControlParser {
const CNodePtr &last_label);
static std::tuple<CNodePtr, KernelGraphPtr> ParsePartial(NotNull<AnfNodePtr> node);
static void InsertMultipleAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static void InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static void InsertMultipleAssignToGraph(NotNull<KernelGraphPtr> from_graph, NotNull<KernelGraphPtr> to_graph,
NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
// root graph order
static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode,
......
......@@ -521,6 +521,47 @@ std::vector<AnfNodePtr> KernelGraph::GetOutputNodes(const AnfNodePtr &node) {
return output_nodes;
}
// Find control_depend real input nodes.
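// E.g. for ControlDepend(prior = Depend(A, B), depend = MakeTuple(C, D)), the prior side
// resolves to the real kernels reached through A and B, and the depend side to C and D.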
void GetAllFatherRealNode(const AnfNodePtr &anf_node, std::vector<AnfNodePtr> *result, std::set<AnfNodePtr> *visited) {
MS_EXCEPTION_IF_NULL(anf_node);
MS_EXCEPTION_IF_NULL(result);
MS_EXCEPTION_IF_NULL(visited);
if (visited->find(anf_node) != visited->end()) {
MS_LOG(WARNING) << "Node:" << anf_node->fullname_with_scope() << " has alreday been visited";
return;
}
visited->insert(anf_node);
if (AnfAlgo::IsRealKernel(anf_node)) {
result->emplace_back(anf_node);
return;
}
if (!anf_node->isa<CNode>()) {
return;
}
auto cnode = anf_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (cnode->inputs().empty()) {
MS_LOG(EXCEPTION) << "Illegal null input of cnode(%s)" << anf_node->DebugString();
}
auto input0 = cnode->input(0);
if (IsPrimitive(input0, prim::kPrimMakeTuple)) {
for (size_t i = 1; i < cnode->inputs().size(); ++i) {
GetAllFatherRealNode(cnode->input(i), result, visited);
}
} else if (IsPrimitive(input0, prim::kPrimTupleGetItem)) {
if (cnode->inputs().size() != kTupleGetItemInputSize) {
MS_LOG(EXCEPTION) << "The node tuple_get_item must have 2 inputs!";
}
GetAllFatherRealNode(cnode->input(kRealInputNodeIndexInTupleGetItem), result, visited);
} else if (IsPrimitive(input0, prim::kPrimDepend)) {
if (cnode->inputs().size() != kDependInputSize) {
MS_LOG(EXCEPTION) << "Depend node must have 2 inputs!";
}
GetAllFatherRealNode(cnode->input(kRealInputIndexInDepend), result, visited);
GetAllFatherRealNode(cnode->input(kDependAttachNodeIndex), result, visited);
}
}
// update the depend relations of control depend
void KernelGraph::UpdateControlDependRelations(const std::vector<AnfNodePtr> &depends) {
for (const auto &node : depends) {
......@@ -551,11 +592,24 @@ void KernelGraph::UpdateControlDependRelations(const std::vector<AnfNodePtr> &de
if (depend_node->isa<Parameter>() && depend_mode == 1) {
depend_nodes = GetOutputNodes(depend_node);
}
for (auto &first_node : prior_nodes) {
std::vector<AnfNodePtr> real_prior_nodes;
std::set<AnfNodePtr> prior_visited;
for (const auto &tmp : prior_nodes) {
GetAllFatherRealNode(tmp, &real_prior_nodes, &prior_visited);
}
std::vector<AnfNodePtr> real_depend_nodes;
std::set<AnfNodePtr> depend_visited;
for (const auto &tmp : depend_nodes) {
GetAllFatherRealNode(tmp, &real_depend_nodes, &depend_visited);
}
for (auto &first_node : real_prior_nodes) {
if (AnfAlgo::CheckPrimitiveType(first_node, prim::kPrimControlDepend)) {
continue;
}
for (auto &second_node : depend_nodes) {
for (auto &second_node : real_depend_nodes) {
if (AnfAlgo::CheckPrimitiveType(second_node, prim::kPrimControlDepend)) {
continue;
}
......
......@@ -33,9 +33,14 @@
namespace py = pybind11;
namespace mindspore::inference {
std::shared_ptr<FuncGraph> LoadModel(const char *model_buf, size_t size, const std::string &device) {
inference::Session::RegAllOp();
auto anf_graph = lite::AnfConverter::RunAnfConverter(model_buf, size);
return anf_graph;
try {
inference::Session::RegAllOp();
auto anf_graph = lite::AnfConverter::RunAnfConverter(model_buf, size);
return anf_graph;
} catch (std::exception &e) {
MS_LOG(ERROR) << "Inference LoadModel failed";
return nullptr;
}
}
void ExitInference() {
......@@ -51,12 +56,17 @@ void ExitInference() {
}
std::shared_ptr<MSSession> MSSession::CreateSession(const std::string &device, uint32_t device_id) {
auto session = std::make_shared<inference::Session>();
auto ret = session->Init(device, device_id);
if (ret != 0) {
try {
auto session = std::make_shared<inference::Session>();
auto ret = session->Init(device, device_id);
if (ret != 0) {
return nullptr;
}
return session;
} catch (std::exception &e) {
MS_LOG(ERROR) << "Inference CreatSession failed";
return nullptr;
}
return session;
}
void Session::RegAllOp() {
......@@ -113,47 +123,71 @@ void Session::RegAllOp() {
uint32_t Session::CompileGraph(std::shared_ptr<FuncGraph> funcGraphPtr) {
MS_ASSERT(session_impl_ != nullptr);
auto graph_id = session_impl_->CompileGraph(NOT_NULL(funcGraphPtr));
py::gil_scoped_release gil_release;
return graph_id;
try {
auto graph_id = session_impl_->CompileGraph(NOT_NULL(funcGraphPtr));
py::gil_scoped_release gil_release;
return graph_id;
} catch (std::exception &e) {
MS_LOG(ERROR) << "Inference CompileGraph failed";
return static_cast<uint32_t>(-1);
}
}
MultiTensor Session::RunGraph(uint32_t graph_id, const std::vector<std::shared_ptr<inference::MSTensor>> &inputs) {
std::vector<tensor::TensorPtr> inTensors;
inTensors.resize(inputs.size());
bool has_error = false;
std::transform(inputs.begin(), inputs.end(), inTensors.begin(),
[&has_error](const std::shared_ptr<inference::MSTensor> &tensor_ptr) -> tensor::TensorPtr {
if (tensor_ptr == nullptr) {
MS_LOG(WARNING) << "input MSTensor is nullptr, return nullptr";
has_error = true;
return nullptr;
}
auto tensor = static_cast<inference::Tensor *>(tensor_ptr.get());
if (tensor == nullptr) {
MS_LOG(ERROR) << "Can not cast input MSTensor to tensor";
has_error = true;
return nullptr;
}
return tensor->tensor();
});
if (has_error) {
MS_LOG(ERROR) << "Init Tensor failed, returning empty result";
std::vector<std::shared_ptr<inference::MSTensor>> multiTensor;
return multiTensor;
}
VectorRef outputs;
session_impl_->RunGraph(graph_id, inTensors, &outputs);
try {
std::vector<tensor::TensorPtr> inTensors;
inTensors.resize(inputs.size());
bool has_error = false;
std::transform(inputs.begin(), inputs.end(), inTensors.begin(),
[&has_error](const std::shared_ptr<inference::MSTensor> &tensor_ptr) -> tensor::TensorPtr {
if (tensor_ptr == nullptr) {
MS_LOG(WARNING) << "input MSTensor is nullptr, return nullptr";
has_error = true;
return nullptr;
}
auto tensor = static_cast<inference::Tensor *>(tensor_ptr.get());
if (tensor == nullptr) {
MS_LOG(ERROR) << "Can not cast input MSTensor to tensor";
has_error = true;
return nullptr;
}
return tensor->tensor();
});
if (has_error) {
MS_LOG(ERROR) << "Init Tensor failed, returning empty result";
std::vector<std::shared_ptr<inference::MSTensor>> multiTensor;
return multiTensor;
}
VectorRef outputs;
session_impl_->RunGraph(graph_id, inTensors, &outputs);
return TransformVectorRefToMultiTensor(outputs);
return TransformVectorRefToMultiTensor(outputs);
} catch (std::exception &e) {
MS_LOG(ERROR) << "Inference Rungraph failed";
return MultiTensor();
}
}
namespace {
string AjustTargetName(const std::string &device) {
if (device == kAscendDevice) {
return std::string(kAscendDevice) + "Inference";
} else {
MS_LOG(ERROR) << "Only support device Ascend right now";
return "";
}
}
} // namespace
int Session::Init(const std::string &device, uint32_t device_id) {
RegAllOp();
auto ms_context = MsContext::GetInstance();
ms_context->set_execution_mode(kGraphMode);
ms_context->set_device_target(kAscendDevice);
session_impl_ = session::SessionFactory::Get().Create(device);
ms_context->set_device_id(device_id);
auto ajust_device = AjustTargetName(device);
if (ajust_device == "") {
return -1;
}
ms_context->set_device_target(device);
session_impl_ = session::SessionFactory::Get().Create(ajust_device);
if (session_impl_ == nullptr) {
MS_LOG(ERROR) << "Session create failed!, please make sure target device:" << device << " is available.";
return -1;
......
......@@ -81,7 +81,15 @@ BaseRef CreateOneTensor(const AnfNodePtr &node, size_t output_index, const Kerne
}
}
// If the process reaches here, it means item_with_index is a real node (Parameter, or executable CNode).
auto address = AnfAlgo::GetOutputAddr(node, output_index);
DeviceAddressPtr address;
auto is_all_nop_node = opt::IsAllNopNode(&graph);
if (is_all_nop_node) {
// The graph does not remove the nop node.
address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
} else {
// The graph removes the nop node.
address = AnfAlgo::GetMutableOutputAddr(node, output_index, true);
}
MS_EXCEPTION_IF_NULL(address);
auto shape = AnfAlgo::GetOutputInferShape(node, output_index);
TypeId type_id = kNumberTypeFloat32;
......@@ -93,7 +101,7 @@ BaseRef CreateOneTensor(const AnfNodePtr &node, size_t output_index, const Kerne
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->execution_mode() == kPynativeMode || ms_context->device_target() == kGPUDevice) {
tensor->set_device_address(AnfAlgo::GetMutableOutputAddr(node, output_index));
tensor->set_device_address(address);
tensor->set_dirty(false);
} else if (!address->SyncDeviceToHost(trans::GetRuntimePaddingShape(node, output_index),
LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c())) {
......
......@@ -1646,7 +1646,7 @@ bool DfGraphConvertor::GetControlDependList(const CNodePtr &node,
dst_ops_list->insert(dst_ops_list->end(), converted_list.begin(), converted_list.end());
}
if (src_ops_list->empty() || dst_ops_list->empty()) {
MS_LOG(WARNING) << "Control depend node's src or dest node is not a apply node, ignore it";
MS_LOG(DEBUG) << "Control depend node's src or dest node is not a CNode, ignore it";
error_ = SUCCESS;
}
return true;
......@@ -1690,6 +1690,8 @@ void DfGraphConvertor::ConvertControlDependNode(const CNodePtr node) {
});
} else if (src_ops_list->size() == 1 && dst_ops_list->size() == 1) {
control_edges.push_back({(*src_ops_list)[0], (*dst_ops_list)[0]});
} else if (src_ops_list->empty() || dst_ops_list->empty()) {
MS_LOG(DEBUG) << "Depend list of src or dst is empty, ignore it";
} else {
MS_LOG(ERROR) << "Convert control depend node to operator failed, depend src:" << src_ops_list->size()
<< " -> dst:" << dst_ops_list->size();
......
......@@ -463,7 +463,7 @@ void InitSubModulesLogLevel() {
// set submodule's log level
auto submodule = GetEnv("MS_SUBMODULE_LOG_v");
MS_LOG(INFO) << "MS_SUBMODULE_LOG_v=`" << submodule << "`";
MS_LOG(DEBUG) << "MS_SUBMODULE_LOG_v=`" << submodule << "`";
LogConfigParser parser(submodule);
auto configs = parser.Parse();
for (const auto &cfg : configs) {
......@@ -489,22 +489,14 @@ void InitSubModulesLogLevel() {
} // namespace mindspore
extern "C" {
// shared lib init hook
#if defined(_WIN32) || defined(_WIN64)
__attribute__((constructor)) void mindspore_log_init(void) {
__attribute__((constructor)) void common_log_init(void) {
#else
void mindspore_log_init(void) {
void common_log_init(void) {
#endif
#ifdef USE_GLOG
// do not use glog predefined log prefix
FLAGS_log_prefix = false;
static bool is_glog_initialzed = false;
if (!is_glog_initialzed) {
#if !defined(_WIN32) && !defined(_WIN64)
google::InitGoogleLogging("mindspore");
#endif
is_glog_initialzed = true;
}
// set default log level to WARNING
if (mindspore::GetEnv("GLOG_v").empty()) {
FLAGS_v = mindspore::WARNING;
......@@ -525,4 +517,22 @@ void mindspore_log_init(void) {
#endif
mindspore::InitSubModulesLogLevel();
}
// shared lib init hook
#if defined(_WIN32) || defined(_WIN64)
__attribute__((constructor)) void mindspore_log_init(void) {
#else
void mindspore_log_init(void) {
#endif
#ifdef USE_GLOG
static bool is_glog_initialzed = false;
if (!is_glog_initialzed) {
#if !defined(_WIN32) && !defined(_WIN64)
google::InitGoogleLogging("mindspore");
#endif
is_glog_initialzed = true;
}
#endif
common_log_init();
}
}
......@@ -252,6 +252,7 @@ constexpr auto kControlDependMode = "depend_mode";
// index define of depend
constexpr auto kRealInputIndexInDepend = 1;
constexpr auto kDependAttachNodeIndex = 2;
constexpr auto kDependInputSize = 3;
// format
constexpr auto kOpFormat_DEFAULT = "DefaultFormat";
constexpr auto kOpFormat_NC1KHKWHWC0 = "NC1KHKWHWC0";
......
......@@ -22,6 +22,10 @@ from . import dtype as mstype
from ._register_for_tensor import tensor_operator_registry
__all__ = ['Tensor', 'MetaTensor']
np_types = (np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64, np.float16,
np.float32, np.float64, np.bool_)
class Tensor(Tensor_):
......@@ -54,6 +58,10 @@ class Tensor(Tensor_):
"""
def __init__(self, input_data, dtype=None):
# If input data is numpy number, convert it to np array
if isinstance(input_data, np_types):
input_data = np.array(input_data)
# If input_data is tuple/list/numpy.ndarray, it's support in check_type method.
check_type('tensor input_data', input_data, (Tensor_, float, int))
if dtype is not None:
......
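A quick illustration of what this hunk enables: numpy scalar types are converted to arrays before check_type runs, so the scalar form below no longer raises:

import numpy as np
from mindspore import Tensor

t1 = Tensor(np.float32(0.1))   # numpy scalar, now wrapped by np.array first
t2 = Tensor(np.array([1, 2]))  # ndarray path, unchanged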
......@@ -1040,7 +1040,7 @@ class Dataset:
Args:
columns (list[str], optional): List of columns to be used to specify the order of columns
(defaults=None, means all columns).
(default=None, means all columns).
Returns:
Iterator, list of ndarray.
......@@ -3382,7 +3382,7 @@ class ManifestDataset(MappableDataset):
class_indexing (dict, optional): A str-to-int mapping from label name to index
(default=None, the folder names will be sorted alphabetically and each
class will be given a unique index starting from 0).
decode (bool, optional): decode the images after reading (defaults=False).
decode (bool, optional): decode the images after reading (default=False).
num_shards (int, optional): Number of shards that the dataset should be divided
into (default=None).
shard_id (int, optional): The shard ID within num_shards (default=None). This
......@@ -4760,7 +4760,7 @@ class _NumpySlicesDataset:
def process_dict(self, input_data):
"""
Convert the dict like data into tuple format, when input is a tuple of dict then compose it into a dict first.
Convert dict-like data into tuple format; when the input is a tuple of dicts, compose it into one dict first.
"""
# Convert pandas like dict(has "values" column) into General dict
data_keys = list(input_data.keys())
......
......@@ -202,7 +202,7 @@ class RandomHorizontalFlip(cde.RandomHorizontalFlipOp):
Flip the input image horizontally, randomly with a given probability.
Args:
prob (float): Probability of the image being flipped (default=0.5).
prob (float, optional): Probability of the image being flipped (default=0.5).
"""
@check_prob
......@@ -217,7 +217,7 @@ class RandomHorizontalFlipWithBBox(cde.RandomHorizontalFlipWithBBoxOp):
Maintains data integrity by also flipping bounding boxes in an object detection pipeline.
Args:
prob (float): Probability of the image being flipped (default=0.5).
prob (float, optional): Probability of the image being flipped (default=0.5).
"""
@check_prob
......@@ -231,7 +231,7 @@ class RandomVerticalFlip(cde.RandomVerticalFlipOp):
Flip the input image vertically, randomly with a given probability.
Args:
prob (float): Probability of the image being flipped (default=0.5).
prob (float, optional): Probability of the image being flipped (default=0.5).
"""
@check_prob
......
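For reference, a minimal usage sketch of the ops documented above; the import path is assumed from this codebase's layout and may differ in other versions:

import mindspore.dataset.transforms.vision.c_transforms as C

flip = C.RandomHorizontalFlip(prob=0.9)       # prob is optional, default 0.5
flip_bbox = C.RandomHorizontalFlipWithBBox()  # flips bounding boxes together with the image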
......@@ -29,8 +29,9 @@ from .optimizer import Optimizer
_adam_opt = C.MultitypeFuncGraph("adam_opt")
@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag):
@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
"Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag, optim_filter):
"""
Update parameters.
......@@ -44,38 +45,44 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, grad
m (Tensor): m value of parameters.
v (Tensor): v value of parameters.
gradient (Tensor): Gradient of parameters.
decay_flag (bool): Applies weight decay or not.
optim_filter (bool): Applies parameter update or not.
Returns:
Tensor, the new value of v after updating.
"""
op_mul = P.Mul()
op_square = P.Square()
op_sqrt = P.Sqrt()
op_cast = P.Cast()
op_reshape = P.Reshape()
op_shape = P.Shape()
if optim_filter:
op_mul = P.Mul()
op_square = P.Square()
op_sqrt = P.Sqrt()
op_cast = P.Cast()
op_reshape = P.Reshape()
op_shape = P.Shape()
param_fp32 = op_cast(param, mstype.float32)
m_fp32 = op_cast(m, mstype.float32)
v_fp32 = op_cast(v, mstype.float32)
gradient_fp32 = op_cast(gradient, mstype.float32)
param_fp32 = op_cast(param, mstype.float32)
m_fp32 = op_cast(m, mstype.float32)
v_fp32 = op_cast(v, mstype.float32)
gradient_fp32 = op_cast(gradient, mstype.float32)
next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32)
- beta1, gradient_fp32)
next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32)
- beta2, op_square(gradient_fp32))
next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32)
- beta2, op_square(gradient_fp32))
update = next_m / (eps + op_sqrt(next_v))
if decay_flag:
update = op_mul(weight_decay_tensor, param_fp32) + update
update_with_lr = op_mul(lr, update)
next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))
update = next_m / (eps + op_sqrt(next_v))
if decay_flag:
update = op_mul(weight_decay_tensor, param_fp32) + update
next_v = F.depend(next_v, F.assign(param, op_cast(next_param, F.dtype(param))))
next_v = F.depend(next_v, F.assign(m, op_cast(next_m, F.dtype(m))))
next_v = F.depend(next_v, F.assign(v, op_cast(next_v, F.dtype(v))))
return next_v
update_with_lr = op_mul(lr, update)
next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))
next_param = F.depend(next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))
return next_param
return gradient
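In the docstring's notation, the guarded branch computes the standard AdamWeightDecay step in float32 (a sketch; the \lambda w_{t-1} term is added only when decay_flag holds, and when optim_filter is False the parameter belongs to another device and the gradient is passed through unchanged):

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
u_t = m_t / (\epsilon + \sqrt{v_t}) + \lambda w_{t-1}
w_t = w_{t-1} - \eta u_t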
def _check_param_value(beta1, beta2, eps, weight_decay, prim_name):
......@@ -300,7 +307,7 @@ class AdamWeightDecay(Optimizer):
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
Outputs:
tuple[Parameter], the updated velocity value, the shape is the same as `params`.
tuple[bool], all elements are True.
Examples:
>>> net = Net()
......@@ -328,11 +335,13 @@ class AdamWeightDecay(Optimizer):
def construct(self, gradients):
lr = self.get_lr()
updated_velocity = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor),
self.params, self.moments1, self.moments2, gradients, self.decay_flag)
return updated_velocity
optim_result = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor),
self.params, self.moments1, self.moments2, gradients,
self.decay_flag, self.optim_filter)
if self.use_parallel:
optim_result = self.broadcast_params(optim_result)
return optim_result
class AdamWeightDecayDynamicLR(Optimizer):
......@@ -363,7 +372,7 @@ class AdamWeightDecayDynamicLR(Optimizer):
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
Outputs:
tuple[Parameter], the updated velocity value, the shape is the same as `params`.
tuple[bool], all elements are True.
Examples:
>>> net = Net()
......@@ -424,12 +433,14 @@ class AdamWeightDecayDynamicLR(Optimizer):
warmup_lr = self.start_learning_rate * warmup_percent
is_warmup = self.cast(self.greater(self.warmup_steps, self.global_step), mstype.float32)
lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr
updated_velocity = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor),
self.params, self.moments1, self.moments2, gradients, self.decay_flag)
optim_result = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor),
self.params, self.moments1, self.moments2, gradients,
self.decay_flag, self.optim_filter)
if self.use_parallel:
optim_result = self.broadcast_params(optim_result)
added_global_step = self.global_step + self.one
F.control_depend(lr, added_global_step)
self.global_step = added_global_step
return updated_velocity
return optim_result
......@@ -32,11 +32,10 @@ num_one = Tensor(np.ones([1]), mstype.float32)
_lamb_opt = C.MultitypeFuncGraph("lamb_opt")
@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
"Tensor", "Tensor", "Tensor", "Tensor", "Bool")
@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
"Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, param, m, v,
gradient, decay_flag):
gradient, decay_flag, optim_filter):
"""
Update parameters.
......@@ -52,66 +51,66 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, para
v (Tensor): v value of parameters.
gradient (Tensor): Gradient of parameters.
decay_flag (bool): Specifies whether param update with weight decay.
optim_filter (bool): Applies parameter update or not.
Returns:
Tensor, the new value of v after updating.
"""
op_mul = P.Mul()
op_sqrt = P.Sqrt()
op_rsqrt = P.Rsqrt()
op_square = P.Square()
op_cast = P.Cast()
op_reshape = P.Reshape()
op_shape = P.Shape()
op_pow = P.Pow()
op_norm = layer.Norm()
op_select = P.Select()
op_greater = P.Greater()
op_fill = P.Fill()
op_dtype = P.DType()
param_fp32 = op_cast(param, mstype.float32)
m_fp32 = op_cast(m, mstype.float32)
v_fp32 = op_cast(v, mstype.float32)
gradient_fp32 = op_cast(gradient, mstype.float32)
next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one,
mstype.float32) - beta1, gradient_fp32)
next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one,
mstype.float32) - beta2, op_square(gradient_fp32))
next_mm = next_m / (op_cast(num_one, mstype.float32)
- op_pow(beta1, op_cast(global_step + num_one, mstype.float32)))
next_vv = next_v / (op_cast(num_one, mstype.float32) -
op_pow(beta2, op_cast(global_step + num_one, mstype.float32)))
w_norm = op_norm(param_fp32)
g_norm = op_norm(gradient_fp32)
g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt(
next_vv + eps)) + weight_decay_tensor * param_fp32)
zeros = F.zeros_like(w_norm)
ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
trust_ratio = op_select(
op_greater(w_norm, zeros),
op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
ones)
tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
update = next_mm / (op_sqrt(next_vv) + eps)
if decay_flag:
update = update + op_mul(weight_decay_tensor, param_fp32)
update_with_lr = op_mul(op_mul(trust_ratio, lr), update)
next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))
next_v = F.depend(next_v, F.assign(param, next_param))
next_v = F.depend(next_v, F.assign(m, next_m))
next_v = F.depend(next_v, F.assign(v, next_v))
return next_v
if optim_filter:
op_mul = P.Mul()
op_sqrt = P.Sqrt()
op_rsqrt = P.Rsqrt()
op_square = P.Square()
op_cast = P.Cast()
op_reshape = P.Reshape()
op_shape = P.Shape()
op_pow = P.Pow()
op_norm = layer.Norm()
op_select = P.Select()
op_greater = P.Greater()
op_fill = P.Fill()
op_dtype = P.DType()
param_fp32 = op_cast(param, mstype.float32)
m_fp32 = op_cast(m, mstype.float32)
v_fp32 = op_cast(v, mstype.float32)
gradient_fp32 = op_cast(gradient, mstype.float32)
next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta1, gradient_fp32)
next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))
next_mm = next_m / (op_cast(num_one, mstype.float32)
- op_pow(beta1, op_cast(global_step + num_one, mstype.float32)))
next_vv = next_v / (op_cast(num_one, mstype.float32) -
op_pow(beta2, op_cast(global_step + num_one, mstype.float32)))
w_norm = op_norm(param_fp32)
g_norm = op_norm(gradient_fp32)
g_norm_hat = op_norm(op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay_tensor * param_fp32)
zeros = F.zeros_like(w_norm)
ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
trust_ratio = op_select(
op_greater(w_norm, zeros),
op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
ones)
tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
update = next_mm / (op_sqrt(next_vv) + eps)
if decay_flag:
update = update + op_mul(weight_decay_tensor, param_fp32)
update_with_lr = op_mul(op_mul(trust_ratio, lr), update)
next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))
next_param = F.depend(next_param, F.assign(param, next_param))
next_param = F.depend(next_param, F.assign(m, next_m))
next_param = F.depend(next_param, F.assign(v, next_v))
return next_param
return gradient
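The guarded branch implements the LAMB trust-ratio step on top of the same moments (a sketch; note that \epsilon enters the trust-ratio norm inside the square root, via Rsqrt(\hat{v}_t + \epsilon), but the update outside it, exactly as in the code):

\hat{m}_t = m_t / (1 - \beta_1^{t+1}),  \hat{v}_t = v_t / (1 - \beta_2^{t+1})   with t = global_step
u_t = \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon) + \lambda w_{t-1}   (decay term only when decay_flag)
r_t = clip(\lVert w_{t-1} \rVert / \lVert \hat{m}_t / \sqrt{\hat{v}_t + \epsilon} + \lambda w_{t-1} \rVert, 0, 10),   with r_t = 1 when \lVert w \rVert or \lVert g \rVert is zero
w_t = w_{t-1} - \eta r_t u_t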
lamb_opt_graph_kernel = C.MultitypeFuncGraph("lamb_opt_graph_kernel")
......@@ -238,7 +237,7 @@ class Lamb(Optimizer):
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
Outputs:
tuple[Parameter], the updated velocity value, the shape is the same as `params`.
tuple[bool], all elements are True.
Examples:
>>> net = Net()
......@@ -311,18 +310,21 @@ class Lamb(Optimizer):
self.warmup_steps, self.global_step), mstype.float32)
lr = (self.one - is_warmup) * lr + is_warmup * warmup_lr
if self.enable_graph_kernel:
updated_velocity = self.hyper_map(F.partial(lamb_opt_graph_kernel,
self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor, self.global_step),
self.params, self.moments1, self.moments2, gradients, self.decay_flag)
optim_result = self.hyper_map(F.partial(lamb_opt_graph_kernel,
self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor, self.global_step),
self.params, self.moments1, self.moments2, gradients, self.decay_flag)
else:
updated_velocity = self.hyper_map(F.partial(_lamb_opt,
self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor, self.global_step),
self.params, self.moments1, self.moments2, gradients, self.decay_flag)
optim_result = self.hyper_map(F.partial(_lamb_opt,
self.beta1, self.beta2, self.eps, lr,
self.weight_decay_tensor, self.global_step),
self.params, self.moments1, self.moments2, gradients,
self.decay_flag, self.optim_filter)
if self.use_parallel:
optim_result = self.broadcast_params(optim_result)
added_global_step = self.global_step + self.one
F.control_depend(lr, added_global_step)
self.global_step = added_global_step
return updated_velocity
return optim_result
......@@ -22,11 +22,14 @@ from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.nn.cell import Cell
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common.initializer import initializer
from mindspore.common.tensor import Tensor
import mindspore.common.dtype as mstype
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from mindspore.common.tensor import Tensor
from mindspore import log as logger
from mindspore.parallel._utils import _get_global_rank, _get_device_num, _get_parallel_mode
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.parallel_utils import ParallelMode
__all__ = ['Optimizer']
......@@ -155,6 +158,27 @@ class Optimizer(Cell):
self.param_length = len(self.parameters)
self.map_ = C.Map()
use_parallel = auto_parallel_context().get_enable_parallel_optimizer()
self.use_parallel = use_parallel
if use_parallel:
if self.cls_name not in ["Lamb", "AdamWeightDecayDynamicLR", "AdamWeightDecay"]:
raise RuntimeError("Optimizer segmentation does not support optimizer {}".format(self.cls_name))
if _get_parallel_mode() not in [ParallelMode.HYBRID_PARALLEL, ParallelMode.DATA_PARALLEL,
ParallelMode.AUTO_PARALLEL]:
raise RuntimeError("Optimizer segmentation does not support parallel mode {}".format
(_get_parallel_mode()))
self.dev_num = _get_device_num()
if self.dev_num > self.param_length:
raise RuntimeError("Optimizer segmentation can not be applied when the number of parameters {} is"
" less than the number of devices {}".format(self.param_length, self.dev_num))
self.param_rank = self._get_parameter_group_id()
self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
self.param_names = []
for param in self.parameters:
self.param_names.append(param.name)
else:
self.optim_filter = (True,) * self.param_length
def decay_weight(self, gradients):
"""
Weight decay.
......@@ -219,8 +243,32 @@ class Optimizer(Cell):
raise TypeError("Learning rate should be float, Tensor or Iterable.")
return lr
def _check_group_params(self, parameters):
"""Check group params."""
parse_keys = ['params', 'lr', 'weight_decay', 'order_params']
for group_param in parameters:
invalid_key = list(filter(lambda x: x not in parse_keys, group_param.keys()))
if invalid_key:
raise KeyError(f'The key "{invalid_key}" cannot be recognized in group params.')
if 'order_params' in group_param.keys():
if len(group_param.keys()) > 1:
raise ValueError("The order params dict in group parameters should "
"only include the 'order_params' key.")
if not isinstance(group_param['order_params'], Iterable):
raise TypeError("The value of 'order_params' should be an Iterable type.")
continue
if not group_param['params']:
raise ValueError("Optimizer got an empty group parameter list.")
for param in group_param['params']:
if not isinstance(param, Parameter):
raise TypeError("The group param should be an iterator of Parameter type.")
def _parse_group_params(self, parameters, learning_rate):
"""Parse group params."""
self._check_group_params(parameters)
if self.dynamic_lr:
dynamic_lr_length = learning_rate.size()
else:
......@@ -250,9 +298,6 @@ class Optimizer(Cell):
if dynamic_lr_length not in (lr_length, 0):
raise ValueError("The dynamic learning rate in group should be the same size.")
if not group_param['params']:
raise ValueError("Optimizer got an empty group parameter list.")
dynamic_lr_length = lr_length
self.dynamic_lr_length = dynamic_lr_length
......@@ -384,6 +429,51 @@ class Optimizer(Cell):
lr = self.learning_rate
return lr
def _get_parameter_group_id(self):
"""
Get the parameter partition group id, which is less than the number of devices.
Returns:
tuple, the group id tuple of parameters.
"""
rank_list = ()
count = 0
for _ in range(self.param_length):
rank_list = rank_list + (count,)
count = count + 1
if count == self.dev_num:
count = 0
return rank_list
def broadcast_params(self, optim_result):
"""
Apply Broadcast operations in the sequential order of parameter groups.
Returns:
bool, the status flag.
"""
param_group = []
key_group = []
for _ in range(self.dev_num):
param_group.append(F.make_tuple())
key_group.append(F.make_tuple())
for i in range(self.param_length):
param_group[self.param_rank[i]] = param_group[self.param_rank[i]] + (optim_result[i],)
key = P.MakeRefKey(self.param_names[i])()
key_group[self.param_rank[i]] = key_group[self.param_rank[i]] + (key,)
new_param_group = []
for root in range(self.dev_num):
ops = P.Broadcast(root)
next_params = ops(param_group[root])
new_param_group.append(next_params)
for i in range(F.tuple_len(next_params)):
F.assign(key_group[root][i], next_params[i])
status = True
for i in range(self.dev_num - 1):
status = F.control_depend(new_param_group[i][0], new_param_group[i+1])
return status
def construct(self, *hyper_params):
raise NotImplementedError
......
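A standalone sketch of the round-robin partition implemented by _get_parameter_group_id and consumed by optim_filter (illustrative names: parameter i is owned by device i mod dev_num, and a rank only runs the optimizer on the parameters it owns; broadcast_params then re-synchronizes each group from its owning rank):

def param_rank(param_length, dev_num):
    # Device assignment cycles 0, 1, ..., dev_num - 1, 0, 1, ...
    return tuple(i % dev_num for i in range(param_length))

ranks = param_rank(param_length=5, dev_num=2)         # (0, 1, 0, 1, 0)
local_rank = 0                                        # stands in for _get_global_rank()
optim_filter = tuple(r == local_rank for r in ranks)  # (True, False, True, False, True)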
......@@ -220,7 +220,9 @@ class DataWrapper(Cell):
def __init__(self, network, dataset_types, dataset_shapes, queue_name):
super(DataWrapper, self).__init__(auto_prefix=False, flags=network.get_flags())
# Also copy the flag in `network` construct
flags = getattr(network.__class__.construct, "_mindspore_flags", {})
self.add_flags(**flags)
self.get_next = P.GetNext(dataset_types, dataset_shapes, len(dataset_types), queue_name)
self.network = network
......
......@@ -47,6 +47,7 @@ from .gather_v2 import _gather_v2_akg
from .less import _less_akg
from .log import _log_akg
from .matmul import _matmul_akg
from .batchmatmul import _batchmatmul_akg
from .max_pool_grad_with_argmax import _max_pool_grad_with_argmax_akg
from .max_pool_with_argmax import _max_pool_with_argmax_akg
from .max import _max_akg
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""BatchMatMul op"""
from mindspore.ops.op_info_register import op_info_register
@op_info_register("""{
"op_name": "BatchMatMul",
"imply_type": "AutoDiff",
"fusion_type": "OPAQUE",
"attr": [
{
"name": "transpose_a",
"param_type": "optional",
"type": "bool"
},
{
"name": "transpose_b",
"param_type": "optional",
"type": "bool"
}
],
"inputs": [
{
"index": 0,
"dtype": [
"float16"
],
"format": [
"FRACTAL_NZ"
],
"name": "x1"
},
{
"index": 1,
"dtype": [
"float16"
],
"format": [
"FRACTAL_NZ"
],
"name": "x2"
}
],
"outputs": [
{
"index": 0,
"dtype": [
"float16"
],
"format": [
"FRACTAL_NZ"
],
"name": "output"
}
]
}""")
def _batchmatmul_akg():
"""BatchMatMul AKG register"""
return
......@@ -28,26 +28,8 @@ confusion_transpose_d_op_info = TBERegOp("ConfusionTransposeD") \
.attr("transpose_first", "required", "bool", "all") \
.input(0, "x", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.I8_FracNZ, DataType.I8_FracNZ) \
.dtype_format(DataType.I8_Default, DataType.I8_Default) \
.dtype_format(DataType.U8_FracNZ, DataType.U8_FracNZ) \
.dtype_format(DataType.U8_Default, DataType.U8_Default) \
.dtype_format(DataType.I16_FracNZ, DataType.I16_FracNZ) \
.dtype_format(DataType.I16_Default, DataType.I16_Default) \
.dtype_format(DataType.U16_FracNZ, DataType.U16_FracNZ) \
.dtype_format(DataType.U16_Default, DataType.U16_Default) \
.dtype_format(DataType.I32_FracNZ, DataType.I32_FracNZ) \
.dtype_format(DataType.I32_Default, DataType.I32_Default) \
.dtype_format(DataType.U32_FracNZ, DataType.U32_FracNZ) \
.dtype_format(DataType.U32_Default, DataType.U32_Default) \
.dtype_format(DataType.I64_FracNZ, DataType.I64_FracNZ) \
.dtype_format(DataType.I64_Default, DataType.I64_Default) \
.dtype_format(DataType.U64_FracNZ, DataType.U64_FracNZ) \
.dtype_format(DataType.U64_Default, DataType.U64_Default) \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ) \
.dtype_format(DataType.F16_Default, DataType.F16_Default) \
.dtype_format(DataType.F32_FracNZ, DataType.F32_FracNZ) \
.dtype_format(DataType.F32_Default, DataType.F32_Default) \
.op_pattern("dynamicFormat") \
.dtype_format(DataType.None_None, DataType.None_None) \
.get_op_info()
......
......@@ -85,6 +85,22 @@ def _list_setitem_with_List(data, number_index, value):
return F.list_setitem(data, number_index, value)
@setitem.register("List", "Number", "Tuple")
def _list_setitem_with_Tuple(data, number_index, value):
"""
Assigns value to list.
Inputs:
data (list): Data of type list.
number_index (Number): Index of data.
value (tuple): Value given.
Outputs:
list, type is the same as the element type of data.
"""
return F.list_setitem(data, number_index, value)
@setitem.register("Dictionary", "String", "Tensor")
def _dict_setitem_with_tensor(data, key, value):
"""
......
......@@ -98,6 +98,7 @@ class AllReduce(PrimitiveWithInfer):
self.op = op
self.add_prim_attr('group', _get_group(group))
self.add_prim_attr('fusion', 0)
self.add_prim_attr('index', 0)
def vm_impl(self, x):
"""Implement by vm mode."""
......
......@@ -309,12 +309,6 @@ class Print(PrimitiveWithInfer):
Output tensor or string to stdout.
Note:
The print operation cannot support the following cases currently.
1. The type of tensor is float64 or bool.
2. The data of tensor is a scalar type.
In pynative mode, please use python print function.
Inputs:
......@@ -334,7 +328,7 @@ class Print(PrimitiveWithInfer):
@prim_attr_register
def __init__(self):
pass
self.add_prim_attr("_side_effect", True)
def __call__(self, *args):
for arg in args:
......
......@@ -888,7 +888,8 @@ class Neg(PrimitiveWithInfer):
def infer_value(self, input_x):
if input_x is not None:
input_x = input_x.asnumpy()
return Tensor(-input_x)
out = np.array(-input_x, input_x.dtype)
return Tensor(out)
return None
......@@ -1667,7 +1668,8 @@ class Div(_MathBinaryOp):
if x is not None and y is not None:
x = x.asnumpy()
y = y.asnumpy()
return Tensor(x / y)
out = np.array(x / y, x.dtype)
return Tensor(out)
return None
......
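A small numpy illustration of why these hunks pin the dtype during constant folding: plain Tensor(x / y) would inherit numpy's promotion (true division of int32 arrays yields float64), while np.array(out, x.dtype) keeps the inferred constant aligned with the op's input dtype:

import numpy as np

x = np.array([2, 7], np.int32)
y = np.array([2, 2], np.int32)
print((x / y).dtype)                    # float64: true division promotes
print(np.array(x / y, x.dtype).dtype)   # int32, matching the input dtype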
......@@ -59,8 +59,7 @@ class Assign(PrimitiveWithInfer):
return variable
def infer_dtype(self, variable, value):
args = {"variable": variable, "value": value}
validator.check_tensor_type_same(args, (mstype.bool_,) + mstype.number_type, self.name)
# Add a type validation later when we don't have to assign a value to RefKey.
return variable
......
......@@ -400,6 +400,23 @@ class _AutoParallelContext:
self.check_context_handle()
return self._context_handle.get_global_rank_is_set()
def set_enable_parallel_optimizer(self, enable_parallel_optimizer):
"""
Set enable/disable parallel optimizer.
Args:
enable_parallel_optimizer (bool): Enable/disable parallel optimizer.
"""
self.check_context_handle()
if not isinstance(enable_parallel_optimizer, bool):
raise TypeError('enable_parallel_optimizer must be a bool value')
self._context_handle.set_enable_parallel_optimizer(enable_parallel_optimizer)
def get_enable_parallel_optimizer(self):
"""Get parallel optimizer flag."""
self.check_context_handle()
return self._context_handle.get_enable_parallel_optimizer()
def reset(self):
"""Reset all settings."""
self.check_context_handle()
......@@ -433,7 +450,8 @@ _set_auto_parallel_context_func_map = {
"parameter_broadcast": auto_parallel_context().set_parameter_broadcast,
"strategy_ckpt_load_file": auto_parallel_context().set_strategy_ckpt_load_file,
"strategy_ckpt_save_file": auto_parallel_context().set_strategy_ckpt_save_file,
"full_batch": auto_parallel_context().set_full_batch}
"full_batch": auto_parallel_context().set_full_batch,
"enable_parallel_optimizer": auto_parallel_context().set_enable_parallel_optimizer}
_get_auto_parallel_context_func_map = {
......@@ -447,13 +465,15 @@ _get_auto_parallel_context_func_map = {
"parameter_broadcast": auto_parallel_context().get_parameter_broadcast,
"strategy_ckpt_load_file": auto_parallel_context().get_strategy_ckpt_load_file,
"strategy_ckpt_save_file": auto_parallel_context().get_strategy_ckpt_save_file,
"full_batch": auto_parallel_context().get_full_batch}
"full_batch": auto_parallel_context().get_full_batch,
"enable_parallel_optimizer": auto_parallel_context().get_enable_parallel_optimizer}
@args_type_check(device_num=int, global_rank=int, mirror_mean=bool, cast_before_mirror=bool,
loss_repeated_mean=bool, parallel_mode=str, auto_parallel_search_mode=str,
parameter_broadcast=bool, strategy_ckpt_load_file=str,
strategy_ckpt_save_file=str, full_batch=bool)
strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool)
def _set_auto_parallel_context(**kwargs):
"""
Set auto parallel context.
......@@ -493,6 +513,7 @@ def _set_auto_parallel_context(**kwargs):
strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: ''
strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
full_batch (bool): Whether to load the whole batch on each device. Default: False.
enable_parallel_optimizer (bool): Enable optimizer segmentation or not. Default: False.
Raises:
ValueError: If input key is not attribute in auto parallel context.
......@@ -535,5 +556,6 @@ def _reset_auto_parallel_context():
- parameter_broadcast: False.
- strategy_ckpt_load_file: ""
- strategy_ckpt_save_file: ""
- enable_parallel_optimizer: False
"""
auto_parallel_context().reset()
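A short sketch of toggling the new flag through the singleton shown above (the same setter is also reachable through the kwargs-based _set_auto_parallel_context via the func map registered in this hunk):

from mindspore.parallel._auto_parallel_context import auto_parallel_context

auto_parallel_context().set_enable_parallel_optimizer(True)
assert auto_parallel_context().get_enable_parallel_optimizer()
auto_parallel_context().reset()  # restores the default, False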
......@@ -166,8 +166,11 @@ class SummaryCollector(Callback):
self._has_saved_custom_data = False
self._is_parse_loss_success = True
self._first_step = True
self._dataset_sink_mode = True
def __enter__(self):
self._first_step = True
self._dataset_sink_mode = True
self._record = SummaryRecord(log_dir=self._summary_dir)
return self
......@@ -279,15 +282,15 @@ class SummaryCollector(Callback):
def step_end(self, run_context):
cb_params = run_context.original_args()
if self._first_step:
# Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario
self._dataset_sink_mode = bool(cb_params.cur_step_num == cb_params.batch_num)
if cb_params.mode == ModeEnum.TRAIN.value:
# Make sure the first step data is recorded
if not self._first_step and cb_params.cur_step_num % self._collect_freq:
if not self._is_collect_this_step(cb_params):
return
self._first_step = False
if not self._has_saved_train_network:
self._collect_graphs(cb_params)
......@@ -295,6 +298,7 @@ class SummaryCollector(Callback):
self._collect_metric(cb_params)
self._collect_histogram(cb_params)
self._first_step = False
self._record.record(cb_params.cur_step_num)
def end(self, run_context):
......@@ -320,6 +324,18 @@ class SummaryCollector(Callback):
raise ValueError(f"There are more than one {self.__class__.__name__} instance in callback list,"
f"but expected only one {self.__class__.__name__} instance.")
def _is_collect_this_step(self, cb_params):
"""Decide whether to collect data for the current step."""
# Make sure the first step data is recorded
if not self._first_step:
if self._dataset_sink_mode:
if cb_params.cur_epoch_num % self._collect_freq:
return False
else:
if cb_params.cur_step_num % self._collect_freq:
return False
return True
@staticmethod
def _package_custom_lineage_data(custom_lineage_data):
"""
......
......@@ -318,10 +318,6 @@ def preprocess_fn(image, box, is_training):
else:
input_data = resize_column(*input_data)
photo = (np.random.rand() < config.photo_ratio)
if photo:
input_data = photo_crop_column(*input_data)
input_data = image_bgr_rgb(*input_data)
output_data = input_data
......@@ -432,19 +428,19 @@ def data_to_mindrecord_byte_image(dataset="coco", is_training=True, prefix="fast
writer.write_raw_data([row])
writer.commit()
def create_fasterrcnn_dataset(mindrecord_file, batch_size=2, repeat_num=12, device_num=1, rank_id=0,
is_training=True, num_parallel_workers=8):
is_training=True, num_parallel_workers=4):
"""Creatr FasterRcnn dataset with MindDataset."""
ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"], num_shards=device_num, shard_id=rank_id,
num_parallel_workers=num_parallel_workers, shuffle=is_training)
num_parallel_workers=1, shuffle=is_training)
decode = C.Decode()
ds = ds.map(input_columns=["image"], operations=decode)
ds = ds.map(input_columns=["image"], operations=decode, num_parallel_workers=1)
compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training))
hwc_to_chw = C.HWC2CHW()
normalize_op = C.Normalize((123.675, 116.28, 103.53), (58.395, 57.12, 57.375))
horizontally_op = C.RandomHorizontalFlip(1)
type_cast0 = CC.TypeCast(mstype.float32)
type_cast1 = CC.TypeCast(mstype.float16)
type_cast2 = CC.TypeCast(mstype.int32)
type_cast3 = CC.TypeCast(mstype.bool_)
......@@ -453,17 +449,18 @@ def create_fasterrcnn_dataset(mindrecord_file, batch_size=2, repeat_num=12, devi
ds = ds.map(input_columns=["image", "annotation"],
output_columns=["image", "image_shape", "box", "label", "valid_num"],
columns_order=["image", "image_shape", "box", "label", "valid_num"],
operations=compose_map_func, num_parallel_workers=4)
ds = ds.map(input_columns=["image"], operations=[normalize_op, type_cast0],
num_parallel_workers=num_parallel_workers)
operations=compose_map_func, num_parallel_workers=num_parallel_workers)
flip = (np.random.rand() < config.flip_ratio)
if flip:
ds = ds.map(input_columns=["image"], operations=[horizontally_op],
num_parallel_workers=num_parallel_workers)
ds = ds.map(input_columns=["image"], operations=[normalize_op, horizontally_op, hwc_to_chw, type_cast1],
num_parallel_workers=24)
ds = ds.map(input_columns=["image", "image_shape", "box", "label", "valid_num"],
operations=flipped_generation, num_parallel_workers=4)
operations=flipped_generation, num_parallel_workers=num_parallel_workers)
else:
ds = ds.map(input_columns=["image"], operations=[normalize_op, hwc_to_chw, type_cast1],
num_parallel_workers=24)
else:
ds = ds.map(input_columns=["image", "annotation"],
output_columns=["image", "image_shape", "box", "label", "valid_num"],
......@@ -471,11 +468,10 @@ def create_fasterrcnn_dataset(mindrecord_file, batch_size=2, repeat_num=12, devi
operations=compose_map_func,
num_parallel_workers=num_parallel_workers)
ds = ds.map(input_columns=["image"], operations=[normalize_op, type_cast0],
num_parallel_workers=num_parallel_workers)
ds = ds.map(input_columns=["image"], operations=[normalize_op, hwc_to_chw, type_cast1],
num_parallel_workers=24)
# transpose_column from python to c
ds = ds.map(input_columns=["image"], operations=[hwc_to_chw, type_cast1])
ds = ds.map(input_columns=["image_shape"], operations=[type_cast1])
ds = ds.map(input_columns=["box"], operations=[type_cast1])
ds = ds.map(input_columns=["label"], operations=[type_cast2])
......
......@@ -19,7 +19,9 @@ from easydict import EasyDict as edict
cifar_cfg = edict({
'num_classes': 10,
'lr_init': 0.05,
'lr_init': 0.01,
'lr_max': 0.1,
'warmup_epochs': 5,
'batch_size': 64,
'epoch_size': 70,
'momentum': 0.9,
......
......@@ -38,20 +38,25 @@ random.seed(1)
np.random.seed(1)
def lr_steps(global_step, lr_max=None, total_epochs=None, steps_per_epoch=None):
def lr_steps(global_step, lr_init, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
"""Set learning rate."""
lr_each_step = []
total_steps = steps_per_epoch * total_epochs
decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
warmup_steps = steps_per_epoch * warmup_epochs
if warmup_steps != 0:
inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
else:
inc_each_step = 0
for i in range(total_steps):
if i < decay_epoch_index[0]:
lr_each_step.append(lr_max)
elif i < decay_epoch_index[1]:
lr_each_step.append(lr_max * 0.1)
elif i < decay_epoch_index[2]:
lr_each_step.append(lr_max * 0.01)
if i < warmup_steps:
lr_value = float(lr_init) + inc_each_step * float(i)
else:
lr_each_step.append(lr_max * 0.001)
base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
lr_value = float(lr_max) * base * base
if lr_value < 0.0:
lr_value = 0.0
lr_each_step.append(lr_value)
current_step = global_step
lr_each_step = np.array(lr_each_step).astype(np.float32)
learning_rate = lr_each_step[current_step:]
......@@ -86,7 +91,8 @@ if __name__ == '__main__':
if args_opt.pre_trained:
load_param_into_net(net, load_checkpoint(args_opt.pre_trained))
lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
lr = lr_steps(0, lr_init=cfg.lr_init, lr_max=cfg.lr_max, warmup_epochs=cfg.warmup_epochs,
total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum,
weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
......
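The new schedule replaces the three-milestone step decay with linear warmup followed by quadratic decay. With W = warmup_epochs * steps_per_epoch warmup steps and T total steps, step i gets

\eta_i = \eta_{init} + i (\eta_{max} - \eta_{init}) / W,   for i < W
\eta_i = \eta_{max} (1 - (i - W) / (T - W))^2, floored at 0,   for i >= W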
......@@ -22,6 +22,7 @@
#include <vector>
#include <utility>
#include <memory>
#include <future>
#include "mindspore/ccsrc/utils/log_adapter.h"
#include "serving/ms_service.grpc.pb.h"
......@@ -40,7 +41,7 @@ namespace serving {
using MSTensorPtr = std::shared_ptr<inference::MSTensor>;
Status Session::CreatDeviceSession(const std::string &device, uint32_t device_id) {
session_ = inference::MSSession::CreateSession(device + "Inference", device_id);
session_ = inference::MSSession::CreateSession(device, device_id);
if (session_ == nullptr) {
MS_LOG(ERROR) << "Creat Session Failed";
return FAILED;
......@@ -67,6 +68,7 @@ Status Session::Predict(const std::vector<MSTensorPtr> &inputs, inference::Multi
MS_LOG(INFO) << "run Predict";
*outputs = session_->RunGraph(graph_id_, inputs);
MS_LOG(INFO) << "run Predict finished";
return SUCCESS;
}
......@@ -80,12 +82,16 @@ Status Session::Warmup(const MindSporeModelPtr model) {
std::string file_name = model->GetModelPath() + '/' + model->GetModelName();
char *graphBuf = ReadFile(file_name.c_str(), &size);
if (graphBuf == nullptr) {
MS_LOG(ERROR) << "Load graph model failed, file name is " << file_name.c_str();
MS_LOG(ERROR) << "Read model file failed, file name is " << file_name.c_str();
return FAILED;
}
last_graph_ = inference::LoadModel(graphBuf, size, device_type_);
if (last_graph_ == nullptr) {
MS_LOG(ERROR) << "Load graph model failed, file name is " << file_name.c_str();
return FAILED;
}
graph_id_ = session_->CompileGraph(last_graph_);
MS_LOG(INFO) << "Session Warmup";
MS_LOG(INFO) << "Session Warmup finished";
return SUCCESS;
}
......@@ -95,6 +101,9 @@ Status Session::Clear() {
}
namespace {
static const uint32_t uint32max = 0x7FFFFFFF;  // INT32_MAX; passed to SetMaxMessageSize as an effectively unlimited gRPC message size
std::promise<void> exit_requested;
const std::map<ms_serving::DataType, TypeId> type2id_map{
{ms_serving::MS_UNKNOWN, TypeId::kNumberTypeBegin}, {ms_serving::MS_BOOL, TypeId::kNumberTypeBool},
{ms_serving::MS_INT8, TypeId::kNumberTypeInt8}, {ms_serving::MS_UINT8, TypeId::kNumberTypeUInt8},
......@@ -141,7 +150,7 @@ MSTensorPtr ServingTensor2MSTensor(const ms_serving::Tensor &tensor) {
}
TypeId type = iter->second;
auto ms_tensor = std::shared_ptr<inference::MSTensor>(inference::MSTensor::CreateTensor(type, shape));
memcpy_s(ms_tensor->MutableData(), tensor.data().size(), tensor.data().data(), tensor.data().size());
memcpy_s(ms_tensor->MutableData(), ms_tensor->Size(), tensor.data().data(), tensor.data().size());
return ms_tensor;
}
......@@ -166,10 +175,7 @@ void ClearEnv() {
Session::Instance().Clear();
inference::ExitInference();
}
void HandleSignal(int sig) {
ClearEnv();
exit(0);
}
void HandleSignal(int sig) { exit_requested.set_value(); }
#ifdef ENABLE_D
static rtContext_t g_ctx = nullptr;
......@@ -247,6 +253,7 @@ Status Server::BuildAndStart() {
rtError_t rt_ret = rtCtxGetCurrent(&ctx);
if (rt_ret != RT_ERROR_NONE || ctx == nullptr) {
MS_LOG(ERROR) << "the ascend device context is null";
ClearEnv();
return FAILED;
}
g_ctx = ctx;
......@@ -258,6 +265,7 @@ Status Server::BuildAndStart() {
auto option = grpc::MakeChannelArgumentOption(GRPC_ARG_ALLOW_REUSEPORT, 0);
grpc::ServerBuilder builder;
builder.SetOption(std::move(option));
builder.SetMaxMessageSize(uint32max);
// Listen on the given address without any authentication mechanism.
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
// Register "service" as the instance through which we'll communicate with
......@@ -265,13 +273,20 @@ Status Server::BuildAndStart() {
builder.RegisterService(&service);
// Finally assemble the server.
std::unique_ptr<grpc::Server> server(builder.BuildAndStart());
if (server == nullptr) {
MS_LOG(ERROR) << "The serving server create failed";
ClearEnv();
return FAILED;
}
auto grpc_server_run = [&server]() { server->Wait(); };
std::thread serving_thread(grpc_server_run);
MS_LOG(INFO) << "Server listening on " << server_address << std::endl;
// Wait for the server to shutdown. Note that some other thread must be
// responsible for shutting down the server for this call to ever return.
server->Wait();
auto exit_future = exit_requested.get_future();
exit_future.wait();
ClearEnv();
server->Shutdown();
serving_thread.join();
return SUCCESS;
}
} // namespace serving
} // namespace mindspore
......@@ -29,7 +29,6 @@
namespace mindspore {
namespace serving {
char *ReadFile(const char *file, size_t *size) {
if (file == nullptr) {
MS_LOG(ERROR) << "file is nullptr";
......@@ -70,8 +69,8 @@ bool DirOrFileExist(const std::string &file_path) {
}
std::vector<std::string> GetAllSubDirs(const std::string &dir_path) {
DIR *dir;
struct dirent *ptr;
DIR *dir = nullptr;
struct dirent *ptr = nullptr;
std::vector<std::string> SubDirs;
if ((dir = opendir(dir_path.c_str())) == NULL) {
......
......@@ -36,17 +36,16 @@ bool RemovePrefix(std::string *str, const std::string &prefix) {
bool Option::ParseInt32(std::string *arg) {
if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) {
char extra;
int32_t parsed_value;
if (sscanf(arg->data(), "%d%c", &parsed_value, &extra) != 1) {
std::cout << "Parse " << name_ << "Error for option " << *arg << std::endl;
try {
parsed_value = std::stoi(arg->data());
} catch (const std::exception &) {  // also covers std::out_of_range from std::stoi
std::cout << "Parse " << name_ << " Error for option " << *arg << std::endl;
return false;
} else {
*int32_default_ = parsed_value;
}
*int32_default_ = parsed_value;
return true;
}
return false;
}
......@@ -76,17 +75,16 @@ bool Option::ParseString(std::string *arg) {
bool Option::ParseFloat(std::string *arg) {
if (RemovePrefix(arg, "--") && RemovePrefix(arg, name_) && RemovePrefix(arg, "=")) {
char extra;
float parsed_value;
if (sscanf(arg->data(), "%f%c", &parsed_value, &extra) != 1) {
std::cout << "Parse " << name_ << "Error for option " << *arg << std::endl;
try {
parsed_value = std::stof(arg->data());
} catch (const std::exception &) {  // also covers std::out_of_range from std::stof
std::cout << "Parse " << name_ << " Error for option " << *arg << std::endl;
return false;
} else {
*float_default_ = parsed_value;
}
*float_default_ = parsed_value;
return true;
}
return false;
}
......@@ -159,10 +157,11 @@ Options::Options() : args_(nullptr) { CreateOptions(); }
void Options::CreateOptions() {
args_ = std::make_shared<Arguments>();
std::vector<Option> options = {
Option("port", &args_->grpc_port, "Port to listen on for gRPC API, default is 5500"),
Option("model_name", &args_->model_name, "model name "),
Option("model_path", &args_->model_path, "the path of the model files"),
Option("device_id", &args_->device_id, "the device id, default is 0"),
Option("port", &args_->grpc_port,
"[Optional] Port to listen on for gRPC API, default is 5500, range from 1 to 65535"),
Option("model_name", &args_->model_name, "[Required] model name "),
Option("model_path", &args_->model_path, "[Required] the path of the model files"),
Option("device_id", &args_->device_id, "[Optional] the device id, default is 0, range from 0 to 7"),
};
options_ = options;
}
......@@ -176,6 +175,14 @@ bool Options::CheckOptions() {
std::cout << "device_type only support Ascend right now" << std::endl;
return false;
}
if (args_->device_id > 7) {
std::cout << "the device_id should be in [0~7]" << std::endl;
return false;
}
if (args_->grpc_port < 1 || args_->grpc_port > 65535) {
std::cout << "the port should be in [1~65535]" << std::endl;
return false;
}
return true;
}
......@@ -238,6 +245,5 @@ void Options::Usage() {
<< option.usage_ << std::endl;
}
}
} // namespace serving
} // namespace mindspore
......@@ -22,7 +22,6 @@
namespace mindspore {
namespace serving {
struct Arguments {
int32_t grpc_port = 5500;
std::string grpc_socket_path;
......@@ -40,6 +39,7 @@ class Option {
Option(const std::string &name, bool *default_point, const std::string &usage);
Option(const std::string &name, std::string *default_point, const std::string &usage);
Option(const std::string &name, float *default_point, const std::string &usage);
~Option() = default;
private:
friend class Options;
......@@ -77,7 +77,6 @@ class Options {
std::vector<Option> options_;
std::shared_ptr<Arguments> args_;
};
} // namespace serving
} // namespace mindspore
......
......@@ -19,7 +19,6 @@
namespace mindspore {
namespace serving {
MindSporeModel::MindSporeModel(const std::string &model_name, const std::string &model_path,
const std::string &model_version, const time_t &last_update_time)
: model_name_(model_name),
......
......@@ -25,7 +25,6 @@
namespace mindspore {
namespace serving {
volatile bool stop_poll = false;
std::string GetVersionFromPath(const std::string &path) {
......@@ -102,10 +101,10 @@ Status VersionController::CreateInitModels() {
}
std::vector<std::string> SubDirs = GetAllSubDirs(models_path_);
if (version_control_strategy_ == kLastest) {
auto path = SubDirs.empty() ? models_path_ : SubDirs.back();
std::string model_version = GetVersionFromPath(path);
time_t last_update_time = GetModifyTime(path);
MindSporeModelPtr model_ptr = std::make_shared<MindSporeModel>(model_name_, path, model_version, last_update_time);
std::string model_version = GetVersionFromPath(models_path_);
time_t last_update_time = GetModifyTime(models_path_);
MindSporeModelPtr model_ptr =
std::make_shared<MindSporeModel>(model_name_, models_path_, model_version, last_update_time);
valid_models_.emplace_back(model_ptr);
} else {
for (auto &dir : SubDirs) {
......@@ -119,8 +118,8 @@ Status VersionController::CreateInitModels() {
MS_LOG(ERROR) << "There is no valid model for serving";
return FAILED;
}
Session::Instance().Warmup(valid_models_.back());
return SUCCESS;
auto ret = Session::Instance().Warmup(valid_models_.back());
return ret;
}
void VersionController::StartPollModelPeriodic() {
......@@ -129,6 +128,5 @@ void VersionController::StartPollModelPeriodic() {
}
void VersionController::StopPollModelPeriodic() {}
} // namespace serving
} // namespace mindspore
......@@ -64,7 +64,6 @@ class PeriodicFunction {
VersionController::VersionControllerStrategy version_control_strategy_;
std::vector<MindSporeModelPtr> valid_models_;
};
} // namespace serving
} // namespace mindspore
......
......@@ -214,6 +214,7 @@ PredictRequest ReadBertInput() {
class MSClient {
public:
explicit MSClient(std::shared_ptr<Channel> channel) : stub_(MSService::NewStub(channel)) {}
~MSClient() = default;
std::string Predict(const std::string &type) {
// Data we are sending to the server.
......@@ -310,7 +311,6 @@ int main(int argc, char **argv) {
type = "add";
}
}
} else {
target_str = "localhost:5500";
type = "add";
......
......@@ -81,7 +81,7 @@ function checkopts()
checkopts "$@"
# switch to project root path, which contains clang-format config file '.clang-format'
cd "${SCRIPTS_PATH}/.." || exit 1
cd "${SCRIPTS_PATH}/../.." || exit 1
FMT_FILE_LIST='__format_files_list__'
......
......@@ -161,6 +161,7 @@ setup(
description='MindSpore is a new open source deep learning training/inference '
'framework that could be used for mobile, edge and cloud scenarios.',
long_description="\n\n".join([readme, release]),
long_description_content_type="text/markdown",
packages=find_packages(),
package_data=package_data,
include_package_data=True,
......
......@@ -190,9 +190,9 @@ TEST_F(MindDataTestBPlusTree, Test3) {
EXPECT_TRUE(rc.IsOk());
uint64_t min = ai.min_key();
uint64_t max = ai.max_key();
EXPECT_EQ(min, 1);
EXPECT_EQ(max, 4);
auto r = ai.Search(3);
EXPECT_EQ(min, 0);
EXPECT_EQ(max, 3);
auto r = ai.Search(2);
auto &it = r.first;
EXPECT_EQ(it.value(), "b");
MS_LOG(INFO) << "Dump all the values using [] operator.";
......
......@@ -77,10 +77,10 @@ class TestOptOpt : public UT::Common {
};
void SetUp() {
elim_Z = MakeSubstitution(irpass::AddByZero(), "elim_Z", prim::kPrimScalarAdd);
elim_R = MakeSubstitution(irpass::PrimEliminater(R), "elim_R", R);
idempotent_P = MakeSubstitution(IdempotentEliminater(), "idempotent_P", P);
Qct_to_P = MakeSubstitution(QctToP(), "Qct_to_P", Q);
elim_Z = MakeSubstitution(std::make_shared<irpass::AddByZero>(), "elim_Z", prim::kPrimScalarAdd);
elim_R = MakeSubstitution(std::make_shared<irpass::PrimEliminater>(R), "elim_R", R);
idempotent_P = MakeSubstitution(std::make_shared<IdempotentEliminater>(), "idempotent_P", P);
Qct_to_P = MakeSubstitution(std::make_shared<QctToP>(), "Qct_to_P", Q);
}
bool CheckTransform(FuncGraphPtr gbefore, FuncGraphPtr gafter, const SubstitutionList &transform) {
......
......@@ -327,6 +327,9 @@ TEST_F(TestStepParallel, CreatOpInstance) {
} else if (name == "instance_name") {
parse::ConvertData(py::cast<py::object>(item.second), &converted_ret);
ASSERT_EQ(converted_ret->ToString(), "test");
} else if (name == "index") {
parse::ConvertData(py::cast<py::object>(item.second), &converted_ret);
ASSERT_EQ(converted_ret->ToString(), "0");
} else {
MS_LOG(EXCEPTION) << "Test failed";
}
......
......@@ -4,6 +4,7 @@
"numParallelWorkers": 4,
"workerConnectorSize": 16,
"opConnectorSize": 16,
"seed": 5489
"seed": 5489,
"monitor_sampling_interval": 15
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.