diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 256693528960a9747f89df2c553102322ff1cbeb..c870efb16bc903d7415a47496fe758d578435848 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -160,6 +160,12 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
+paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
+paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
+paddle.fluid.layers.stanh ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None))
+paddle.fluid.layers.hard_sigmoid ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None))
+paddle.fluid.layers.swish ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.brelu ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None))
 paddle.fluid.layers.leaky_relu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None))
@@ -260,12 +266,6 @@ paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', de
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 4fb015b0ffe27e6fa91d4eaf373fca4feca66361..21f75957be5f33f3dfc09c41fa9a1e1ca590f99e 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,41 +20,79 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-template <typename T>
-class COWPtr {
+// Change it to thread safe flags if needed.
+class ThreadUnsafeOwnershipFlags {
  public:
-  typedef std::shared_ptr<T> RefPtr;
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
 
- private:
-  RefPtr m_sp;
+  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags& operator=(
+      const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
 
-  void detach() {
-    T* tmp = m_sp.get();
-    if (!(tmp == nullptr || m_sp.unique())) {
-      m_sp = RefPtr(new T(*tmp));
+  void SetOwnership(bool flag) { flag_ = flag; }
+
+  // Invoke the callback if it is not owned.
+  template <typename Callback>
+  void AcquireOwnershipOnce(Callback acquire) {
+    if (!flag_) {
+      acquire();
+      flag_ = true;
     }
   }
 
- public:
-  COWPtr() : m_sp(nullptr) {}
-  explicit COWPtr(T* t) : m_sp(t) {}
-  explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
+ private:
+  bool flag_;
+};
 
-  const T& Data() const { return operator*(); }
+// Copy-On-Write pointer.
+// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
+//
+// The template parameter OwnershipFlags should have:
+//   * a constructor takes a bool. True if own.
+//   * SetOwnership(bool flag).
+//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
+//     owned.
+//
+// https://en.wikipedia.org/wiki/Copy-on-write
+template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
+class COWPtr {
+ public:
+  // Ctor from raw pointer.
+  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
 
-  T* MutableData() { return operator->(); }
+  // Move methods. Steal ownership from origin
+  COWPtr(COWPtr&& other)
+      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
+  COWPtr& operator=(COWPtr&& origin) = default;
 
-  const T& operator*() const { return *m_sp; }
-  T& operator*() {
-    detach();
-    return *m_sp;
+  // Copy methods. Not own payload
+  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
+  COWPtr& operator=(const COWPtr& other) {
+    payload_ = other.payload_;
+    ownership_.SetOwnership(false);
+    return *this;
   }
-  const T* operator->() const { return m_sp.operator->(); }
-  T* operator->() {
-    detach();
-    return m_sp.operator->();
+
+  // Access read only data.
+  const T& Data() const { return *payload_; }
+
+  // Access mutable data. If the data is not owned, the data will be copied
+  // before.
+  T* MutableData() {
+    ownership_.AcquireOwnershipOnce(
+        [this] { payload_.reset(new T(*payload_)); });
+    return payload_.get();
   }
+
+ private:
+  // Actual data pointer.
+  std::shared_ptr<T> payload_;
+
+  // Ownership flag.
+  OwnershipFlags ownership_;
 };
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
index 5b055d7cb4d127dc20f2cf70869134f24a93d429..d2142af277c0b356d83941b3baab1947cce31dac 100644
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,14 +30,6 @@ TEST(COWPtr, all) {
   ASSERT_EQ(ptr2.Data(), 10);
 }
 
-TEST(COWPtr, change_old) {
-  COWPtr<int> ptr(new int{0});
-  COWPtr<int> ptr2 = ptr;
-  *ptr.MutableData() = 10;
-  ASSERT_EQ(ptr2.Data(), 0);
-  ASSERT_EQ(ptr.Data(), 10);
-}
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
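A note on the semantics this rewrite introduces: a copy of a `COWPtr` shares the payload and marks itself non-owning, so the first `MutableData()` call on the copy clones the payload before writing, while writes through the still-owning original happen in place and remain visible to every copy. That asymmetry is exactly why the `change_old` test is deleted above: it asserted that mutating the original leaves copies untouched, which no longer holds. A minimal self-contained sketch of the new behavior (plain C++, compilable outside the Paddle tree; it mirrors the header in this diff but is not the Paddle code itself):

```cpp
#include <cassert>
#include <memory>

// Reduced model of the COWPtr in this diff: a copy is non-owning and
// clones the shared payload on its first mutable access.
template <typename T>
class CowPtr {
 public:
  explicit CowPtr(T* ptr) : payload_(ptr), owned_(true) {}
  CowPtr(const CowPtr& other) : payload_(other.payload_), owned_(false) {}

  const T& Data() const { return *payload_; }

  T* MutableData() {
    if (!owned_) {
      // Acquire ownership once: copy before the first write.
      payload_.reset(new T(*payload_));
      owned_ = true;
    }
    return payload_.get();
  }

 private:
  std::shared_ptr<T> payload_;
  bool owned_;
};

int main() {
  CowPtr<int> ptr(new int{0});
  CowPtr<int> ptr2 = ptr;     // shares the payload, does not own it
  *ptr2.MutableData() = 10;   // clone happens here, then the write
  assert(ptr.Data() == 0);    // the original is unaffected
  assert(ptr2.Data() == 10);

  *ptr.MutableData() = 5;     // the owner writes in place: no clone
  assert(ptr2.Data() == 10);  // ptr2 already has its own payload
  return 0;
}
```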
diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h
index 71db8d952f4c205b875ad254dc19c0c1f74e61b3..fc479a4c4a1e7d5c824d3c202e0cccf743dd52c9 100644
--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
@@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
@@ -46,17 +47,15 @@ class ReferenceCountOpHandle : public OpHandleBase {
                          const std::vector<std::string> &var_names,
                          GarbageCollector<Tensor> *gc,
                          AtomicReferenceCountMap *ref_cnts)
-      : OpHandleBase(node),
-        scope_(scope),
-        var_names_(var_names),
-        gc_(gc),
-        ref_cnts_(ref_cnts) {
+      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
     dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
     if (IsStreamGarabageCollector()) {
       PADDLE_ENFORCE(cudaSetDevice(place.device));
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
     }
+
+    for (auto &name : var_names) AddVar(name);
   }
 
   ~ReferenceCountOpHandle() {
@@ -69,19 +68,35 @@ class ReferenceCountOpHandle : public OpHandleBase {
 
   std::string Name() const override { return "reference_count"; }
 
+  void AddVar(const std::string &name) {
+    auto it = var_names_.find(name);
+    if (it != var_names_.end())
+      ++(it->second);
+    else
+      var_names_[name] = 1;
+  }
+
 protected:
  void RunImpl() override {
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    std::vector<LoDTensor *> tensors;
-    for (auto &name : var_names_) {
+    std::vector<Tensor *> tensors;
+    for (auto &pair : var_names_) {
+      auto &name = pair.first;
       auto it = ref_cnts_->find(name);
       if (it == ref_cnts_->end()) continue;
 
       auto *var = exec_scope->FindVar(name);
-      if (var == nullptr || !var->IsType<LoDTensor>()) continue;
-
-      if (it->second.fetch_sub(1) <= 1) {
-        tensors.emplace_back(var->GetMutable<LoDTensor>());
+      if (var == nullptr) continue;
+
+      if (var->IsType<LoDTensor>()) {
+        if (it->second.fetch_sub(pair.second) <= pair.second) {
+          tensors.emplace_back(var->GetMutable<LoDTensor>());
+        }
+      } else if (var->IsType<SelectedRows>()) {
+        if (it->second.fetch_sub(pair.second) <= pair.second) {
+          tensors.emplace_back(
+              var->GetMutable<SelectedRows>()->mutable_value());
+        }
       }
     }
 
@@ -91,7 +106,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
   }
 
  private:
-  void ClearTensors(const std::vector<LoDTensor *> &tensors) {
+  void ClearTensors(const std::vector<Tensor *> &tensors) {
     auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
     if (gc != nullptr) {
       auto compute_stream = dev_ctx_->stream();
@@ -112,7 +127,7 @@ class ReferenceCountOpHandle : public OpHandleBase {
 
   const Scope *scope_;
   platform::CUDADeviceContext *dev_ctx_;
-  std::vector<std::string> var_names_;
+  std::unordered_map<std::string, int> var_names_;
   GarbageCollector<Tensor> *gc_;       // not own
   AtomicReferenceCountMap *ref_cnts_;  // not own
   cudaEvent_t event_;
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 344754d5a1e119c04cae08ad50126924b5824315..b1ce551ce73de33bcede187c72feebad6e2fa1a5 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <queue>
 #include <string>
 #include <vector>
 
@@ -23,6 +24,25 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
+  std::queue<VarHandleBase *> queue;
+  queue.push(var_in);
+  do {
+    auto *var = queue.front();
+    queue.pop();
+    for (auto *op : var->PendingOps()) {
+      auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
+      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
+        return compute_op;
+      }
+      for (auto *out_var : op->Outputs()) {
+        queue.push(out_var);
+      }
+    }
+  } while (!queue.empty());
+  return nullptr;
+}
+
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
@@ -34,6 +54,9 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
   // Step 2: Find all variables in non-computation ops which refers to variables
   // in computation ops
   std::unordered_set<std::string> names;
+  std::unordered_map<ComputationOpHandle *,
+                     std::unique_ptr<ReferenceCountOpHandle>>
+      compute_ref_cnt_map;
 
   auto get_ref_cnts_from_compute_op = [&](
       const std::unique_ptr<OpHandleBase> &op,
      const std::vector<VarHandleBase *> &vars) {
@@ -54,15 +77,18 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       VarDesc *var_desc = var_handle->Node()->Var();
       auto var_name = var_handle->Node()->Name();
 
-      // This is wierd but there is really some variables without var_desc
+      // This is weird but there is really some variables without var_desc
       // in computation_op
       if (var_desc == nullptr) {
         if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
           continue;
       } else {
-        if (var_desc->Persistable() ||
-            var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR)
+        if (var_desc->Persistable()) continue;
+        auto var_type = var_desc->Proto()->type().type();
+        if (var_type != proto::VarType::LOD_TENSOR &&
+            var_type != proto::VarType::SELECTED_ROWS) {
           continue;
+        }
       }
 
       // compute op only runs in one device
@@ -93,12 +119,33 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
       if (ref_cnts.count(place.device) &&
           ref_cnts[place.device]->count(var_name)) {
         ++(*ref_cnts[place.device])[var_name];
+
+        auto *next_compute_op = FindNextComputationOpHandle(var_handle);
+        if (next_compute_op != nullptr) {
+          if (compute_ref_cnt_map.count(next_compute_op)) {
+            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
+            VLOG(5) << "Add reference count of " << var_name << " to Operator "
+                    << next_compute_op->Name();
+          } else {
+            // Create new reference_count_op_handle
+            ir::Node *ref_cnt_node = graph->CreateEmptyNode(
+                "reference_count", ir::Node::Type::kOperation);
+            auto *ref_cnt_handle = new ReferenceCountOpHandle(
+                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
+                gcs[place.device].get(), cur_ref_cnts[place.device].get());
+            if (next_compute_op->Outputs().empty()) {
+              auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+              next_compute_op->AddOutput(dep_var);
+              graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+            }
+            ref_cnt_handle->AddInput(next_compute_op->Outputs().front());
+            compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle);
+          }
+        }
       }
     }
   };
 
-  std::unordered_map<ComputationOpHandle *, ReferenceCountOpHandle *>
-      compute_ref_cnt_map;
-
   auto &all_ops = graph->Get<GraphOps>(kGraphOps);
   for (auto &op : all_ops) {
     auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
@@ -113,11 +160,13 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     auto *ref_cnt_handle = new ReferenceCountOpHandle(
         ref_cnt_node, compute_op->GetScope(), place, in_var_names,
         gcs[place.device].get(), cur_ref_cnts[place.device].get());
-    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-    compute_op->AddOutput(dep_var);
-    ref_cnt_handle->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
+    if (compute_op->Outputs().empty()) {
+      auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      compute_op->AddOutput(dep_var);
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+    }
+    ref_cnt_handle->AddInput(compute_op->Outputs().front());
+    compute_ref_cnt_map[compute_op].reset(ref_cnt_handle);
   }
 
   for (auto &op : all_ops) {
@@ -131,7 +180,11 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     new_all_ops.emplace_back(std::move(op));
     auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
     if (it != compute_ref_cnt_map.end()) {
-      new_all_ops.emplace_back(it->second);
+      // Add LeafNode to ReferenceCountOpHandle
+      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
+      it->second->AddOutput(dummy_leaf);
+      new_all_ops.emplace_back(std::move(it->second));
     }
   }
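The two reference-counting files above replace one-decrement-per-use with a per-op use count (accumulated by `AddVar`) that is subtracted in a single atomic step, using `fetch_sub(n) <= n` as the "this op was the last user" test. A standalone illustration of just that idiom, with no Paddle types involved (the function name is illustrative):

```cpp
#include <atomic>
#include <cassert>

// A consumer that uses a variable `uses` times decrements the shared
// counter once, by `uses`. fetch_sub returns the value *before* the
// subtraction, so observing old <= uses means this caller drove the
// count to zero (or below) and is responsible for releasing the memory.
bool ReleaseIfLastUser(std::atomic<int>* ref_cnt, int uses) {
  return ref_cnt->fetch_sub(uses) <= uses;
}

int main() {
  std::atomic<int> cnt{3};              // variable referenced 3 times total
  assert(!ReleaseIfLastUser(&cnt, 1));  // 3 -> 2: other users remain
  assert(ReleaseIfLastUser(&cnt, 2));   // 2 -> 0: last user frees here
  return 0;
}
```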
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index ba2c41eb8968e30bc8a801f4d8d239da8c522de9..7836ecb1272a07a79a70c9cb040335f9a42e5684 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,12 +17,10 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
-#include <mutex>
 #include <vector>
-#include "paddle/fluid/framework/details/cow_ptr.h"
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/memcpy.h"
 
 #include "glog/logging.h"
 
@@ -30,401 +28,206 @@ namespace paddle {
 namespace framework {
 
 #if defined(PADDLE_WITH_CUDA)
-namespace details {
-struct CUDABuffer {
-  void *data_{nullptr};
-  size_t size_{0};
-  platform::CUDAPlace place_;
-
-  CUDABuffer() {}
-  CUDABuffer(platform::Place place, size_t size)
-      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-    data_ = memory::Alloc(place_, size);
-  }
-
-  ~CUDABuffer() { ClearMemory(); }
-
-  CUDABuffer(const CUDABuffer &o) = delete;
-  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-
-  void Resize(platform::Place place, size_t size) {
-    ClearMemory();
-    place_ = boost::get<platform::CUDAPlace>(place);
-    data_ = memory::Alloc(place_, size);
-    size_ = size;
-  }
-
-  void Swap(CUDABuffer &o) {
-    std::swap(data_, o.data_);
-    std::swap(place_, o.place_);
-    std::swap(size_, o.size_);
-  }
-
- private:
-  void ClearMemory() const {
-    if (data_) {
-      memory::Free(place_, data_);
-    }
-  }
-};
-}  // namespace details
-
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
 class Vector {
  public:
   using value_type = T;
-  using iterator = typename std::vector<T>::iterator;
-  using const_iterator = typename std::vector<T>::const_iterator;
-
- private:
-  // The actual class to implement vector logic
-  class VectorData {
-   public:
-    VectorData() : flag_(kDataInCPU) {}
-    VectorData(size_t count, const T &value)
-        : cpu_(count, value), flag_(kDataInCPU) {}
-    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
-    template <typename U>
-    explicit VectorData(const std::vector<U> &dat)
-        : cpu_(dat), flag_(kDataInCPU) {}
-
-    VectorData(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-    }
-
-    VectorData &operator=(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-      details::CUDABuffer null;
-      gpu_.Swap(null);
-      return *this;
-    }
-
-    T &operator[](size_t i) {
-      MutableCPU();
-      return cpu_[i];
-    }
-
-    const T &operator[](size_t i) const {
-      ImmutableCPU();
-      return cpu_[i];
-    }
-
-    size_t size() const { return cpu_.size(); }
-
-    iterator begin() {
-      MutableCPU();
-      return cpu_.begin();
-    }
-
-    iterator end() {
-      MutableCPU();
-      return cpu_.end();
-    }
-
-    T &front() {
-      MutableCPU();
-      return cpu_.front();
-    }
-
-    T &back() {
-      MutableCPU();
-      return cpu_.back();
-    }
-
-    const_iterator begin() const {
-      ImmutableCPU();
-      return cpu_.begin();
-    }
-
-    const_iterator end() const {
-      ImmutableCPU();
-      return cpu_.end();
-    }
-
-    const T &back() const {
-      ImmutableCPU();
-      return cpu_.back();
-    }
-
-    T *data() { return &(*this)[0]; }
-
-    const T *data() const { return &(*this)[0]; }
-
-    const T &front() const {
-      ImmutableCPU();
-      return cpu_.front();
-    }
-
-    // assign this from iterator.
-    // NOTE: the iterator must support `end-begin`
-    template <typename Iter>
-    void assign(Iter begin, Iter end) {
-      MutableCPU();
-      cpu_.assign(begin, end);
-    }
-
-    // push_back. If the previous capacity is not enough, the memory will
-    // double.
-    void push_back(T elem) {
-      MutableCPU();
-      cpu_.push_back(elem);
-    }
-
-    // extend a vector by iterator.
-    // NOTE: the iterator must support end-begin
-    template <typename It>
-    void Extend(It begin, It end) {
-      MutableCPU();
-      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
-      std::copy(begin, end, out_it);
-    }
-
-    // resize the vector
-    void resize(size_t size) {
-      MutableCPU();
-      cpu_.resize(size);
-    }
-
-    // get cuda ptr. immutable
-    const T *CUDAData(platform::Place place) const {
-      PADDLE_ENFORCE(platform::is_gpu_place(place),
-                     "CUDA Data must on CUDA place");
-      ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_.data_);
-    }
-
-    // get cuda ptr. mutable
-    T *CUDAMutableData(platform::Place place) {
-      const T *ptr = CUDAData(place);
-      flag_ = kDirty | kDataInCUDA;
-      return const_cast<T *>(ptr);
-    }
-
-    // clear
-    void clear() {
-      cpu_.clear();
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    size_t capacity() const { return cpu_.capacity(); }
-
-    // reserve data
-    void reserve(size_t size) { cpu_.reserve(size); }
-
-    // implicit cast operator. Vector can be cast to std::vector implicitly.
-    operator std::vector<T>() const {
-      ImmutableCPU();
-      return cpu_;
-    }
-
-    bool operator==(const VectorData &other) const {
-      ImmutableCPU();
-      other.ImmutableCPU();
-      return cpu_ == other.cpu_;
-    }
-
-   private:
-    enum DataFlag {
-      kDataInCPU = 0x01,
-      kDataInCUDA = 0x02,
-      // kDirty means the data has been changed in one device.
-      kDirty = 0x10
-    };
-
-    void CopyToCPU() const {
-      // COPY GPU Data To CPU
-      void *src = gpu_.data_;
-      void *dst = cpu_.data();
-      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   nullptr);
-    }
-
-    void MutableCPU() {
-      if (IsInCUDA() && IsDirty()) {
-        CopyToCPU();
-      }
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    void ImmutableCUDA(platform::Place place) const {
-      if (IsDirty()) {
-        if (IsInCPU()) {
-          CopyCPUDataToCUDA(place);
-          UnsetFlag(kDirty);
-          SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() &&
-                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-          CopyCUDADataToAnotherPlace(place);
-          // Still dirty
-        } else {
-          // Dirty && DataInCUDA && Device is same
-          // Do nothing
-        }
-      } else {
-        if (!IsInCUDA()) {
-          // Even data is not dirty. However, data is not in CUDA. Copy data.
-          CopyCPUDataToCUDA(place);
-          SetFlag(kDataInCUDA);
-        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-          CopyCUDADataToAnotherPlace(place);
-        } else {
-          // Not Dirty && DataInCUDA && Device is same
-          // Do nothing.
-        }
-      }
-    }
-    void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
-      details::CUDABuffer tmp(place, gpu_.size_);
-      const void *src = gpu_.data_;
-      void *dst = tmp.data_;
-
-      memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
-      gpu_.Swap(tmp);
-    }
-    void CopyCPUDataToCUDA(const platform::Place &place) const {
-      void *src = cpu_.data();
-      gpu_.Resize(place, cpu_.size() * sizeof(T));
-      void *dst = gpu_.data_;
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        platform::DeviceContextPool::Instance().Get(place))
-                        ->stream();
-      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
-                   stream);
-    }
-
-    void ImmutableCPU() const {
-      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
-                                      // CPU has no data.
-        CopyToCPU();
-        UnsetFlag(kDirty);
-      }
-      SetFlag(kDataInCPU);
-    }
-
-    void UnsetFlag(int flag) const { flag_ &= ~flag; }
-    void SetFlag(int flag) const { flag_ |= flag; }
-
-    bool IsDirty() const { return flag_ & kDirty; }
-
-    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-    bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-    mutable std::vector<T> cpu_;
-    mutable details::CUDABuffer gpu_;
-    mutable int flag_;
-  };
-
- public:
   // Default ctor. Create empty Vector
-  Vector() : m_(new VectorData()) {}
+  Vector() { InitEmpty(); }
 
   // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T())
-      : m_(new VectorData(count, value)) {}
+  explicit Vector(size_t count, const T &value = T()) {
+    InitEmpty();
+    if (count != 0) {
+      resize(count);
+      T *ptr = begin();
+      for (size_t i = 0; i < count; ++i) {
+        ptr[i] = value;
+      }
+    }
+  }
 
   // Ctor with init_list
-  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
+  Vector(std::initializer_list<T> init) {
+    if (init.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(init.size(), init.begin(), init.end());
+    }
+  }
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
+    if (dat.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(dat.size(), dat.begin(), dat.end());
+    }
   }
 
   // Copy ctor
-  Vector(const Vector<T> &other) { m_ = other.m_; }
+  Vector(const Vector<T> &other) { this->operator=(other); }
 
   // Copy operator
   Vector<T> &operator=(const Vector<T> &other) {
-    m_ = other.m_;
+    if (other.size() != 0) {
+      this->InitByIter(other.size(), other.begin(), other.end());
+    } else {
+      InitEmpty();
+    }
     return *this;
   }
 
   // Move ctor
-  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
+  Vector(Vector<T> &&other) {
+    this->size_ = other.size_;
+    this->flag_ = other.flag_;
+    if (other.cuda_vec_.memory_size()) {
+      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
+    }
+    if (other.cpu_vec_.memory_size()) {
+      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
+    }
+  }
 
   // CPU data access method. Mutable.
-  T &operator[](size_t i) { return (*m_)[i]; }
+  T &operator[](size_t i) {
+    MutableCPU();
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
+  }
 
   // CPU data access method. Immutable.
-  const T &operator[](size_t i) const { return (*m_)[i]; }
+  const T &operator[](size_t i) const {
+    ImmutableCPU();
+    return cpu_vec_.data<T>()[i];
+  }
 
   // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return m_->size(); }
+  size_t size() const { return size_; }
 
-  iterator begin() { return m_->begin(); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  iterator end() { return m_->end(); }
+  T *end() {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
 
-  T &front() { return m_->front(); }
+  T &front() { return *begin(); }
 
-  T &back() { return m_->back(); }
+  T &back() {
+    auto it = end();
+    --it;
+    return *it;
+  }
 
-  const_iterator begin() const { return m_->begin(); }
+  const T *begin() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+  }
 
-  const_iterator end() const { return m_->end(); }
+  const T *end() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
 
-  const_iterator cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }
 
-  const_iterator cend() const { return end(); }
+  const T *cend() const { return end(); }
 
-  const T &back() const { return m_->back(); }
+  const T &back() const {
+    auto it = end();
+    --it;
+    return *it;
+  }
 
-  T *data() { return m_->data(); }
+  T *data() { return begin(); }
 
-  const T *data() const { return m_->data(); }
+  const T *data() const { return begin(); }
 
-  const T &front() const { return m_->front(); }
+  const T &front() const { return *begin(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
   // NOTE: the iterator must support `end-begin`
   template <typename Iter>
   void assign(Iter begin, Iter end) {
-    m_->assign(begin, end);
+    InitByIter(end - begin, begin, end);
   }
 
   // push_back. If the previous capacity is not enough, the memory will
   // double.
-  void push_back(T elem) { m_->push_back(elem); }
+  void push_back(T elem) {
+    if (size_ + 1 > capacity()) {
+      reserve((size_ + 1) << 1);
+    }
+    *end() = elem;
+    ++size_;
+  }
 
   // extend a vector by iterator.
   // NOTE: the iterator must support end-begin
   template <typename It>
   void Extend(It begin, It end) {
-    m_->Extend(begin, end);
+    size_t pre_size = size_;
+    resize(pre_size + (end - begin));
+    T *ptr = this->begin() + pre_size;
+    for (; begin < end; ++begin, ++ptr) {
+      *ptr = *begin;
+    }
   }
 
   // resize the vector
   void resize(size_t size) {
-    if (m_.Data().size() != size) {
-      m_->resize(size);
+    if (size + 1 <= capacity()) {
+      size_ = size;
+    } else {
+      MutableCPU();
+      Tensor cpu_tensor;
+      platform::Place cpu = platform::CPUPlace();
+      T *ptr = cpu_tensor.mutable_data<T>(
+          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+      const T *old_ptr =
+          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
+      if (old_ptr != nullptr) {
+        std::copy(old_ptr, old_ptr + size_, ptr);
+      }
+      size_ = size;
+      cpu_vec_.ShareDataWith(cpu_tensor);
     }
   }
 
   // get cuda ptr. immutable
   const T *CUDAData(platform::Place place) const {
-    return m_.Data().CUDAData(place);
+    PADDLE_ENFORCE(platform::is_gpu_place(place),
+                   "CUDA Data must on CUDA place");
+    ImmutableCUDA(place);
+    return cuda_vec_.data<T>();
   }
 
   // get cuda ptr. mutable
   T *CUDAMutableData(platform::Place place) {
-    return m_->CUDAMutableData(place);
+    const T *ptr = CUDAData(place);
+    flag_ = kDirty | kDataInCUDA;
+    return const_cast<T *>(ptr);
   }
 
   // clear
-  void clear() { m_->clear(); }
+  void clear() {
+    size_ = 0;
+    flag_ = kDirty | kDataInCPU;
+  }
 
-  size_t capacity() const { return m_->capacity(); }
+  size_t capacity() const {
+    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
+  }
 
   // reserve data
-  void reserve(size_t size) { m_->reserve(size); }
+  void reserve(size_t size) {
+    size_t pre_size = size_;
+    resize(size);
+    resize(pre_size);
+  }
 
   // the unify method to access CPU or CUDA data. immutable.
   const T *Data(platform::Place place) const {
@@ -445,7 +248,12 @@ class Vector {
   }
 
   // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const { return *m_; }
+  operator std::vector<T>() const {
+    std::vector<T> result;
+    result.resize(size());
+    std::copy(begin(), end(), result.begin());
+    return result;
+  }
 
   bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
@@ -459,11 +267,118 @@ class Vector {
     return true;
   }
 
-  const void *Handle() const { return &m_.Data(); }
-
  private:
-  // Vector is an COW object.
-  details::COWPtr<VectorData> m_;
+  void InitEmpty() {
+    size_ = 0;
+    flag_ = kDataInCPU;
+  }
+
+  template <typename Iter>
+  void InitByIter(size_t size, Iter begin, Iter end) {
+    platform::Place cpu = platform::CPUPlace();
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+    for (size_t i = 0; i < size; ++i) {
+      *ptr++ = *begin++;
+    }
+    flag_ = kDataInCPU | kDirty;
+    size_ = size;
+  }
+
+  enum DataFlag {
+    kDataInCPU = 0x01,
+    kDataInCUDA = 0x02,
+    // kDirty means the data has been changed in one device.
+    kDirty = 0x10
+  };
+
+  void CopyToCPU() const {
+    // COPY GPU Data To CPU
+    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+    WaitPlace(cuda_vec_.place());
+  }
+
+  void MutableCPU() {
+    if (IsInCUDA() && IsDirty()) {
+      CopyToCPU();
+    }
+    flag_ = kDirty | kDataInCPU;
+  }
+
+  void ImmutableCUDA(platform::Place place) const {
+    if (IsDirty()) {
+      if (IsInCPU()) {
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
+        WaitPlace(place);
+        UnsetFlag(kDirty);
+        SetFlag(kDataInCUDA);
+      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        cuda_vec_.ShareDataWith(tmp);
+        // Still dirty
+      } else {
+        // Dirty && DataInCUDA && Device is same
+        // Do nothing
+      }
+    } else {
+      if (!IsInCUDA()) {
+        // Even data is not dirty. However, data is not in CUDA. Copy data.
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
+        WaitPlace(place);
+        SetFlag(kDataInCUDA);
+      } else if (!(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        WaitPlace(cuda_vec_.place());
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        WaitPlace(place);
+        cuda_vec_.ShareDataWith(tmp);
+      } else {
+        // Not Dirty && DataInCUDA && Device is same
+        // Do nothing.
+      }
+    }
+  }
+
+  void ImmutableCPU() const {
+    if (IsDirty() &&
+        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
+      CopyToCPU();
+      UnsetFlag(kDirty);
+    }
+    SetFlag(kDataInCPU);
+  }
+
+  void UnsetFlag(int flag) const { flag_ &= ~flag; }
+  void SetFlag(int flag) const { flag_ |= flag; }
+
+  bool IsDirty() const { return flag_ & kDirty; }
+
+  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+  bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+  static void WaitPlace(const platform::Place place) {
+    if (platform::is_gpu_place(place)) {
+      platform::DeviceContextPool::Instance()
+          .Get(boost::get<platform::CUDAPlace>(place))
+          ->Wait();
+    }
+  }
+
+  static T &EmptyDummy() {
+    static T dummy = T();
+    return dummy;
+  }
+
+  mutable int flag_;
+  mutable Tensor cpu_vec_;
+  mutable Tensor cuda_vec_;
+  size_t size_;
 };
 
 #else  // PADDLE_WITH_CUDA
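The restored `Vector<T>` above keeps one CPU tensor, one CUDA tensor, and a flag word, and synchronizes lazily: a mutable CPU access marks the CPU copy dirty, the next device read copies it across, and a clean cached device copy is reused untouched. A compilable toy model of that flag protocol (the counter stands in for `TensorCopy`; this illustrates the state machine, it is not the Paddle implementation):

```cpp
#include <cassert>

// Minimal model of the dirty/location flags used by Vector<T> above.
enum DataFlag { kDataInCPU = 0x01, kDataInCUDA = 0x02, kDirty = 0x10 };

struct SyncedBuffer {
  int flag = kDataInCPU;
  int copies_to_gpu = 0;  // stands in for TensorCopy(cpu_vec_, gpu, ...)

  void MutableCPU() {
    // After a mutable CPU access, only the CPU copy is authoritative.
    flag = kDirty | kDataInCPU;
  }

  void ImmutableCUDA() {
    // Copy to the device only if the CPU side changed or the device never
    // had the data; otherwise reuse the cached device copy.
    if ((flag & kDirty) || !(flag & kDataInCUDA)) {
      ++copies_to_gpu;
      flag = (flag & ~kDirty) | kDataInCUDA | kDataInCPU;
    }
  }
};

int main() {
  SyncedBuffer buf;
  buf.MutableCPU();     // write on CPU
  buf.ImmutableCUDA();  // first device read: one copy
  buf.ImmutableCUDA();  // cached: no extra copy
  assert(buf.copies_to_gpu == 1);
  buf.MutableCPU();     // another CPU write invalidates the device copy
  buf.ImmutableCUDA();
  assert(buf.copies_to_gpu == 2);
  return 0;
}
```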
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index 5b27068c9e805146b8bce03f4f676ef0d4d16c53..4cb1f3a80e95bdda79e6451dc3cc87e899b11779 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
 #include <Eigen/Dense>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -306,26 +307,43 @@ class AdamOpKernel : public framework::OpKernel<T> {
         VLOG(3) << "grad row size is 0!!";
         return;
       }
-      // merge duplicated rows if any.
-      // The rows of grad_merge have been sorted inside MergeAdd functor
-      scatter::MergeAdd<DeviceContext, T> merge_func;
-      auto& grad_merge = *(ctx.scope()
-                               .NewScope()
-                               .Var("sparse_adam_grad_merge")
-                               ->GetMutable<framework::SelectedRows>());
-      merge_func(ctx.template device_context<DeviceContext>(), grad,
-                 &grad_merge);
+
+      std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
+      bool is_strict_sorted = true;
+      for (size_t i = 1; i < cpu_rows.size(); ++i) {
+        if (cpu_rows[i - 1] >= cpu_rows[i]) {
+          is_strict_sorted = false;
+          break;
+        }
+      }
+
+      const framework::SelectedRows* grad_merge_ptr;
+      if (is_strict_sorted) {
+        grad_merge_ptr = &grad;
+      } else {
+        // merge duplicated rows if any.
+        // The rows of grad_merge have been sorted inside MergeAdd functor
+        scatter::MergeAdd<DeviceContext, T> merge_func;
+        auto* grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
+                                   .Var()
+                                   ->GetMutable<framework::SelectedRows>();
+        merge_func(ctx.template device_context<DeviceContext>(), grad,
+                   grad_merge_var);
+        grad_merge_ptr = grad_merge_var;
+      }
+
+      auto& grad_merge = *grad_merge_ptr;
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
-      int64_t* rows = nullptr;
-// When compiled without CUDA, the CUDAMutableData() interface should not be
+      const int64_t* rows = nullptr;
+// When compiled without CUDA, the CUDAData() interface should not be
 // provided.
 #if defined(PADDLE_WITH_CUDA)
       if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
+        rows = grad_merge.rows().CUDAData(ctx.GetPlace());
       } else {
 #endif
-        rows = grad_merge.mutable_rows()->data();
+        rows = grad_merge.rows().data();
 #if defined(PADDLE_WITH_CUDA)
       }
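Context for the `AdamOpKernel` change above: `MergeAdd` exists to sort the gradient's rows and combine duplicate row indices before the sparse update, so when the rows are already strictly increasing there is nothing to merge and the kernel can use `grad` directly, skipping a scope allocation and a full copy. The loop in the diff is the standard strictly-increasing scan; an equivalent standalone form using only the standard library:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

// Rows are strictly sorted when every element is greater than its
// predecessor; that also rules out duplicate rows, so no merge is needed.
bool IsStrictlySorted(const std::vector<int64_t>& rows) {
  return std::adjacent_find(rows.begin(), rows.end(),
                            std::greater_equal<int64_t>()) == rows.end();
}

int main() {
  assert(IsStrictlySorted({1, 3, 7}));   // can be used as-is
  assert(!IsStrictlySorted({1, 3, 3}));  // duplicates: MergeAdd required
  assert(!IsStrictlySorted({3, 1, 7}));  // unsorted: MergeAdd required
  return 0;
}
```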
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index dd5d138a1e979826d59c4731920379b030e3b492..dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
     int class_num = ctx.Attr<int>("class_num");
 
-    auto& label_lod = in_label->lod();
-    auto& detect_lod = in_detect->lod();
+    auto label_lod = in_label->lod();
+    auto detect_lod = in_detect->lod();
     PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                       "Only support one level sequence now.");
     PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto labels = framework::EigenTensor<T, 2>::From(input_label);
     auto detect = framework::EigenTensor<T, 2>::From(input_detect);
 
-    auto& label_lod = input_label.lod();
-    auto& detect_lod = input_detect.lod();
+    auto label_lod = input_label.lod();
+    auto detect_lod = input_detect.lod();
 
     int batch_size = label_lod[0].size() - 1;
-    auto& label_index = label_lod[0];
+    auto label_index = label_lod[0];
 
     for (int n = 0; n < batch_size; ++n) {
       std::map<int, std::vector<Box>> boxes;
@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
 
     output_true_pos->set_lod(true_pos_lod);
     output_false_pos->set_lod(false_pos_lod);
+    return;
   }
 
   void GetInputPos(const framework::Tensor& input_pos_count,
@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto SetData = [](const framework::LoDTensor& pos_tensor,
                       std::map<int, std::vector<std::pair<T, int>>>& pos) {
       const T* pos_data = pos_tensor.data<T>();
-      auto& pos_data_lod = pos_tensor.lod()[0];
+      auto pos_data_lod = pos_tensor.lod()[0];
       for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
         for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
           T score = pos_data[j * 2];
@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
     int batch_size = gt_boxes.size();
     for (int n = 0; n < batch_size; ++n) {
-      auto& image_gt_boxes = gt_boxes[n];
-      for (auto& image_gt_box : image_gt_boxes) {
+      auto image_gt_boxes = gt_boxes[n];
+      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
         size_t count = 0;
-        auto& labeled_bboxes = image_gt_box.second;
+        auto labeled_bboxes = it->second;
         if (evaluate_difficult) {
           count = labeled_bboxes.size();
         } else {
-          for (auto& box : labeled_bboxes) {
-            if (!box.is_difficult) {
-              ++count;
-            }
-          }
+          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
+            if (!(labeled_bboxes[i].is_difficult)) ++count;
         }
         if (count == 0) {
           continue;
         }
-        int label = image_gt_box.first;
+        int label = it->first;
         if (label_pos_count->find(label) == label_pos_count->end()) {
           (*label_pos_count)[label] = count;
         } else {
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
index 3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7..9a297d03cfb041e584159a5fc5ba214f8ac404b4 100644
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
     auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
     auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
 
-    auto &in_rows = in.rows();
+    auto in_rows = in.rows();
     auto out_dim = framework::make_ddim(
         std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
     auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index ba8eccf82042b679f69a32f9d053f05ac8fb9a99..b27880c232a51d32777569cf9ac67656ce02f232 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     auto out_place = context.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(out_place));
 
-    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
-                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T), context.stream());
+    memory::Copy(
+        boost::get<platform::CUDAPlace>(out_place), out_data,
+        boost::get<platform::CUDAPlace>(in1_place), in1_data,
+        in1_value.numel() * sizeof(T),
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
 
     auto* in2_data = in2_value.data<T>();
     memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     auto in1_height = input1.height();
     PADDLE_ENFORCE_EQ(in1_height, input2->height());
 
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = *(input2->mutable_rows());
 
     auto& in1_value = input1.value();
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 2c4c2411259e9b7bfa223e4d65823ce4610596d0..6dffe527c1072ee97fcde1725bfc1a47ed1ad74a 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {
     out_value->Resize(framework::make_ddim(in_dim));
     out_value->mutable_data<T>(context.GetPlace());
+    // if all the input sparse vars are empty, no need to
     // merge these vars.
     if (first_dim == 0UL) {
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 7a7a0078a557c47492a4396897aafabe6c9c5dcb..a26b8df5a240be8340597b9627866c323fa98a2d 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -74,7 +74,7 @@ class Initializer(object):
     directly, but need to use one of its implementations.
""" - def __init_(self): + def __init__(self): pass def __call__(self, param, block): @@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer): assert loc is not None assert scale is not None assert seed is not None - super(NormalInitializer, self).__init__() + super(TruncatedNormalInitializer, self).__init__() self._mean = loc self._std_dev = scale self._seed = seed diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e99e8afd877a8ea342aac0ce30a9750d3d30942d..d558f2436441c6be8728ced6670b04cc367634a2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -107,6 +107,12 @@ __all__ = [ 'log', 'crop', 'rank_loss', + 'elu', + 'relu6', + 'pow', + 'stanh', + 'hard_sigmoid', + 'swish', 'prelu', 'brelu', 'leaky_relu', @@ -5898,6 +5904,148 @@ def pad2d(input, return out +@templatedoc() +def elu(x, alpha=1.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + alpha(${alpha_type}|1.0): ${alpha_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('elu', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='elu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}) + return out + + +@templatedoc() +def relu6(x, threshold=6.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + threshold(${threshold_type}|6.0): ${threshold_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('relu6', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + +@templatedoc() +def pow(x, factor=1.0, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + factor(${factor_type}|1.0): ${factor_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('pow', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='pow', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'factor': factor}) + return out + + +@templatedoc() +def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + scale_a(${scale_a_type}|2.0 / 3.0): ${scale_a_comment} + scale_b(${scale_b_type}|1.7159): ${scale_b_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + output(${out_type}): ${out_comment} + """ + helper = LayerHelper('stanh', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, + 'scale_b': scale_b}) + return out + + +@templatedoc() +def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): + """ + ${comment} + Args: + x(${x_type}): ${x_comment} + slope(${slope_type}|0.2): ${slope_comment} + offset(${offset_type}|0.5): ${offset_comment} + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. 
+
+    Returns:
+        output(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper('hard_sigmoid', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='hard_sigmoid',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'slope': slope,
+               'offset': offset})
+    return out
+
+
+@templatedoc()
+def swish(x, beta=1.0, name=None):
+    """
+    ${comment}
+    Args:
+        x(${x_type}): ${x_comment}
+        beta(${beta_type}|1.0): ${beta_comment}
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        output(${out_type}): ${out_comment}
+    """
+    helper = LayerHelper('swish', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='swish',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'beta': beta})
+    return out
+
+
 def prelu(x, mode, param_attr=None, name=None):
     """
     Equation:
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 6c9cbe740e8fdf8fb0a042943ce0b2008efdfc42..4d942170f21d8d3944c47adece85c8888abe5309 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -17,12 +17,6 @@ from .layer_function_generator import generate_layer_fn, generate_layer_fn_noatt
 
 __activations__ = [
     'softshrink',
-    'elu',
-    'relu6',
-    'pow',
-    'stanh',
-    'hard_sigmoid',
-    'swish',
 ]
 
 __activations_noattr__ = [
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index e97643cddef22465436051a41ef4b825e9634d23..b5549c507ed753f4504afd655be59b444164e6f3 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
                     actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                 "Output (" + out_name + ") has diff at " + str(place) +
                 "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                str(actual_t) + " in class " + self.__class__.__name__)
+                str(actual_t))
             if isinstance(expect, tuple):
                 self.assertListEqual(actual.recursive_sequence_lengths(),
                                      expect[1], "Output (" + out_name +
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index 0c5343a97d5ef0f97fc6b144dfc82174eacb8573..f6eb8f2c6d8b94f92e24ff789c91efb53a645a46 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -20,7 +20,6 @@ import six
 import sys
 import collections
 import math
-import paddle.fluid as fluid
 from op_test import OpTest
 
 
@@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest):
         self.detect = np.array(self.detect).astype('float32')
         self.mAP = np.array(self.mAP).astype('float32')
 
-        if len(self.class_pos_count) > 0:
+        if (len(self.class_pos_count) > 0):
             self.class_pos_count = np.array(self.class_pos_count).astype(
                 'int32')
             self.true_pos = np.array(self.true_pos).astype('float32')
@@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
 class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOpMultiBatch, self).init_test_case()
-        self.class_pos_count = [0, 2, 1, 0]
+        self.class_pos_count = [0, 2, 1]
         self.true_pos_lod = [[0, 3, 2]]
         self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
         self.false_pos_lod = [[0, 3, 2]]
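Finally, on the Python side: the six activations removed from the auto-generated `__activations__` list in ops.py become plain functions in nn.py, with the explicit signatures recorded in API.spec at the top of this diff. A hypothetical usage sketch (variable names and shapes are illustrative only):

```python
import paddle.fluid as fluid

# Each activation now takes documented keyword arguments with real
# defaults, instead of the generated *args/**kwargs wrappers.
x = fluid.layers.data(name='x', shape=[32], dtype='float32')

y = fluid.layers.elu(x, alpha=1.0)
y = fluid.layers.relu6(y, threshold=6.0)
y = fluid.layers.pow(y, factor=2.0)
y = fluid.layers.stanh(y, scale_a=2.0 / 3.0, scale_b=1.7159)
y = fluid.layers.hard_sigmoid(y, slope=0.2, offset=0.5)
y = fluid.layers.swish(y, beta=1.0)
```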