diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 35e69dcb20411b77d3b24edf0e9d96bf8cbf1aa2..ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index d0ab640485baf6d76ee629ea420b603f42b031b4..be2b301619639106ac7b578e5a79cf33f4379e48 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -48,12 +48,26 @@ namespace framework {
  */
 struct LoD : public std::vector<Vector<size_t>> {
   using std::vector<Vector<size_t>>::vector;
+  platform::Place place() const {
+    if (this->size() == 0) {
+      // Not initialized yet.
+      return platform::CPUPlace();
+    } else {
+      return this->front().place();
+    }
+  }
 
   void CopyFromCUDA() {
     for (auto it = this->begin(); it != this->end(); ++it) {
       it->CopyFromCUDA();
     }
   }
+
+  void CopyToPeer(platform::Place place) {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyToPeer(place);
+    }
+  }
 };
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index d4c9f00bd9c00f3cae68858ca46c5320fc117405..adea02e3b3fdcf4873de76ff91116f43ac9fe259 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
-TEST(Vector, Normal) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::memory;
-
-  paddle::framework::InitDevices();
-
-  paddle::framework::Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
-  }
-
-  vec.clear();
-  vec.CopyFromCUDA();
-
-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
-  }
-}
-
 TEST(LoD, data) {
   paddle::framework::InitDevices();
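
Note (not part of the patch): the lod_tensor.h hunk adds two LoD-level helpers. `place()` reports where the offset vectors currently live, defaulting to `CPUPlace()` while the LoD is still empty, and `CopyToPeer()` re-homes every level onto a target place. A minimal sketch of how they compose, assuming a CUDA build with the device context pool initialized; `MoveLoDToDevice` is a hypothetical helper, not part of the patch:

    #include "paddle/framework/lod_tensor.h"

    // Hypothetical helper: move all LoD offset vectors onto GPU 0.
    void MoveLoDToDevice(paddle::framework::LoD* lod) {
      // An empty LoD reports CPUPlace() until its first level is filled in.
      if (lod->size() == 0) return;
      // Re-home each level's offset vector onto the target device.
      lod->CopyToPeer(paddle::platform::CUDAPlace(0));
    }
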
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
index 422fbbac488abff846d0d79e393ecef5400de9d2..5202775515d335ff81bb17e6ce21338c40041ca3 100644
--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -40,20 +40,21 @@ class Vector : public std::vector<T> {
   Vector() {}
   Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
 
-  virtual ~Vector() {
-#ifdef PADDLE_WITH_CUDA
-    if (cuda_ptr_ != nullptr) {
-      memory::Free(place_, cuda_ptr_);
-    }
-#endif
-  }
+  inline platform::Place place() const { return place_; }
+
+  /*! Return a pointer to constant memory block. */
+  inline const T *data(platform::Place place) const;
+
+  /*! Return a pointer to mutable memory block. */
+  inline T *mutable_data(platform::Place place);
+
+  // TODO(dzhwinter): below interfaces should be removed
   /* Get device vector */
   T *cuda_data() {
     CopyToCUDA();
     PADDLE_ENFORCE_NOT_NULL(
         cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
-    return static_cast<T *>(cuda_ptr_);
+    return static_cast<T *>(cuda_ptr_.get());
   }
 
   /* Get host vector */
@@ -76,25 +77,73 @@ class Vector : public std::vector<T> {
   void CopyToPeer(platform::Place);
 
  private:
-  void *cuda_ptr_ = nullptr;
+  std::shared_ptr<void> cuda_ptr_;
   size_t cuda_size_ = 0;  // device vector numel
   platform::CUDAPlace place_;
 };
 
 template <typename T>
-void Vector<T>::CopyToCUDA() {
+inline const T *Vector<T>::data(platform::Place place) const {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (cuda_ptr_ == nullptr) {
+      return nullptr;
+    }
+    if (boost::get<platform::CUDAPlace>(place) == place_) {
+      return static_cast<const T *>(cuda_ptr_.get());
+    } else {
+      PADDLE_THROW(
+          "Unmatched place. Please use `mutable_data` to copy data to the "
+          "target place first.");
+    }
+  } else {
+    PADDLE_THROW("Unsupported place.");
+  }
+}
+
+template <typename T>
+inline T *Vector<T>::mutable_data(platform::Place place) {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (boost::get<platform::CUDAPlace>(place) != place_) {
+      place_ = boost::get<platform::CUDAPlace>(place);
+    }
 #ifdef PADDLE_WITH_CUDA
-  if (cuda_size_ < this->size()) {
-    if (cuda_ptr_ != nullptr) {
-      memory::Free(place_, cuda_ptr_);
+    if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+      cuda_ptr_.reset(
+          memory::Alloc(place_, this->size() * sizeof(T)),
+          memory::PlainDeleter<void, platform::CUDAPlace>(place_));
     }
-    cuda_ptr_ =
-        memory::Alloc(place_, this->size() * sizeof(T));
+    cuda_size_ = this->size();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *ctx = pool.GetByPlace(place_);
+    memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+                 static_cast<const void *>(this->data()),
+                 this->size() * sizeof(T), ctx->stream());
+    ctx->Wait();
+    return static_cast<T *>(cuda_ptr_.get());
+#else
+    return nullptr;
+#endif
+  } else {
+    PADDLE_THROW("Unsupported place.");
+  }
+}
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
   }
   cuda_size_ = this->size();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
                static_cast<const void *>(this->data()),
                this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
@@ -112,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
   memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               ctx->stream());
+               static_cast<const void *>(cuda_ptr_.get()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
 #endif
 }
 
 template <typename T>
-void Vector<T>::CopyToPeer(platform::Place peer_place) {
+void Vector<T>::CopyToPeer(platform::Place place) {
 #ifdef PADDLE_WITH_CUDA
-  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr = memory::Alloc(
-      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
-               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+  if (boost::get<platform::CUDAPlace>(place) != place_) {
+    place_ = boost::get<platform::CUDAPlace>(place);
+  }
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
-
-  memory::Free(place_, cuda_ptr_);
-  place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr;
 #endif
 }
 
-template class Vector<int>;
-template class Vector<unsigned>;
-template class Vector<size_t>;
-template class Vector<int64_t>;
-
 }  // namespace framework
 }  // namespace paddle
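
Note (not part of the patch): the new `data(place)` / `mutable_data(place)` pair replaces the destructor-managed raw `cuda_ptr_`. `mutable_data` allocates or grows the device buffer on demand, syncs the host contents over, and returns a writable pointer, while `data` only hands back an already-cached pointer for a matching place. Because the buffer is now held in a `std::shared_ptr<void>`, copies of a `Vector` share one device block and the last owner frees it. A usage sketch under those semantics, assuming a CUDA build and device 0:

    paddle::framework::Vector<size_t> offsets({0, 2, 5});
    paddle::platform::CUDAPlace gpu(0);

    // Allocates on demand, copies host -> device, returns a device pointer.
    size_t* d_offsets = offsets.mutable_data(gpu);

    // Read-only: returns the cached pointer for a matching place and throws
    // if the data has never been copied to that place.
    const size_t* h_offsets = offsets.data(paddle::platform::CPUPlace());
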
diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b571788ad1ade50e05dc9a70cba35b83f8db3ea
--- /dev/null
+++ b/paddle/framework/mixed_vector_test.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/mixed_vector.h"
+
+using namespace paddle::framework;
+using namespace paddle::platform;
+using namespace paddle::memory;
+
+template <typename T>
+__global__ void test(T* data, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    data[i] *= 2;
+  }
+}
+
+TEST(Vector, Normal) {
+  // fill the device context pool.
+  InitDevices();
+
+  Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(Vector, MultipleCopy) {
+  InitDevices();
+  Vector<size_t> vec({1, 2, 3});
+  CUDAPlace place(0);
+  vec.mutable_data(place);
+  auto vec2 = Vector<size_t>(vec);
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i]);
+    }
+  }
+  test<<<3, 3>>>(vec2.mutable_data(place), vec2.size());
+  vec2.CopyFromCUDA();
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i] * 2);
+    }
+  }
+}
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 7012b6d331d0c4631a3d120fbaf3db7c97298ac7..30ed68c6e0ea95d206658d16684800e169ededc5 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -81,5 +81,23 @@ class PODDeleter {
   Place place_;
 };
 
+/**
+ * \brief Free a memory block in one place; unlike PODDeleter, T is not
+ *        required to be a POD type.
+ *
+ * \note Used as a custom deleter so that smart pointers can deallocate
+ *       memory automatically, e.g. the std::shared_ptr<void> in
+ *       mixed_vector.h (cf. PODDeleter for std::unique_ptr<T> in tensor.h).
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
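
Note (not part of the patch): PlainDeleter is the non-POD counterpart of PODDeleter; it simply frees the block in its place, with no `is_pod` check, which is what lets mixed_vector.h hand ownership of a raw `Alloc` result to a `std::shared_ptr<void>`. A standalone sketch of that pairing, with a hypothetical block size and a CUDA build assumed:

    paddle::platform::CUDAPlace gpu(0);
    std::shared_ptr<void> block(
        paddle::memory::Alloc(gpu, 256 * sizeof(float)),
        paddle::memory::PlainDeleter<void, paddle::platform::CUDAPlace>(gpu));
    // memory::Free(gpu, ptr) runs automatically once the last owner drops block.
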
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index dfff6f0888a5258dfbd68d34ef83ca8306aeed2a..89045923f9ff2f33bc112b199c493047440e15c4 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
   if (src.IsType<LoDTensor>()) {
     if (src.Get<LoDTensor>().place() == dst_place) {
       dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+      framework::LoD lod(src.Get<LoDTensor>().lod());
+      lod.CopyToPeer(dst_place);
+      dst->GetMutable<LoDTensor>()->set_lod(lod);
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
     auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
     dst_sr->set_height(src_sr.height());
     if (src_sr.value().place() == dst_place) {
       dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+      framework::Vector<int64_t> lod(src_sr.rows());
+      lod.CopyToPeer(dst_place);
+      dst_sr->set_rows(lod);
     }
   } else {
     PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
         framework::Copy(src, place, dst);
+        framework::LoD lod(src.lod());
+        lod.CopyToPeer(place);
+        dst->set_lod(lod);
       }
     }
     WaitOnPlaces(places);
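
Note (not part of the patch): the parallel_do_op.cc changes close a correctness gap. When CopyOrShare copies a tensor to a different place, the LoD (or the SelectedRows row index) still refers to buffers on the source place, so it has to be re-homed before being attached to the destination. A simplified restatement of the LoDTensor branch, with `src` as a `const framework::LoDTensor&` and `dst` as a `framework::LoDTensor*`:

    if (src.place() == dst_place) {
      dst->ShareDataWith(src);   // same place: aliasing data and LoD is safe
      dst->set_lod(src.lod());
    } else {
      framework::Copy(src, dst_place, dst);  // cross-place data copy
      framework::LoD lod(src.lod());
      lod.CopyToPeer(dst_place);             // re-home offsets before attaching
      dst->set_lod(lod);
    }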