diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index df3d57d629184d28fd42130df9b020a7b52ade72..78443cc35a400bceac77b99c3468daf16d8a4690 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
+
+#include <cstring>  // for memcpy
 
 namespace paddle {
 namespace memory {
@@ -45,6 +46,13 @@ size_t Used(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
@@ -77,6 +85,41 @@ size_t Used(platform::GPUPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index d0a64869cc3bcfa58b7e125e5d5699e6753396bf..7ef7a73bc8b25e6a637a5e89c87e3eef06174b92 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,20 +14,30 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-template <class Place>
+template <typename Place>
 void* Alloc(Place, size_t);
 
-template <class Place>
+template <typename Place>
 void Free(Place, void*);
 
-template <class Place>
+template <typename Place>
 size_t Used(Place);
 
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
+
 template <typename T, /* must be POD types */
           typename Place /* platform::GPUPlace or platform::CPUPlace */,
           typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 5d440dec48e7a4cba404bc297eca5a451a144d93..b06ab8a2f184e7bb7dd9cb39f377b087c5258dc4 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -43,10 +43,26 @@ namespace platform {
 // For more details, please check https://stackoverflow.com/a/43870188/724872.
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 
+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    int stat, const Args&... args) {
+  if (UNLIKELY(!(stat))) {
+    throw std::runtime_error(
+        string::Sprintf(args...) +
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+  }
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 template <typename... Args>
-inline void throw_on_error(cudaError_t e, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudaError_t e, const Args&... args) {
   if (UNLIKELY(e)) {
     // clang-format off
     throw thrust::system_error(
@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(curandStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    curandStatus_t stat, const Args&... args) {
   if (stat != CURAND_STATUS_SUCCESS) {
     // clang-format off
     throw thrust::system_error(
@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudnnStatus_t stat, const Args&... args) {
   if (stat == CUDNN_STATUS_SUCCESS) {
     return;
   } else {
@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cublasStatus_t stat, const Args&... args) {
   std::string err;
   if (stat == CUBLAS_STATUS_SUCCESS) {
     return;
@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
 
 #endif  // PADDLE_ONLY_CPU
 
-template <typename... Args>
-inline void throw_on_error(int stat, const Args&... args) {
-  if (UNLIKELY(!(stat))) {
-    throw std::runtime_error(
-        string::Sprintf(args...) +
-        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
-  }
-}
-
 #define PADDLE_THROW(...)                                    \
   do {                                                       \
     throw std::runtime_error(                                \
@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
         string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
   } while (0)
 
-/**
- * @brief Enforce a condition, otherwise throw an EnforceNotMet
- */
-#define PADDLE_ENFORCE(condition, ...)                              \
-  do {                                                              \
-    ::paddle::platform::throw_on_error(condition, __VA_ARGS__);     \
+#define PADDLE_ENFORCE(...)                               \
+  do {                                                    \
+    ::paddle::platform::throw_on_error(__VA_ARGS__);      \
   } while (0)
 
 }  // namespace platform
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index cf9921e870d47fe77c0cca80828dbf2bb36ccda8..edeb3ecd7bf8b87333813eee5b40f71030f6609f 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
 
-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
   PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
   return usable;
 }
 
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(
+      cudaStreamSynchronize(0),
+      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+}
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 79e71956bd32e8c253ac4192a04e5903bed1c94a..d3a5f5f13fdd3dd59eb43465da4a64b0d8d95e5b 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 
 //!Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle
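
For reference, a minimal usage sketch of the memory::Copy and PADDLE_ENFORCE interfaces introduced by this patch (not part of the patch itself; the function name CopyExample is hypothetical, and it assumes platform::GPUPlace is constructible from a device id as in paddle/platform/place.h):

// Sketch only: exercises the host-to-host and host-to-device Copy
// specializations added in paddle/memory/memory.cc.
#include "paddle/memory/memory.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"

void CopyExample() {
  paddle::platform::CPUPlace cpu;
  const size_t num = 256;

  // Host-to-host copy dispatches to the std::memcpy specialization.
  void* src = paddle::memory::Alloc(cpu, num);
  void* dst = paddle::memory::Alloc(cpu, num);
  paddle::memory::Copy(cpu, dst, cpu, src, num);

#ifndef PADDLE_ONLY_CPU
  // Host-to-device copy goes through GpuMemcpyAsync on a caller-owned stream,
  // so the caller must synchronize before reusing or freeing the buffers.
  paddle::platform::GPUPlace gpu(0);
  void* gpu_dst = paddle::memory::Alloc(gpu, num);

  cudaStream_t stream;
  PADDLE_ENFORCE(cudaStreamCreate(&stream),
                 "cudaStreamCreate failed in CopyExample");
  paddle::memory::Copy(gpu, gpu_dst, cpu, src, num, stream);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream),
                 "cudaStreamSynchronize failed in CopyExample");
  PADDLE_ENFORCE(cudaStreamDestroy(stream),
                 "cudaStreamDestroy failed in CopyExample");

  paddle::memory::Free(gpu, gpu_dst);
#endif

  paddle::memory::Free(cpu, dst);
  paddle::memory::Free(cpu, src);
}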