diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8035d93bfec75b20a54c5af0521ab724cafba8ca..9cc4233e43267472d405c3e4e617f0782e1430ea 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc)
-cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
     DEPS
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index c1ad60d16064753ef50c4d90bef4fcc88cb1a310..acfc0639736beb82df41b851664e7bcd079b5eb1 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -16,5 +16,8 @@ ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
 
-cc_library(device_context SRCS device_context.cc DEPS memory place eigen3 ${GPU_CTX_DEPS})
+# memcpy deoends on device_context, here add deps individually for
+# avoiding cycle dependencies
+cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
+    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index dc345bdd57ed2d7560a8027f70df3ef1d1cfbb97..f92c15ae450e94de44d27e77763e791e6bae4426 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -57,7 +57,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
   void* allocate(size_t num_bytes) const override {
-    paddle::memory::Alloc(place_, num_bytes);
+    return paddle::memory::Alloc(place_, num_bytes);
   }
 
   void deallocate(void* buffer) const override {
@@ -86,7 +86,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   GPUPlace place_;
   const cudaStream_t* stream_;         // not owned;
   const cudaDeviceProp* device_prop_;  // not owned;
-  mutable char* scratch_;
+  mutable void* scratch_;
   mutable unsigned int* semaphore_;
 };
 
@@ -145,7 +145,7 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
   if (!cudnn_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnnHandle_t, stream_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
   }
   return cudnn_handle_;
 }
@@ -160,7 +160,7 @@ curandGenerator_t CUDADeviceContext::curand_generator() {
     PADDLE_ENFORCE(
         dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
 
-    PADDLE_ENFORCE(dynload::curandSetStream(curandGenerator_t, stream_));
+    PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
   }
   return curand_generator_;
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index b68e177c0a62d8855c7ea2a40baa6ebef16183ff..c5042ae33e47e04521e59e0d91ddd8d4efffe50a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -52,6 +52,7 @@ class CPUDeviceContext : public DeviceContext {
 };
 
 #ifndef PADDLE_ONLY_CPU
+class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
  public:
@@ -92,7 +93,7 @@ class CUDADeviceContext : public DeviceContext {
   uint64_t seed_;
 
   // clang-format off
-  cudaStream_t       stream_{nullptr}
+  cudaStream_t       stream_{nullptr};
   cudnnHandle_t      cudnn_handle_{nullptr};
   cublasHandle_t     cublas_handle_{nullptr};
   curandGenerator_t  curand_generator_{nullptr};