diff --git a/cmake/copyfile.py b/cmake/copyfile.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ba4d95049dc76d1f6bd5bb67e116d5d3f4ea23b
--- /dev/null
+++ b/cmake/copyfile.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import shutil
+import glob
+
+
+def main():
+    src = sys.argv[1]
+    dst = sys.argv[2]
+    if os.path.isdir(src):  # copy directory
+        pathList = os.path.split(src)
+        dst = os.path.join(dst, pathList[-1])
+        if not os.path.exists(dst):
+            shutil.copytree(src, dst)
+            print("first copy directory: {0} --->>> {1}".format(src, dst))
+        else:
+            shutil.rmtree(dst)
+            shutil.copytree(src, dst)
+            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
+    else:  # copy file, wildcard
+        if not os.path.exists(dst):
+            os.makedirs(dst)
+        srcFiles = glob.glob(src)
+        for srcFile in srcFiles:
+            shutil.copy(srcFile, dst)
+            print("copy file: {0} --->>> {1}".format(srcFile, dst))
+
+
+if __name__ == "__main__":
+    main()
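Note on the script above: it takes exactly two positional arguments, source then destination. A directory source is copied recursively to dst/<basename>, with any existing copy removed first; any other source is treated as a glob pattern, since the Windows cmd shell does not expand wildcards itself. A minimal usage sketch follows (the paths are illustrative, not taken from this patch):

    import subprocess
    import sys

    # Directory source: recreated as <dst>/<basename(src)>; a stale copy is removed first.
    subprocess.check_call([sys.executable, "cmake/copyfile.py",
                           "third_party/install/mklml/include", "fluid_install_dir"])

    # Non-directory source: treated as a wildcard and expanded by glob.glob().
    subprocess.check_call([sys.executable, "cmake/copyfile.py",
                           "paddle/fluid/memory/*.h",
                           "fluid_install_dir/paddle/fluid/memory"])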
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index bd46bac0b36542082677a950e2afa747242e01df..c3877d2a4a4fee0dd08ca2ae35db0f6c9abb6d41 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -13,6 +13,14 @@
 # limitations under the License.
 
 # make package for paddle fluid shared and static library
+
+if(WIN32)
+    if(NOT PYTHON_EXECUTABLE)
+        FIND_PACKAGE(PythonInterp REQUIRED)
+    endif()
+endif()
+
+set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
 function(copy TARGET)
     set(options "")
     set(oneValueArgs "")
@@ -26,42 +34,16 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif ()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach (index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)
-            if(IS_DIRECTORY ${src})
-                get_filename_component(last_path ${src} NAME)
-                string(APPEND dst "/" ${last_path})
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                        )
-                if(EXISTS ${src})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                            COMMAND cmake -E copy_directory "${src}" "${dst}"
-                            COMMENT "copying ${src} -> ${dst}")
-                else()
-                    message(WARNING "${src} not exist!")
-                endif()
-            else()
-                # windows cmd shell will not expand wildcard automatically.
-                # below expand the files, and copy them by rules.
-                file(GLOB src_files ${src})
-                if (NOT "${src_files}" STREQUAL "")
-                    list(REMOVE_DUPLICATES src_files)
-                endif ()
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                        )
-                foreach (src_file ${src_files})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                            COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
-                            COMMENT "copying ${src_file} -> ${dst}")
-                endforeach ()
-            endif()
-        else (WIN32)  # not windows
+        if (WIN32)  #windows
+            file(TO_NATIVE_PATH ${src} native_src)
+            file(TO_NATIVE_PATH ${dst} native_dst)
+            add_custom_command(TARGET ${TARGET} POST_BUILD
+                    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
+        else (WIN32)  #not windows
             add_custom_command(TARGET ${TARGET} PRE_BUILD
                     COMMAND mkdir -p "${dst}"
                     COMMAND cp -r "${src}" "${dst}"
@@ -189,13 +171,12 @@ copy(zlib_lib
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
-if (NOT WIN32)
-    set(framework_lib_deps framework_py_proto)
-endif (NOT WIN32)
+set(framework_lib_deps framework_py_proto)
+
 copy(framework_lib DEPS ${framework_lib_deps}
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        ${src_dir}/${module}/ir/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
+        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
+        ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
+        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet
 )
 
 set(module "memory")
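Note on the framework_lib change above: copy() pairs SRCS and DSTS strictly by index, which is why each newly added SRCS glob (data_feed.pb.h, ir/memory_optimize_pass/*.h, fleet/*.h) comes with a matching DSTS entry. A small Python sketch of that contract, with illustrative paths rather than the full lists from the patch:

    # copy()'s contract in miniature: SRCS and DSTS are parallel lists paired
    # by index; the CMake function stops with FATAL_ERROR if the lengths differ.
    srcs = ["framework/*.h", "framework/details/*.h", "framework/fleet/*.h"]
    dsts = ["out/framework", "out/framework/details", "out/framework/fleet"]
    assert len(srcs) == len(dsts), "source numbers are not equal to destination numbers"
    for src, dst in zip(srcs, dsts):
        print("copy {0} --->>> {1}".format(src, dst))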
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s", + error_code, CudaErrorWebsite()); return device_prop.major * 10 + device_prop.minor; } @@ -143,20 +155,25 @@ bool TensorCoreAvailable() { int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id), - "cudaDeviceGetAttribute failed in " - "paddle::platform::GetCUDAMultiProcessors"); + auto error_code = + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id); + PADDLE_ENFORCE(error_code, + "cudaDeviceGetAttribute failed in " + "paddle::platform::GetCUDAMultiProcess, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } int GetCUDAMaxThreadsPerMultiProcessor(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id), - "cudaDeviceGetAttribute failed in " - "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor"); + auto error_code = cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id); + PADDLE_ENFORCE( + error_code, + "cudaDeviceGetAttribute failed in paddle::" + "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } @@ -266,37 +283,50 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), + auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); + PADDLE_ENFORCE(error_code, "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " - "(%p -> %p, length: %d)", - src, dst, static_cast(count)); + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { - PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " - "%p, length: %d)", - src, dst, static_cast(count)); + auto error_code = cudaMemcpy(dst, src, count, kind); + PADDLE_ENFORCE(error_code, + "cudaMemcpy failed in paddle::platform::GpuMemcpySync " + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { + auto error_code = + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); PADDLE_ENFORCE( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), - "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync"); + error_code, + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE( - cudaMemcpyPeer(dst, dst_device, src, src_device, count), - "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync"); + auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); + PADDLE_ENFORCE(error_code, + "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t 
@@ -266,37 +283,50 @@ size_t GpuMaxChunkSize() {
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
-  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+  auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
+  PADDLE_ENFORCE(error_code,
                  "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
-                 "(%p -> %p, length: %d)",
-                 src, dst, static_cast<int>(count));
+                 "(%p -> %p, length: %d) error code : %d, %s",
+                 src, dst, static_cast<int>(count), error_code,
+                 CudaErrorWebsite());
 }
 
 void GpuMemcpySync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind) {
-  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
-                 "%p, length: %d)",
-                 src, dst, static_cast<int>(count));
+  auto error_code = cudaMemcpy(dst, src, count, kind);
+  PADDLE_ENFORCE(error_code,
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync "
+                 "(%p -> %p, length: %d) error code : %d, %s",
+                 src, dst, static_cast<int>(count), error_code,
+                 CudaErrorWebsite());
 }
 
 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                         int src_device, size_t count, cudaStream_t stream) {
+  auto error_code =
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
   PADDLE_ENFORCE(
-      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
-      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync");
+      error_code,
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
+      "error code : %d, %s",
+      error_code, CudaErrorWebsite());
 }
 
 void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count) {
-  PADDLE_ENFORCE(
-      cudaMemcpyPeer(dst, dst_device, src, src_device, count),
-      "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync");
+  auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
+  PADDLE_ENFORCE(error_code,
+                 "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
+                 "error code : %d, %s",
+                 error_code, CudaErrorWebsite());
 }
 
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
-  PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
-                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
+  auto error_code = cudaMemsetAsync(dst, value, count, stream);
+  PADDLE_ENFORCE(error_code,
+                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
+                 "error code : %d, %s",
+                 error_code, CudaErrorWebsite());
 }
 }  // namespace platform
 }  // namespace paddle
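Every change in this file follows the same pattern: store the cudaError_t returned by the runtime call in error_code, hand it to PADDLE_ENFORCE, and append the numeric code plus the CudaErrorWebsite() link to the message so failures can be looked up in the CUDA documentation. Below is a standalone sketch of that reporting style, written in Python with ctypes purely for illustration (the libcudart.so name and its availability are assumptions, not part of the patch):

    import ctypes

    CUDA_ERROR_WEBSITE = (
        "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
        "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
        "6db0a94a430e0038")

    # Assumes a Linux machine with the CUDA runtime installed.
    cudart = ctypes.CDLL("libcudart.so")
    count = ctypes.c_int()
    error_code = cudart.cudaGetDeviceCount(ctypes.byref(count))  # cudaError_t as int
    if error_code != 0:  # 0 == cudaSuccess; nonzero means the call failed
        raise RuntimeError("cudaGetDeviceCount failed, error code : %d, %s"
                           % (error_code, CUDA_ERROR_WEBSITE))
    print("device count:", count.value)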