From fa613206e4f9ed7c2d5216afe15fecab80381e4a Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Fri, 25 May 2018 14:50:21 +0800
Subject: [PATCH] update

---
 paddle/fluid/framework/data_device_transform.cc | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 0cd2ebcd41d..4089458a33f 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -37,11 +37,17 @@ void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
           << " dst_place: " << dst_place;
   auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
 
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU,
+  // and the enforced checks have already been done in GetDeviceContext, so
+  // the `dev_ctx->Wait()` here is necessary. But `dev_ctx->Wait()` slows
+  // the program down, especially when the number of elements is small; for
+  // example, the learning rate tensor holds a single element and lives on
+  // the CPU side. One solution is to use a CUDA kernel to complete the
+  // copy operation when the transform is from CPU to GPU and the number
+  // of elements is small, but unfortunately this solution makes training
+  // slower.
   TensorCopy(in, dst_place, *dev_ctx, out);
-
-  if (in.place().which() != dst_place.which()) {
-    dev_ctx->Wait();
-  }
+  dev_ctx->Wait();
 }
 
 }  // namespace framework
-- 
GitLab
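
For context, a minimal standalone CUDA sketch of why the unconditional
`dev_ctx->Wait()` above is needed. This is illustrative code, not Paddle's
actual TensorCopy internals; the file name and variables are hypothetical.
An asynchronous device-to-host copy only enqueues work on a stream and
returns immediately, so the host must synchronize before reading the
destination buffer, which is what the wait after TensorCopy guarantees
for the cross-device case.

// sync_after_async_copy.cu: standalone illustration, not Paddle code.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  const int n = 1;  // a tiny tensor, e.g. a one-element learning rate
  std::vector<float> src(n, 0.001f);

  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemcpy(d_data, src.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  float* h_out = nullptr;
  cudaMallocHost(&h_out, n * sizeof(float));  // pinned, required for async

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // cudaMemcpyAsync only enqueues the copy and returns immediately;
  // h_out may still hold stale data at this point.
  cudaMemcpyAsync(h_out, d_data, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);

  // The analogue of dev_ctx->Wait(): without it, reading h_out below
  // races against the in-flight copy.
  cudaStreamSynchronize(stream);

  printf("value on host: %f\n", h_out[0]);

  cudaStreamDestroy(stream);
  cudaFreeHost(h_out);
  cudaFree(d_data);
  return 0;
}

Note the design trade-off visible in the diff: the patch removes the
`which()` comparison that previously skipped the wait for same-device
copies, accepting some slowdown (as the FIXME acknowledges) in exchange for
the guarantee that the copied data is visible once TransDataDevice returns.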