update

fa613206 · chengduoZH · 4bfadcd1 · fa613206
隐藏空白更改
内联并排

Showing with 10 additions and 4 deletions

paddle/fluid/framework/data_device_transform.cc paddle/fluid/framework/data_device_transform.cc +10 -4

未找到文件。
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -37,11 +37,17 @@ void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
          << " dst_place: " << dst_place;
  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
+  // the enforced checks have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is small, for example,
+  // the learning rate has only one element and it is on the CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transform is from CPU to GPU and the number of elements is small.
+  // But the embarrassment is that this solution makes training
+  // slower.
  TensorCopy(in, dst_place, *dev_ctx, out);
+  dev_ctx->Wait();
-  if (in.place().which() != dst_place.which()) {
-    dev_ctx->Wait();
-  }
 }
 }  // namespace framework