Merge pull request #10906 from chengduoZH/fix_data_trans

Fix DataTransFunc

Merge pull request #10906 from chengduoZH/fix_data_trans
Fix DataTransFunc
8c54f1fb · chengduo · GitHub · 7d1332f6 · 17a076d8 · 8c54f1fb
隐藏空白更改
内联并排

Showing 1 changed file with 16 additions and 22 deletions

paddle/fluid/framework/data_device_transform.cc +16 -22

未找到文件。
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -16,31 +16,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-static const platform::DeviceContext* GetDeviceContext(
-    const platform::Place& src_place, const platform::Place& dst_place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    return pool.Get(src_place);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    return pool.Get(dst_place);
-  } else {
-    PADDLE_THROW(
-        "Currently, model parallelism is only supported between CPU and CUDA");
-  }
-}
-
-void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
-                     Tensor* out) {
+void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
+                     Tensor *out) {
  VLOG(3) << "DeviceTransform in, src_place " << in.place()
          << " dst_place: " << dst_place;
-  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);

-  TensorCopy(in, dst_place, *dev_ctx, out);
-  if (platform::is_gpu_place(in.place()) && platform::is_cpu_place(dst_place)) {
-    dev_ctx->Wait();
-  }
+  PADDLE_ENFORCE_NE(
+      in.place().which(), dst_place.which(),
+      "Currently, model parallelism is only supported between CPU and CUDA");
+
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
+  // the enforced checks have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is small, for example,
+  // the learning rate has only one element and it is on the CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transforming is from CPU to GPU and the number of elements is small.
+  // But the embarrassment is that this solution makes training
+  // slower.
+  TensorCopySync(in, dst_place, out);
 }

 }  // namespace framework