Stash

0ef9edf5 · Yu Yang · 5e87cd75 · 0ef9edf5 · 0ef9edf5
2 changed files
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -229,8 +229,15 @@ class ParallelExecutorPrivate {
 // TODO(yy): Move this function somewhere
 ncclDataType_t ToNCCLDataType(std::type_index type) {
-  // FIXME!!
+  if (type == typeid(float)) {  // NOLINT
    return ncclFloat;
+  } else if (type == typeid(double)) {  // NOLINT
+    return ncclDouble;
+  } else if (type == typeid(int)) {  // NOLINT
+    return ncclInt;
+  } else {
+    PADDLE_THROW("Not supported");
+  }
 }
 ParallelExecutor::ParallelExecutor(
@@ -479,30 +486,32 @@ void ParallelExecutor::BCastParamsToGPUs(
      ncclDataType_t data_type = ToNCCLDataType(main_tensor.type());
      auto &dims = main_tensor.dims();
      size_t numel = main_tensor.numel();
-      std::vector<std::pair<void *, ParallelExecutorPrivate::NCCLContext *>>
-          mems;
-      mems.emplace_back(const_cast<void *>(main_tensor.data<void>()),
-                        &member_->GetNCCLCtx(member_->main_place_));
-      for (auto &pair : member_->local_scopes_) {
+      platform::dynload::ncclGroupStart();
-        if (pair.first == member_->main_place_) {
-          continue;
-        }
+      for (auto &pair : member_->local_scopes_) {
        auto local_scope = pair.second;
        auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
        t->Resize(dims);
-        mems.emplace_back(t->mutable_data(pair.first, main_tensor.type()),
+        auto &nccl_ctx = member_->GetNCCLCtx(pair.first);
-                          &member_->GetNCCLCtx(member_->main_place_));
+        platform::dynload::ncclBcast(
+            t->mutable_data(pair.first, main_tensor.type()), numel, data_type,
+            0, nccl_ctx.comm, nccl_ctx.stream());
+      }
+      platform::dynload::ncclGroupEnd();
+    }
  }
-      // TODO(yy): Invoke ncclBCast here. mems, numel, data_type. The mems[0]
+  for (auto &pair : member_->local_scopes_) {
-      // is the src, rests are dests.
+    member_->GetNCCLCtx(pair.first).ctx_->Wait();
-      (void)(data_type);
+    auto &b = pair.second->FindVar("fc_1.b_0")->Get<framework::LoDTensor>();
-      (void)(numel);
+    framework::LoDTensor cpu;
-    }
+    framework::TensorCopy(b, platform::CPUPlace(), &cpu);
+    platform::DeviceContextPool::Instance().Get(b.place())->Wait();
+    LOG(INFO) << *cpu.data<float>();
  }
 #else
  PADDLE_THROW("Not compiled with CUDA");
 #endif

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -52,7 +52,7 @@ class ParallelExecutor(unittest.TestCase):
            adam = fluid.optimizer.Adam()
            adam.minimize(loss)
        act_places = []
-        for each in [fluid.CUDAPlace(0)]:
+        for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]:
            p = fluid.core.Place()
            p.set_place(each)
            act_places.append(p)