Check in NCCL operator

0990c87b · Dong Zhihong · da1181bf · 0990c87b · da1181bf · 0990c87b
5 changed files
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -46,7 +46,8 @@ struct Communicator {
  ~Communicator() {
    for (size_t i = 0; i < comms_.size(); ++i) {
-      PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i]));
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy(comms_[i]);
    }
  }

--- a/paddle/operators/nccl_op_test.cc
+++ b/paddle/operators/nccl_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/operators/nccl_op.h"
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
-static std::vector<int> gpu_list;
-using f = paddle::framework;
-using ops = paddle::operators;
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
-    }
-  }
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-TEST(NCCL, ncclInitOp) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.Block(0);
-}
-int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -16,6 +16,11 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
@@ -26,8 +31,8 @@
 static std::vector<int> gpu_list;
-using f = paddle::framework;
+namespace f = paddle::framework;
-using ops = paddle::operators;
+namespace ops = paddle::operators;
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,
@@ -50,22 +55,40 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
  op->SetAttrMap(attrs);
 }
-TEST(NCCL, ncclInitOp) {
+TEST(NCCL, ncclInit) {
  f::ProgramDescBind program;
  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op = block->AppendOp();
+  paddle::platform::Communicator comm;
+  op->SetType("ncclInit");
+  op->SetOutput("Communicator", )
+      AddOp("ncclInit", {}, {{"Communicator", {comm}}}, {{"gpus", {gpu_list}}},
+            block);
 }
+// TEST(NCCL, ncclAllReduce) {
+//   f::ProgramDescBind program;
+//   f::BlockDescBind *block = program.Block(0);
+//   paddle::platform::Communicator comm;
+//   AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}},
+//   block);
+// }
 int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
+  static int dev_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
  if (dev_count <= 1) {
    LOG(WARNING)
        << "Cannot test multi-gpu nccl, because the CUDA device count is "
        << dev_count;
    return 0;
  }
+  for (int i = 0; i < dev_count; ++i) {
+    gpu_list.emplace_back(i);
+  }
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -31,9 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
@@ -64,8 +62,7 @@ TEST(NCCL, all_reduce) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  VLOG(1) << "Initializing ncclComm";
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
-  PADDLE_ENFORCE(status);
  VLOG(1) << "ncclComm initialized";
  VLOG(1) << "Creating thread data";
  std::vector<std::unique_ptr<PerThreadData<double>>> data;

--- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
@@ -53,6 +53,9 @@ def thread_allreduce_op(thread_id, gpu_id):
    op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={})
    place = core.GPUPlace(gpus[i])
    set_input(scope, op, inputs, place)
+    # # print scope.find_var("Out").get_tensor()
+    # # print scope.find_var("X").get_tensor()
+    print scope.find_var("Communicator").get_communicator()
    ctx = core.DeviceContext.create(place)
@@ -83,13 +86,13 @@ class TestNCCLAllReduce(unittest.TestCase):
                    i,
                    gpus[i], ))
            th.start()
-            ops.append(ops)
+            ops.append(th)
-        for th in ops:
+        for t in ops:
-            th.join()
+            t.join()
        idx = 0
-        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-            actual = np.array(scope.find_var(out_name).get_tensor())
+            actual = np.array(g_scope.find_var(out_name).get_tensor())
            expect = output_data[idx]
            idx += 1