From 0990c87bf63302ab608005ec7aa2e8dcd37b6b5c Mon Sep 17 00:00:00 2001
From: Dong Zhihong
Date: Tue, 24 Oct 2017 13:43:01 -0700
Subject: [PATCH] checkin nccl operator

---
 paddle/operators/nccl/nccl_gpu_common.h       |  3 +-
 paddle/operators/nccl_op_test.cc              | 71 -------------------
 paddle/operators/nccl_op_test.cu              | 37 ++++++++--
 paddle/platform/nccl_test.cu                  |  7 +-
 .../framework/tests/test_nccl_allreduce_op.py | 13 ++--
 5 files changed, 42 insertions(+), 89 deletions(-)
 delete mode 100644 paddle/operators/nccl_op_test.cc

diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index f492f96aa87..fe49d19a9dd 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -46,7 +46,8 @@ struct Communicator {
 
   ~Communicator() {
     for (size_t i = 0; i < comms_.size(); ++i) {
-      PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i]));
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy(comms_[i]);
     }
   }
 
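For context on the FIXME above: dynload::ncclCommDestroy reports failures through its ncclResult_t return value, and a destructor should not throw while the stack may be unwinding, so the hunk drops PADDLE_ENFORCE and leaves cleanup unchecked. A minimal sketch of the same pattern against the raw NCCL API follows; CommGuard and the logging policy are illustrative, not part of the patch or the PaddlePaddle API.

#include <cstddef>
#include <iostream>
#include <vector>

#include <nccl.h>

// RAII holder for a set of NCCL communicators. The destructor is
// best-effort: it logs failures instead of throwing, because throwing
// from a destructor during unwinding terminates the program.
struct CommGuard {
  std::vector<ncclComm_t> comms_;

  ~CommGuard() noexcept {
    for (size_t i = 0; i < comms_.size(); ++i) {
      ncclResult_t ret = ncclCommDestroy(comms_[i]);
      if (ret != ncclSuccess) {
        std::cerr << "ncclCommDestroy failed: " << ncclGetErrorString(ret)
                  << std::endl;
      }
    }
  }
};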
diff --git a/paddle/operators/nccl_op_test.cc b/paddle/operators/nccl_op_test.cc
deleted file mode 100644
index 9c319a33876..00000000000
--- a/paddle/operators/nccl_op_test.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/operators/nccl_op.h"
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
-
-#include
-#include
-#include
-
-static std::vector<int> gpu_list;
-
-using f = paddle::framework;
-using ops = paddle::operators;
-
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
-    }
-  }
-
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-TEST(NCCL, ncclInitOp) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.Block(0);
-}
-
-int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index 9c319a33876..15d8bde933f 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -16,6 +16,11 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
@@ -26,8 +31,8 @@
 
 static std::vector<int> gpu_list;
 
-using f = paddle::framework;
-using ops = paddle::operators;
+namespace f = paddle::framework;
+namespace ops = paddle::operators;
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
@@ -50,22 +55,40 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }
 
-TEST(NCCL, ncclInitOp) {
+TEST(NCCL, ncclInit) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.Block(0);
+
+  // Build the ncclInit op through the AddOp helper; the Communicator
+  // output is wired up by variable name rather than by passing the
+  // Communicator object itself.
+  AddOp("ncclInit", {}, {{"Communicator", {"Communicator"}}},
+        {{"gpus", {gpu_list}}}, block);
 }
 
+// TEST(NCCL, ncclAllReduce) {
+//   f::ProgramDescBind program;
+//   f::BlockDescBind *block = program.Block(0);
+//   AddOp("ncclInit", {}, {{"Communicator", {"Communicator"}}},
+//         {{"gpus", {gpu_list}}}, block);
+// }
+
 int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
+  static int dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
         << "Cannot test multi-gpu nccl, because the CUDA device count is "
         << dev_count;
     return 0;
   }
+
+  for (int i = 0; i < dev_count; ++i) {
+    gpu_list.emplace_back(i);
+  }
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
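The ncclInit and ncclAllReduce operators exercised above are, roughly, wrappers over the NCCL collective API. A minimal sketch of that underlying call pattern with PaddlePaddle stripped out; the function name, buffers, and streams are assumed to be set up by the caller, and error checking is elided.

#include <vector>

#include <cuda_runtime.h>
#include <nccl.h>

// All-reduce one float buffer per device within a single process.
void all_reduce_demo(int dev_count, std::vector<float *> &send,
                     std::vector<float *> &recv, size_t len,
                     std::vector<cudaStream_t> &streams) {
  std::vector<ncclComm_t> comms(dev_count);
  // nullptr device list means "use devices 0 .. dev_count-1".
  ncclCommInitAll(comms.data(), dev_count, nullptr);

  // One ncclAllReduce call per device; NCCL matches the calls up as a
  // single collective across the communicator clique.
  for (int i = 0; i < dev_count; ++i) {
    cudaSetDevice(i);
    ncclAllReduce(send[i], recv[i], len, ncclFloat, ncclSum, comms[i],
                  streams[i]);
  }
  for (int i = 0; i < dev_count; ++i) {
    cudaStreamSynchronize(streams[i]);
    ncclCommDestroy(comms[i]);
  }
}

With NCCL 2 the per-device loop would normally be bracketed by ncclGroupStart()/ncclGroupEnd(); the plain loop shown matches the NCCL 1.x usage current when this patch was written.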
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index ab8b96f7263..c99dae68bef 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -31,9 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
-
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
   for (int i = 0; i < dev_count; ++i) {
     dynload::ncclCommDestroy(comms[i]);
   }
@@ -64,8 +62,7 @@ TEST(NCCL, all_reduce) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
   VLOG(1) << "Initializing ncclComm";
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
   VLOG(1) << "ncclComm initialized";
   VLOG(1) << "Creating thread data";
   std::vector<std::unique_ptr<PerThreadData<T>>> data;
diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
index 06e079eda8b..f79dcd664b2 100644
--- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
@@ -53,6 +53,9 @@ def thread_allreduce_op(thread_id, gpu_id):
     op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={})
     place = core.GPUPlace(gpus[i])
     set_input(scope, op, inputs, place)
+    # # print scope.find_var("Out").get_tensor()
+    # # print scope.find_var("X").get_tensor()
+    print scope.find_var("Communicator").get_communicator()
 
     ctx = core.DeviceContext.create(place)
@@ -83,13 +86,13 @@ class TestNCCLAllReduce(unittest.TestCase):
                     i,
                     gpus[i], ))
             th.start()
-            ops.append(ops)
-        for th in ops:
-            th.join()
+            ops.append(th)
+        for t in ops:
+            t.join()
 
         idx = 0
-        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
-            actual = np.array(scope.find_var(out_name).get_tensor())
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+            actual = np.array(g_scope.find_var(out_name).get_tensor())
             expect = output_data[idx]
             idx += 1
-- 
GitLab
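The Python hunk above fixes a thread-bookkeeping bug: the test appended the list to itself (ops.append(ops)) instead of appending each worker thread, so join() never waited on the actual threads. A minimal C++ sketch of the intended structure, one worker per GPU joined before results are checked; thread_allreduce_op here is a trivial stand-in for the per-GPU test body, mirroring the Python function of the same name.

#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for the per-GPU test body (the real test launches the
// ncclAllReduce op on the given device).
void thread_allreduce_op(int thread_id, int gpu_id) {
  std::printf("thread %d would run allreduce on GPU %d\n", thread_id, gpu_id);
}

void run_allreduce_on_all_gpus(const std::vector<int> &gpus) {
  std::vector<std::thread> workers;
  for (size_t i = 0; i < gpus.size(); ++i) {
    // Keep each thread handle so it can be joined later; the original
    // bug stored the list in itself and lost the handles.
    workers.emplace_back(thread_allreduce_op, static_cast<int>(i), gpus[i]);
  }
  for (auto &t : workers) {
    t.join();  // wait for every worker before inspecting outputs
  }
}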