diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index 15d8bde933fcfa73902f725e75e0549321119939..a25e01baa4d539673f4149c72048e53a5613ed04 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -13,8 +13,11 @@ limitations under the License. */
 
 #include "paddle/operators/nccl_op.h"
 
-#include "glog/logging.h"
-#include "gtest/gtest.h"
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
 
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_desc.h"
@@ -24,10 +27,13 @@
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
 
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
+USE_CPU_ONLY_OP(ncclInit);
+USE_GPU_ONLY_OP(ncclAllReduce);
+USE_GPU_ONLY_OP(ncclReduce);
+USE_GPU_ONLY_OP(ncclBcastSend);
+USE_GPU_ONLY_OP(ncclBcastRecv);
 
 static std::vector<int> gpu_list;
 
@@ -55,28 +61,28 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }
 
-TEST(NCCL, ncclInit) {
+// ncclInitOp with desc
+TEST(NCCL, ncclInitOp) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.Block(0);
-  f::OpDescBind *op = block->AppendOp();
-
-  paddle::platform::Communicator comm;
-  op->SetType("ncclInit");
-  op->SetOutput("Communicator", )
-
-  AddOp("ncclInit", {}, {{"Communicator", {comm}}}, {{"gpus", {gpu_list}}},
-        block);
+  f::OpDescBind *op1 = block->AppendOp();
+
+  op1->SetType("ncclInit");
+  op1->SetOutput("Communicator", {"x1"});
+  op1->SetAttr("gpus", {gpu_list});
+  f::Scope g_scope;
+  paddle::platform::DeviceContext *ctx =
+      new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+
+  auto *var = g_scope.Var("x1");
+  var->GetMutable<paddle::platform::Communicator>();
+
+  auto op = f::OpRegistry::CreateOp(*op1);
+  VLOG(1) << "invoke NCCLInitOp.";
+  op->Run(g_scope, *ctx);
+  VLOG(1) << "NCCLInitOp finished.";
 }
 
-// TEST(NCCL, ncclAllReduce) {
-// f::ProgramDescBind program;
-// f::BlockDescBind *block = program.Block(0);
-
-// paddle::platform::Communicator comm;
-// AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}},
-// block);
-// }
-
 int main(int argc, char **argv) {
   static int dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
diff --git a/python/paddle/v2/framework/tests/test_multigpu.py b/python/paddle/v2/framework/tests/test_multigpu.py
deleted file mode 100644
index b75d274d885e9014417faba258232e783fe2dda2..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_multigpu.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import unittest, os
-import numpy as np
-import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
-from op_test import OpTest, create_op, set_input
-
-gpu_list = "0,1,2,3"
diff --git a/python/paddle/v2/framework/tests/test_nccl_ops.py b/python/paddle/v2/framework/tests/test_nccl_ops.py
deleted file mode 100644
index 6dd6231aa8dee863ade6e3eddf022d736c6a8a09..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_nccl_ops.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import unittest, os
-import numpy as np
-import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
-from op_test import OpTest, create_op, set_input
-
-# gpu_list = os.environ["NV_LIST"]
-gpu_list = "0,1,2,3"
-
-if not core.is_compile_gpu() or not gpu_list:
-    exit(0)
-
-
-def allreduce(tensors, gpus):
-    num_device = len(gpus)
-    assert (len(tensors) == num_device), "not match of tensor and device"
-    Out = tensors
-    for i in range(1, len(tensors)):
-        Out[0] += Out[i]
-
-    for i in range(1, len(tensors)):
-        Out[i] = Out[0]
-
-    return Out
-
-
-class TestNCCLAllReduce(unittest.TestCase):
-    def setUp(self):
-
-        self.op_type = "ncclAllReduce"
-
-        self.gpus = [int(g) for g in gpu_list.split(",")]
-
-        self.g_scope = core.Scope()
-        self.g_ctx = core.DeviceContext.create(core.CPUPlace())
-        self.scopes = []
-        self.ops = []
-        self.places = []
-
-        self.input_data = []
-
-        for i in range(len(self.gpus)):
-            self.input_data.append(np.random.random((32, 32)))
-        self.output_data = allreduce(self.input_data, self.gpus)
-
-        nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus)
-        op.run(self.g_scope, self.g_ctx)
-
-        for i in range(len(self.gpus)):
-            # insert kid scope
-            scope = self.g_scope.new_scope()
-            place = core.GPUPlace(self.gpus[i])
-
-            inputs = {"X": self.input_data[i]}
-            outputs = {"Out": self.output_data[i]}
-            attrs = {"gpus": self.gpus}
-
-            op = create_op(scope, self.op_type, inputs, outputs, attrs)
-            set_input(scope, op, inputs, place)
-
-            self.scopes.append(scope)
-            self.ops.append(op)
-            self.places.append(place)
-
-    def test_output(self):
-        idx = 0
-        for scope, place, op in zip(self.scopes, self.places, self.ops):
-            ctx = core.DeviceContext.create(place)
-            op.run(scope, ctx)
-
-            for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
-                actual = np.array(scope.find_var(out_name).get_tensor())
-                expect = self.output_data[idx]
-
-                idx += 1
-                self.assertTrue(actual, expect), "has diff"
-
-
-# if __name__ == "__main__":
-#     unittest.main()
-# usage : export NV_LIST=0,1,2,3 python *.py
-
-# os.environ["NV_LIST"] = ["0,1,2,3"]
-
-if __name__ == "__main__":
-    unittest.main()