diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index f492f96aa873c6bc96da1c051d2b41f732f9e073..fe49d19a9dd7785886b4143861ee281c6b8342b0 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -46,7 +46,8 @@ struct Communicator {

   ~Communicator() {
     for (size_t i = 0; i < comms_.size(); ++i) {
-      PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i]));
+      // FIXME(dzh) : PADDLE_ENFORCE return void
+      dynload::ncclCommDestroy(comms_[i]);
     }
   }

diff --git a/paddle/operators/nccl_op_test.cc b/paddle/operators/nccl_op_test.cc
deleted file mode 100644
index 9c319a33876adef722f9563a7f1e6264c9dcb1f0..0000000000000000000000000000000000000000
--- a/paddle/operators/nccl_op_test.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/operators/nccl_op.h"
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
-
-#include
-#include
-#include
-
-static std::vector<int> gpu_list;
-
-using f = paddle::framework;
-using ops = paddle::operators;
-
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
-    }
-  }
-
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-TEST(NCCL, ncclInitOp) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.Block(0);
-}
-
-int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index 9c319a33876adef722f9563a7f1e6264c9dcb1f0..15d8bde933fcfa73902f725e75e0549321119939 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -16,6 +16,11 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"

+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
@@ -26,8 +31,8 @@

 static std::vector<int> gpu_list;

-using f = paddle::framework;
-using ops = paddle::operators;
+namespace f = paddle::framework;
+namespace ops = paddle::operators;

 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
@@ -50,22 +55,40 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }

-TEST(NCCL, ncclInitOp) {
+TEST(NCCL, ncclInit) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op = block->AppendOp();
+
+  paddle::platform::Communicator comm;
+  op->SetType("ncclInit");
+  op->SetOutput("Communicator", )
+
+  AddOp("ncclInit", {}, {{"Communicator", {comm}}}, {{"gpus", {gpu_list}}},
+        block);
 }

+// TEST(NCCL, ncclAllReduce) {
+//   f::ProgramDescBind program;
+//   f::BlockDescBind *block = program.Block(0);
+
+//   paddle::platform::Communicator comm;
+//   AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}},
+//         block);
+// }
+
 int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
+  static int dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
         << "Cannot test multi-gpu nccl, because the CUDA device count is "
         << dev_count;
     return 0;
   }
+
+  for (int i = 0; i < dev_count; ++i) {
+    gpu_list.emplace_back(i);
+  }
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index ab8b96f7263aed83407866fedf9e529ce0affe3f..c99dae68bef67c58d3efea42fef45e84bb3d9255 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -31,9 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
-
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
   for (int i = 0; i < dev_count; ++i) {
     dynload::ncclCommDestroy(comms[i]);
   }
@@ -64,8 +62,7 @@ TEST(NCCL, all_reduce) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
   VLOG(1) << "Initializing ncclComm";
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
   VLOG(1) << "ncclComm initialized";
   VLOG(1) << "Creating thread data";
   std::vector>> data;
diff --git a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
index 06e079eda8b1df30fb40075983a531cba985bbbb..f79dcd664b20325dd9d222ab9d4dcb3132d5d6cd 100644
--- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
@@ -53,6 +53,9 @@ def thread_allreduce_op(thread_id, gpu_id):
     op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={})
     place = core.GPUPlace(gpus[i])
     set_input(scope, op, inputs, place)
+    # # print scope.find_var("Out").get_tensor()
+    # # print scope.find_var("X").get_tensor()
+    print scope.find_var("Communicator").get_communicator()

     ctx = core.DeviceContext.create(place)

@@ -83,13 +86,13 @@ class TestNCCLAllReduce(unittest.TestCase):
                     i,
                     gpus[i], ))
            th.start()
-            ops.append(ops)
-        for th in ops:
-            th.join()
+            ops.append(th)
+        for t in ops:
+            t.join()

         idx = 0
-        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
-            actual = np.array(scope.find_var(out_name).get_tensor())
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+            actual = np.array(g_scope.find_var(out_name).get_tensor())
             expect = output_data[idx]

             idx += 1
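
For reference, a minimal standalone sketch (not part of the patch) of the communicator lifecycle these tests exercise: one communicator per visible GPU created with ncclCommInitAll and torn down with ncclCommDestroy, with every NCCL return code checked. It calls the stock CUDA/NCCL C APIs directly; the CHECK_NCCL macro stands in for PADDLE_ENFORCE and paddle::dynload, which are not used here.

// Sketch: init/destroy NCCL communicators for all visible GPUs, checking results.
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Local stand-in for PADDLE_ENFORCE: abort with a message on any NCCL error.
#define CHECK_NCCL(cmd)                                                  \
  do {                                                                   \
    ncclResult_t r = (cmd);                                              \
    if (r != ncclSuccess) {                                              \
      std::fprintf(stderr, "NCCL error %s:%d '%s'\n", __FILE__,          \
                   __LINE__, ncclGetErrorString(r));                     \
      std::exit(EXIT_FAILURE);                                           \
    }                                                                    \
  } while (0)

int main() {
  int dev_count = 0;
  cudaGetDeviceCount(&dev_count);
  if (dev_count <= 1) {
    std::fprintf(stderr, "Need more than one GPU to exercise NCCL.\n");
    return 0;
  }

  std::vector<ncclComm_t> comms(dev_count);
  // A null device list means "use devices 0 .. dev_count-1".
  CHECK_NCCL(ncclCommInitAll(comms.data(), dev_count, nullptr));

  // ncclCommDestroy also returns an ncclResult_t; the patch drops the
  // PADDLE_ENFORCE wrapper around it only because that macro cannot be used
  // in the destructor (see the FIXME in nccl_gpu_common.h above).
  for (int i = 0; i < dev_count; ++i) {
    CHECK_NCCL(ncclCommDestroy(comms[i]));
  }
  return 0;
}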