From 2194ad1566464eb7bc8eb4d5bf6b791379a3a145 Mon Sep 17 00:00:00 2001
From: liuyuhui
Date: Wed, 21 Apr 2021 11:38:50 +0800
Subject: [PATCH] [Kunlun] Add collective ops for multi-XPU-card training and
 add a Kunlun multi-XPU-card CI (#32302)

---
 .../collective/c_allreduce_max_op_xpu.cc      | 28 +++++++
 .../collective/c_allreduce_min_op_xpu.cc      | 28 +++++++
 .../operators/collective/c_allreduce_op.h     | 72 +++++++++++++++++-
 .../collective/c_allreduce_prod_op_xpu.cc     | 28 +++++++
 .../collective/c_allreduce_sum_op_xpu.cc      | 28 +++++++
 .../collective/c_reduce_max_op_xpu.cc         | 28 +++++++
 .../collective/c_reduce_min_op_xpu.cc         | 28 +++++++
 .../fluid/operators/collective/c_reduce_op.h  | 74 ++++++++++++++++++-
 .../collective/c_reduce_prod_op_xpu.cc        | 28 +++++++
 .../collective/c_reduce_sum_op_xpu.cc         | 28 +++++++
 .../fluid/tests/unittests/CMakeLists.txt      |  6 ++
 .../test_collective_allreduce_api.py          | 10 ++-
 .../unittests/test_collective_api_base.py     | 54 ++++++++++----
 .../unittests/test_collective_reduce_api.py   |  7 +-
 14 files changed, 426 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc
 create mode 100644 paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc

diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc
new file mode 100644
index 0000000000..b0aa51f7cf
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_max,
+                       ops::CAllReduceOpXPUKernel<ops::kRedMax, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
new file mode 100644
index 0000000000..2f16a89c21
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_min,
+                       ops::CAllReduceOpXPUKernel<ops::kRedMin, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 2f56f43d79..ab1cc508fd 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -20,11 +20,19 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
+#endif
+
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/allreduce.h>
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
@@ -105,6 +113,68 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <ReduceType red_type, typename T>
+class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    BKCLDataType dtype = platform::ToBKCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<T>();
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::BKCLCommContext::Instance().Get(rid, place);
+
+    XPUStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
+                   ->x_context()
+                   ->xpu_stream;
+    } else {
+      stream = comm->stream();
+    }
+
+    BKCLOp bkcl_red_type = BKCL_ADD;
+    switch (red_type) {
+      case kRedSum:
+        bkcl_red_type = BKCL_ADD;
+        break;
+
+      case kRedMax:
+        bkcl_red_type = BKCL_MAX;
+        break;
+
+      case kRedMin:
+        bkcl_red_type = BKCL_MIN;
+        break;
+
+      case kRedProd:
+        bkcl_red_type = BKCL_PRODUCT;
+        break;
+
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid reduce type: %d", red_type));
+    }
+
+    PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel,
+                                      dtype, bkcl_red_type, stream),
+                      BKCL_SUCCESS, platform::errors::PreconditionNotMet(
+                                        "BKCL all reduce failed"));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should be compiled with XPU."));
+#endif
+  }
+};
+
 template <ReduceType red_type, typename T>
 class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
new file mode 100644
index
0000000000..92ba004280
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_prod,
+                       ops::CAllReduceOpXPUKernel<ops::kRedProd, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc
new file mode 100644
index 0000000000..e4ec538cd2
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_sum,
+                       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>)
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc
new file mode 100644
index 0000000000..6d3af7bb5f
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_reduce_max,
+                       ops::CReduceOpXPUKernel<ops::kRedMax, float>)
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc
new file mode 100644
index 0000000000..791e58d849
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_reduce_min,
+                       ops::CReduceOpXPUKernel<ops::kRedMin, float>)
diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h
index 1bce01e13a..e537478109 100644
--- a/paddle/fluid/operators/collective/c_reduce_op.h
+++ b/paddle/fluid/operators/collective/c_reduce_op.h
@@ -24,10 +24,19 @@ limitations under the License.
*/
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
+
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
+#endif
+
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/reduce.h>
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
@@ -110,6 +119,69 @@ class CReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <ReduceType red_type, typename T>
+class CReduceOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+    BKCLDataType dtype = platform::ToBKCLDataType(in->type());
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data<T>();
+    out->Resize(in->dims());
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    int root = ctx.Attr<int>("root_id");
+    auto comm = platform::BKCLCommContext::Instance().Get(rid, place);
+
+    XPUStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
+                   ->x_context()
+                   ->xpu_stream;
+    } else {
+      stream = comm->stream();
+    }
+
+    BKCLOp bkcl_red_type = BKCL_ADD;
+    switch (red_type) {
+      case kRedSum:
+        bkcl_red_type = BKCL_ADD;
+        break;
+
+      case kRedMax:
+        bkcl_red_type = BKCL_MAX;
+        break;
+
+      case kRedMin:
+        bkcl_red_type = BKCL_MIN;
+        break;
+
+      case kRedProd:
+        bkcl_red_type = BKCL_PRODUCT;
+        break;
+
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid reduce type: %d", red_type));
+    }
+
+    PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel,
+                                  dtype, bkcl_red_type, root, stream),
+                      BKCL_SUCCESS, platform::errors::PreconditionNotMet(
+                                        "BKCL reduce failed"));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should be compiled with XPU."));
+#endif
+  }
+};
+
 template <ReduceType red_type, typename T>
 class CReduceOpCUDAKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc
new file mode 100644
index 0000000000..e7e770e8ff
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_reduce_prod,
+                       ops::CReduceOpXPUKernel<ops::kRedProd, float>)
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc
new file mode 100644
index 0000000000..a0ec4d2a99
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_reduce_sum,
+                       ops::CReduceOpXPUKernel<ops::kRedSum, float>)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 486ad38ae2..d0906052c9 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -627,6 +627,12 @@ if (WITH_XPU)
     add_subdirectory(xpu)
 endif()
 
+# dist xpu tests:
+if (WITH_XPU_BKCL)
+    py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py")
+    py_test(test_collective_allreduce_api_xpu SRCS "test_collective_allreduce_api.py")
+endif()
+
 if (WITH_ASCEND_CL)
     add_subdirectory(npu)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
index a405da80ad..eed2388f36 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
@@ -27,8 +27,14 @@ class TestCollectiveAllreduceAPI(TestDistBase):
         pass
 
     def test_allreduce_nccl(self):
-        self.check_with_place("collective_allreduce_api.py", "allreduce",
-                              "nccl")
+        if paddle.fluid.core.is_compiled_with_cuda():
+            self.check_with_place("collective_allreduce_api.py", "allreduce",
+                                  "nccl")
+
+    def test_allreduce_bkcl(self):
+        if paddle.fluid.core.is_compiled_with_xpu():
+            self.check_with_place("collective_allreduce_api.py", "allreduce",
+                                  "bkcl")
 
     def test_allreduce_gloo(self):
         self.check_with_place("collective_allreduce_api.py", "allreduce",
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index 660018e285..ad85adb2d5 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -50,6 +50,9 @@ class TestCollectiveAPIRunnerBase(object):
             device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
             place = fluid.CUDAPlace(
                 device_id)  #if args.use_gpu else fluid.CPUPlace()
+        elif args['backend'] == 'bkcl':
+            device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
+            place = fluid.XPUPlace(device_id)
         else:
             place = fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -71,7 +74,6 @@ class TestCollectiveAPIRunnerBase(object):
 def runtime_main(test_class, col_type):
     args = {}
     model = test_class()
-    args["deviceid"] = os.getenv("FLAGS_selected_gpus")
     args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID"))
     args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM"))
     args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS')
@@ -112,21 +114,38 @@ class TestDistBase(unittest.TestCase):
         worker_endpoints = self._ps_endpoints.split(",")
         w0_ep, w1_ep = worker_endpoints
         #print("w0_ep:",w0_ep," w1_ep:",w1_ep)
-        env0 = {
-            "FLAGS_selected_gpus": "0",
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-            "PADDLE_CURRENT_ENDPOINT": w0_ep
-        }
+        if core.is_compiled_with_cuda():
+            env0 = {
+                "FLAGS_selected_gpus": "0",
+                "PADDLE_TRAINER_ID": "0",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": w0_ep
+            }
 
-        env1 = {
-            "FLAGS_selected_gpus": "1",
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_TRAINERS_NUM": "2",
-            "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
-            "PADDLE_CURRENT_ENDPOINT": w1_ep
-        }
+            env1 = {
+                "FLAGS_selected_gpus": "1",
+                "PADDLE_TRAINER_ID": "1",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": w1_ep
+            }
+        elif core.is_compiled_with_xpu():
+            env0 = {
+                "FLAGS_selected_xpus": "0",
+                "PADDLE_TRAINER_ID": "0",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": w0_ep
+            }
+
+            env1 = {
+                "FLAGS_selected_xpus": "1",
+                "PADDLE_TRAINER_ID": "1",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": w1_ep
+            }
         #update environment
         env0.update(envs)
         env1.update(envs)
@@ -169,7 +188,10 @@ class TestDistBase(unittest.TestCase):
                       path_id="0",
                       check_error_log=False,
                       need_envs={}):
-        with_gloo = '0' if backend == "nccl" else '1'
+        if backend == "nccl" or backend == "bkcl":
+            with_gloo = '0'
+        else:
+            with_gloo = '1'
         required_envs = {
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
             "FLAGS_eager_delete_tensor_gb": "0.0",
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
index 8d28c794f0..721f446c9f 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
@@ -27,7 +27,12 @@ class TestCollectiveReduceAPI(TestDistBase):
         pass
 
     def test_reduce_nccl(self):
-        self.check_with_place("collective_reduce_api.py", "reduce", "nccl")
+        if paddle.fluid.core.is_compiled_with_cuda():
+            self.check_with_place("collective_reduce_api.py", "reduce", "nccl")
+
+    def test_reduce_bkcl(self):
+        if paddle.fluid.core.is_compiled_with_xpu():
+            self.check_with_place("collective_reduce_api.py", "reduce", "bkcl")
 
     def test_reduce_gloo(self):
         self.check_with_place("collective_reduce_api.py", "reduce", "gloo", "1")
-- 
GitLab
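
For context, the c_allreduce_* and c_reduce_* kernels registered by this patch are what the paddle.distributed collectives dispatch to on Kunlun builds. Below is a minimal sketch of driving the new BKCL all-reduce path from Python. It assumes a PaddlePaddle build with WITH_XPU=ON and WITH_XPU_BKCL=ON, two Kunlun cards, and a launcher such as python -m paddle.distributed.launch that sets PADDLE_TRAINER_ID, PADDLE_TRAINER_ENDPOINTS, and FLAGS_selected_xpus for each process, much as the test harness above does; it is an illustrative sketch, not part of the patch and not verified against this exact revision.

    import numpy as np
    import paddle
    import paddle.distributed as dist

    # Run on the Kunlun card chosen by FLAGS_selected_xpus for this process.
    paddle.set_device('xpu')

    # On an XPU + BKCL build this initializes the BKCL communicator ring
    # (the BKCLCommContext that the kernels above look up by ring_id).
    dist.init_parallel_env()

    # all_reduce defaults to a sum, which lowers to the c_allreduce_sum op,
    # i.e. bkcl_all_reduce with BKCL_ADD.
    x = paddle.to_tensor(np.ones(4, dtype='float32'))
    dist.all_reduce(x)
    print(x.numpy())  # expect [2. 2. 2. 2.] with two trainers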