From 5f995d3fd2b89b1d1d3f4b5311e0f4c6fdf89daa Mon Sep 17 00:00:00 2001
From: james
Date: Thu, 24 Nov 2022 10:26:41 +0800
Subject: [PATCH] processgroup bkcl support reduce (#48232)

Note: this is a temporary solution; it should be replaced once the reduce
kernel is natively supported on KL2.
---
 .../collective/ProcessGroupBKCL.cc            | 51 +++++++++++++++++++
 .../distributed/collective/ProcessGroupBKCL.h |  6 +++
 .../tests/unittests/xpu/process_group_bkcl.py | 21 ++++++++
 3 files changed, 78 insertions(+)

diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
index 75953dc0b42..ff39196b92b 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
@@ -260,6 +260,57 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
       use_calc_stream);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Collective(
+      out_tensor,
+      in_tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          BKCLContext_t comm,
+          const XPUStream& stream) {
+        phi::DenseTensor output_t(*output);
+        const auto& place = input.place();
+        auto* calc_ctx = static_cast<phi::XPUContext*>(
+            platform::DeviceContextPool::Instance().Get(place));
+        switch (input.dtype()) {
+          case phi::DataType::FLOAT32:
+            calc_ctx->template Alloc<float>(&output_t);
+            break;
+          case phi::DataType::FLOAT16:
+            calc_ctx->template Alloc<phi::dtype::float16>(&output_t);
+            break;
+          case phi::DataType::INT32:
+            calc_ctx->template Alloc<int>(&output_t);
+            break;
+          default:
+            VLOG(0) << "Error: type " << input.dtype() << " not supported for "
+                    << GetBackendName();
+            break;
+        }
+        int ret =
+            bkcl_all_reduce(comm,
+                            input.data(),
+                            output_t.data(),
+                            input.numel(),
+                            platform::ToBKCLDataType(
+                                framework::TransToProtoVarType(input.type())),
+                            ToBKCLRedType(opts.reduce_op),
+                            stream);
+        if (rank_ == opts.root_rank) {
+          *output = output_t;
+        }
+        return ret;
+      },
+      CommType::ALLREDUCE,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Barrier(
     const BarrierOptions& opts) {
   PADDLE_ENFORCE_GE(opts.device_id,
diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
index b4a47e83fdd..79d97609d92 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
@@ -107,6 +107,12 @@ class ProcessGroupBKCL : public ProcessGroupStream {
       bool sync_op,
       bool use_calc_stream) override;
 
+  std::shared_ptr<ProcessGroup::Task> Reduce(phi::DenseTensor* out_tensor,
+                                             const phi::DenseTensor& in_tensor,
+                                             const ReduceOptions& opts,
+                                             bool sync_op,
+                                             bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Barrier(
       const BarrierOptions& = BarrierOptions()) override;
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
index 2317e38cb28..a106c630f36 100644
--- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
+++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
@@ -168,6 +168,27 @@ class TestProcessGroupFp32(unittest.TestCase):
                 "rank {}: test allgather api2 ok\n".format(pg.rank())
             )
 
+            # test Reduce
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = dist.reduce(tensor_x, 0, sync_op=True)
+                paddle.device.xpu.synchronize()
+            # rank 1
+            else:
+                task = dist.reduce(tensor_y, 0, sync_op=False)
+                task.wait()
+                paddle.device.xpu.synchronize()
+            if pg.rank() == 0:
+                assert np.array_equal(tensor_x, sum_result)
+            sys.stdout.write(
+                "rank {}: test reduce sum api ok\n".format(pg.rank())
+            )
+
 
 class TestProcessGroupFp16(TestProcessGroupFp32):
     def setUp(self):
--
GitLab
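
For reference, a minimal usage sketch (not part of the patch) of the reduce path this change enables. It assumes a two-card XPU machine with the script started via paddle.distributed.launch; the file name demo.py, tensor shape, and fill values are placeholder choices. Because the BKCL backend emulates reduce with bkcl_all_reduce, every rank performs a full all-reduce, but only the destination rank's tensor is overwritten with the summed result.

    # demo.py -- sketch only; assumes a 2-rank XPU launch via paddle.distributed.launch
    import numpy as np
    import paddle
    import paddle.distributed as dist

    # bind each process to its XPU card, then set up the BKCL process group
    paddle.set_device("xpu:%d" % dist.ParallelEnv().dev_id)
    dist.init_parallel_env()

    # each rank contributes a tensor filled with (rank + 1)
    data = paddle.to_tensor(
        np.full([16, 16], dist.get_rank() + 1, dtype="float32")
    )

    # sum across ranks; only rank 0 (the dst) receives the reduced result
    dist.reduce(data, 0, sync_op=True)
    paddle.device.xpu.synchronize()

    if dist.get_rank() == 0:
        # with 2 ranks the expected value is 1 + 2 = 3 in every element
        assert np.allclose(data.numpy(), 3.0)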