Unverified commit 073f7ced authored by jameszhang, committed by GitHub

[KUNLUN] update xccl lib & use native Reduce in dygraph (#49941)

* update xccl lib & use native Reduce in dygraph

* minor
Parent 5670644c
......
@@ -16,7 +16,7 @@ else()
endif()
set(XPU_XCCL_BASE_URL
"https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.6")
"https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.7")
if(WITH_AARCH64)
set(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
......
......
@@ -352,41 +352,17 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
-        phi::DenseTensor output_t;
-        paddle::framework::TensorCopy(*output, platform::XPUPlace(), &output_t);
-        const auto& place = input.place();
-        auto* calc_ctx = static_cast<phi::XPUContext*>(
-            platform::DeviceContextPool::Instance().Get(place));
-        switch (input.dtype()) {
-          case phi::DataType::FLOAT32:
-            calc_ctx->template Alloc<float>(&output_t);
-            break;
-          case phi::DataType::FLOAT16:
-            calc_ctx->template Alloc<float16>(&output_t);
-            break;
-          case phi::DataType::INT32:
-            calc_ctx->template Alloc<int>(&output_t);
-            break;
-          default:
-            VLOG(0) << "Error: type " << input.dtype() << " not supported for "
-                    << GetBackendName();
-            break;
-        }
-        int ret =
-            bkcl_all_reduce(comm,
-                            input.data(),
-                            output_t.data(),
-                            input.numel(),
-                            platform::ToBKCLDataType(
-                                framework::TransToProtoVarType(input.type())),
-                            ToBKCLRedType(opts.reduce_op),
-                            stream);
-        if (rank_ == opts.root_rank) {
-          *output = output_t;
-        }
-        return ret;
+        return bkcl_reduce(comm,
+                           input.data(),
+                           output->data(),
+                           input.numel(),
+                           platform::ToBKCLDataType(
+                               framework::TransToProtoVarType(input.type())),
+                           ToBKCLRedType(opts.reduce_op),
+                           opts.root_rank,
+                           stream);
},
-      CommType::ALLREDUCE,
+      CommType::REDUCE,
sync_op,
use_calc_stream);
}
......
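The diff above replaces an emulated reduce with the library's native one. Previously, `ProcessGroupBKCL::Reduce` ran `bkcl_all_reduce` into a per-rank scratch tensor and copied the result into `*output` only on the root rank; the new code calls `bkcl_reduce` directly, which aggregates into the root's buffer (presumably why the xccl library is bumped to 1.0.7 in the same commit). Below is a minimal sketch of the two call shapes, taken from the signatures visible in the diff; the header path and the `BKCL_FLOAT` / `BKCL_ADD` enum names are assumptions, and buffer management is illustrative only, not Paddle code.

```cpp
#include "xpu/bkcl.h"  // assumed header name; ships with the Kunlun XCCL package

// Before: Reduce emulated with AllReduce. Every rank computes the full
// AllReduce result into a scratch buffer; only the root keeps it.
int EmulatedReduce(BKCLContext_t comm, const float* send, float* scratch,
                   float* recv, size_t count, int rank, int root,
                   XPUStream stream) {
  int ret = bkcl_all_reduce(comm, send, scratch, count, BKCL_FLOAT,
                            BKCL_ADD, stream);
  if (rank == root) {
    // The old code copied scratch into the output tensor here
    // (framework::TensorCopy); non-root ranks discarded the result.
  }
  return ret;
}

// After: native Reduce. The library aggregates directly into the root's
// receive buffer; non-root ranks do no extra allocation or device copy.
int NativeReduce(BKCLContext_t comm, const float* send, float* recv,
                 size_t count, int root, XPUStream stream) {
  return bkcl_reduce(comm, send, recv, count, BKCL_FLOAT, BKCL_ADD,
                     root, stream);
}
```

Besides dropping the per-rank scratch allocation, dtype switch, and root-side copy, the native call lets the task be tagged `CommType::REDUCE` instead of `CommType::ALLREDUCE`, so the task is recorded as the collective actually performed.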