You need to sign in or sign up before continuing.
未验证 提交 d67d74cc 编写于 作者: H houj04 提交者: GitHub

[XPU] update log for bkcl function calls. (#53609)

* [XPU] update log for bkcl function calls.

* minor update

* revert unnecessary modifications.
上级 0d45ac73
......@@ -115,7 +115,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_recv";
VLOG(3) << "calling bkcl_recv"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", src_rank: " << src_rank << ", numel: " << output->numel()
<< ", dtype: " << output->type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_recv(comm,
output->data(),
output->numel(),
......@@ -148,7 +154,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_send";
VLOG(3) << "calling bkcl_send"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", dst_rank: " << dst_rank
<< ", input numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_send(comm,
input.data(),
input.numel(),
......@@ -276,7 +289,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";
VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_all_reduce(comm,
input.data(),
......@@ -307,7 +327,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
BKCLContext_t comm,
const XPUStream& stream) {
int root = opts.source_rank + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_broadcast(comm,
input.data(),
......@@ -346,7 +372,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << in_tensor_maybe_partial.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_all_gather(comm,
in_tensor_maybe_partial.data(),
......@@ -375,7 +407,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_reduce";
VLOG(3) << "calling bkcl_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << opts.root_rank << ", numel: " << input.numel()
<< ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_reduce(comm,
input.data(),
output->data(),
......@@ -405,7 +445,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::ReduceScatter(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_reduce_scatter";
VLOG(3) << "calling bkcl_reduce_scatter"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << output->numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_reduce_scatter(
comm,
input.data(),
......@@ -491,8 +538,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";
VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << true << ", use_calc_stream: " << false;
int r =
bkcl_all_reduce(comm,
input.data(),
......@@ -535,7 +587,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";
VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op << ", use_calc_stream: " << false;
int r =
bkcl_all_reduce(comm,
input.data(),
......@@ -580,7 +638,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
const XPUStream& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << true
<< ", use_calc_stream: " << false;
int r =
bkcl_broadcast(comm,
input.data(),
......@@ -626,7 +690,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
const XPUStream& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << false;
int r =
bkcl_broadcast(comm,
input.data(),
......@@ -671,7 +741,12 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", sync_op: " << true << ", use_calc_stream: " << false;
int r =
bkcl_all_gather(comm,
input.data(),
......@@ -712,7 +787,12 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", sync_op: " << sync_op << ", use_calc_stream: " << false;
int r =
bkcl_all_gather(comm,
input.data(),
......
......@@ -66,7 +66,6 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel<T> {
phi::XpuFcInfo fc_info;
phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &fc_info);
VLOG(0) << "FusedGemmEpilogueXPUKernel 000";
xpu::Context* xpu_ctx = dev_ctx.x_context();
const XPUType* x_ptr = reinterpret_cast<const XPUType*>(x->data<T>());
......
......@@ -62,6 +62,18 @@ inline BKCLDataType ToBKCLDataType(framework::proto::VarType::Type type) {
}
}
// Returns the rank id recorded inside a BKCL communicator context.
// NOTE(review): this peeks at BKCL's internal context layout — it assumes the
// first int-sized field of the context is the rank id. Verify against the
// BKCL version in use; an upstream layout change would silently break this.
inline int GetBKCLRankID(BKCLContext_t comm) {
  const int *fields = reinterpret_cast<int *>(comm);
  return fields[0];
}
// Returns the device id recorded inside a BKCL communicator context.
// NOTE(review): relies on BKCL's internal layout — assumes the second
// int-sized field of the context is the device id. Confirm against the
// linked BKCL release before upgrading the library.
inline int GetBKCLDevID(BKCLContext_t comm) {
  const int *fields = reinterpret_cast<int *>(comm);
  return fields[1];
}
// Returns the total number of ranks recorded inside a BKCL communicator
// context.
// NOTE(review): relies on BKCL's internal layout — assumes the third
// int-sized field of the context is nranks. Confirm against the linked
// BKCL release before upgrading the library.
inline int GetBKCLNRanks(BKCLContext_t comm) {
  const int *fields = reinterpret_cast<int *>(comm);
  return fields[2];
}
class BKCLGroupGuard {
public:
static std::mutex &BKCLMutex() {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册