Unverified commit c91b1e03, authored by gongweibao, committed by GitHub

[NPU] Use another method to avoid c_allreduce_sum core dump! (#34619)

Parent 436a9f14
@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(checknumeric SRCS checknumeric_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
......
@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
};
#if defined(PADDLE_WITH_ASCEND_CL)
// Returns true if inf or NaN is found; otherwise returns false.
template <typename T>
bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
aclrtStream stream, const paddle::framework::Tensor* in) {
auto& dev_ctx =
exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
// Returns true if NaN is found; otherwise returns false.
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const paddle::framework::Tensor* in) {
using Tensor = paddle::framework::Tensor;
Tensor out(in->type());
out.Resize(in->dims());
out.mutable_data<T>(dev_ctx.GetPlace());
bool found_inf_data = false;
Tensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}
std::vector<float> vec;
try {
const auto& runner =
NpuOpRunner("CheckNumerics", {*in}, {out},
{{"message", std::string("check_numberics")}});
runner.Run(stream);
dev_ctx.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
LOG(WARNING) << "ContainsNan catch exception";
return true;
}
VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}
if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}
return found_inf_data;
return false;
}
#endif
template <ReduceType red_type, typename T>
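Why the new ContainsNan check works: NaN propagates through floating-point arithmetic, so if any element of the input tensor is NaN, the scalar mean produced by ReduceMeanD is NaN as well, and a single one-element readback detects it. A minimal host-side sketch of the same idea (a standalone illustration only, not the NPU kernel; the helper name ContainsNanByMean is hypothetical):

#include <cmath>
#include <vector>

bool ContainsNanByMean(const std::vector<float>& data) {
  double sum = 0.0;
  for (float v : data) sum += v;  // a single NaN poisons the whole sum
  const float mean = static_cast<float>(sum / static_cast<double>(data.size()));
  return std::isnan(mean);  // a NaN mean implies the input held a NaN
}

This replaces the earlier approach of running the CheckNumerics op and treating a thrown exception as the signal, which is what caused the core dump this commit works around.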
@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
framework::Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
bool check_numerics = false;
bool found_nan = false;
auto d_type = in->type();
switch (d_type) {
case framework::proto::VarType::FP16:
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf";
check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << check_numerics;
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << found_nan;
break;
}
default:
break;
}
if (check_numerics) {
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
......
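The visible tail of this hunk shows the fallback: once NaN is detected in an FP32 input, the kernel overwrites the tensor with +inf before the allreduce, so HCCL sums a well-defined value instead of crashing on NaN. A sketch of that fallback under the stated assumption (FillTensorWithInf is a hypothetical stand-in for the elided fill code, shown on a host-side buffer):

#include <algorithm>
#include <limits>
#include <vector>

template <typename T>
void FillTensorWithInf(std::vector<T>* buf) {
  // Overwrite every element with +inf so the following c_allreduce_sum
  // produces a well-defined inf instead of crashing on NaN input.
  const T inf = static_cast<T>(std::numeric_limits<float>::infinity());
  std::fill(buf->begin(), buf->end(), inf);
}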
@@ -38,6 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
std::cout << preStr << ":" << std::endl << debugstring;
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
std::cout << ele << " ";
}
VLOG(3) << preStr << ":" << std::endl << debugstring;
std::cout << std::endl;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
ctx.Wait();
}
template <typename T>
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) {
// init
@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int num1 = 3;
int num2 = 128;
std::vector<float> init;
std::vector<T> init;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
init.push_back(static_cast<T>(1.0 + rank_id));
}
init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
PrintDebugInfo("input data", init);
auto place = ctx.GetPlace();
@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
tensor_out->mutable_data<T>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
attrs["use_calc_stream"] = 1;
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
for (int i = 0; i < 1; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
float diff = static_cast<float>(out_vec[0]) - 65504;
EXPECT_TRUE(diff < 0.1 && diff > -0.1);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 3.0);
for (uint32_t i = 1; i < 10; i++) {
EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
}
}
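The expected value 65504 in the assertion above is float16's largest finite number: after the NaN-tainted element is overwritten with inf, the fp16 allreduce result is assumed to saturate at fp16 max rather than remain inf. The constant can be derived from the fp16 format (a small standalone check, not part of the test):

#include <cmath>
#include <cstdio>

int main() {
  // Largest finite float16 value: (2 - 2^-10) * 2^15 = 65504.
  const double fp16_max = (2.0 - std::pow(2.0, -10)) * std::pow(2.0, 15);
  std::printf("float16 max = %.1f\n", fp16_max);  // prints 65504.0
  return 0;
}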
@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
// only support one device, if more than one device, use first default
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLAllReduceOp(&scope, ctx, i);
}
TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
// TestHCCLAllReduceOp<float>(&scope, ctx, 0);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <cmath>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allreduce_sum);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);
template <typename T>
bool Check(T value, int size = 2 * 512 * 8192) {
f::Scope scope;
auto x = scope.Var("in");
auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
auto place = ctx.GetPlace();
auto tensor_x = x->GetMutable<f::LoDTensor>();
tensor_x->Resize({size});
tensor_x->mutable_data<T>(place); // allocate
std::vector<T> init;
for (int64_t i = 0; i < size; ++i) {
init.push_back(static_cast<T>(value));
}
TensorFromVector(init, ctx, tensor_x);
bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
return result;
}
TEST(check_numeric, NPU) {
auto inf = std::numeric_limits<float>::infinity();
auto fp16_inf = static_cast<p::float16>(inf);
auto nan = NAN;
auto fp16_nan = static_cast<p::float16>(nan);
bool result = false;
// Normal
VLOG(0) << "start normal";
result = Check<p::float16>(static_cast<p::float16>(65546));
ASSERT_FALSE(result);
result = Check<float>(static_cast<float>(1.0));
ASSERT_FALSE(result);
// Inf
VLOG(0) << "start inf";
result = Check<p::float16>(fp16_inf);
ASSERT_FALSE(result);
result = Check<float>(inf);
ASSERT_FALSE(result);
// Nan
VLOG(0) << "start nan";
result = Check<p::float16>(fp16_nan);
ASSERT_TRUE(result);
result = Check<float>(nan);
ASSERT_TRUE(result);
}
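Note that the inf cases assert false by design: the mean of an all-inf buffer is inf, not NaN, and ContainsNan only treats NaN as a hit. Only a buffer mixing +inf and -inf would average to NaN. A host-side illustration of that distinction (standalone, for clarity):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float mean_all_inf = (inf + inf) / 2.0f;   // still inf: not flagged
  const float mean_mixed = (inf + (-inf)) / 2.0f;  // inf - inf is NaN: flagged
  assert(!std::isnan(mean_all_inf));
  assert(std::isnan(mean_mixed));
  return 0;
}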