Unverified  Commit 2e1ac529  authored by houj04, committed by GitHub

[XPU] remove scale_loss in parallel.py (#53337)

* [XPU] remove scale_loss in parallel.py

* [XPU] throw Unimplemented when using Reducer
Parent eee9c788
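
Note on the change: with this commit, `Tensor.backward()` no longer applies an implicit division of the loss by `world_size` when Paddle is compiled with XPU; gradient averaging is left to the data-parallel reducer, the same as on GPU. Below is a minimal sketch of the dygraph data-parallel loop this affects; the model, optimizer, and data are placeholders, and the script is assumed to be launched with `paddle.distributed.launch` on XPU devices.

```python
import paddle
import paddle.distributed as dist

# Minimal data-parallel step sketch (placeholder model/optimizer/data).
# After this change, backward() behaves the same on XPU as on GPU:
# no hidden loss scaling; the reducer averages gradients across ranks.
dist.init_parallel_env()
model = paddle.nn.Linear(10, 1)            # placeholder model
dp_model = paddle.DataParallel(model)      # eager-mode reducer handles averaging
opt = paddle.optimizer.SGD(
    learning_rate=0.01, parameters=dp_model.parameters()
)

x = paddle.randn([8, 10])                  # placeholder data
loss = dp_model(x).mean()
loss.backward()                            # unscaled loss
opt.step()
opt.clear_grad()
```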
@@ -61,7 +61,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
     VLOG(4) << "after div 2" << *tensor;
   } else if (platform::is_xpu_place(tensor->place())) {
 #ifdef PADDLE_WITH_XPU_BKCL
-    // TODO(liuyuhui) support xpu about div nranks in the future
+    PADDLE_THROW(
+        platform::errors::Unimplemented("DivNRanks is not supported on XPU / "
+                                        "XPU_BKCL, use EagerReducer instead."));
 #endif
   }
 }
...
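
Note: `DivNRanks` is the step where the legacy `Reducer` turns the all-reduced gradient sum into an average by dividing by the number of ranks; with this patch it now fails loudly on XPU/BKCL instead of silently skipping the division, and the eager-mode `EagerReducer` is the supported path. A conceptual sketch of the operation, not Paddle internals (the helper name and the explicit `all_reduce` call are illustrative only):

```python
import paddle.distributed as dist

def average_gradient(grad, nranks):
    # Illustrative only: data-parallel training sums gradients across ranks
    # with all_reduce and then divides by nranks -- the "div nranks" step
    # that the legacy Reducer no longer performs on XPU.
    dist.all_reduce(grad)        # in-place sum over all ranks
    return grad / nranks
```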
@@ -149,7 +149,7 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void *p = nullptr;
-  platform::XPUDeviceGuard gurad(place.device);
+  platform::XPUDeviceGuard guard(place.device);
   int ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
   if (ret != XPU_SUCCESS) {
     VLOG(10) << "xpu memory malloc(" << size << ") failed, try again";
@@ -182,7 +182,7 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place,
   VLOG(10) << "Free " << size << " bytes on " << platform::Place(place);
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  platform::XPUDeviceGuard gurad(place.device);
+  platform::XPUDeviceGuard guard(place.device);
   xpu_free(p);
 #else
   PADDLE_THROW(
...
@@ -268,7 +268,7 @@ void BufferedReader::ReadAsync(size_t i) {
       xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type()));
     }
-    platform::XPUDeviceGuard gurad(place_.device);
+    platform::XPUDeviceGuard guard(place_.device);
     int r = xpu_event_record(events_[i].get(), compute_stream_);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record");
     r = xpu_stream_wait_event(stream_.get(), events_[i].get());
...
@@ -22,14 +22,14 @@ XpuStreamResourcePool::XpuStreamResourcePool() {
   pool_.reserve(dev_cnt);
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
-      platform::XPUDeviceGuard gurad(dev_idx);
+      platform::XPUDeviceGuard guard(dev_idx);
       xpuStream stream;
       xpu_stream_create(&stream);
       return stream;
     };
     auto deleter = [dev_idx](xpuStream stream) {
-      platform::XPUDeviceGuard gurad(dev_idx);
+      platform::XPUDeviceGuard guard(dev_idx);
       xpu_stream_destroy(stream);
     };
@@ -63,14 +63,14 @@ XpuEventResourcePool::XpuEventResourcePool() {
   pool_.reserve(dev_cnt);
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
-      platform::XPUDeviceGuard gurad(dev_idx);
+      platform::XPUDeviceGuard guard(dev_idx);
       xpuEventHandle event;
       xpu_event_create(&event);
       return event;
     };
     auto deleter = [dev_idx](xpuEventHandle event) {
-      platform::XPUDeviceGuard gurad(dev_idx);
+      platform::XPUDeviceGuard guard(dev_idx);
       xpu_event_destroy(event);
     };
...
@@ -33,7 +33,7 @@ void MatMul(const Context& dev_ctx,
     MatMulXPUFunction<T, int32_t>(a, b, out, trans_a, trans_b, xpu_ctx);
   } else if (fccal_type == XPUFCCalcType::FC_FLOAT) {
     MatMulXPUFunction<T, float>(a, b, out, trans_a, trans_b, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_INT_WITH_LL) {
+  } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) {
     MatMulXPUFunction<T, int_with_ll_t>(a, b, out, trans_a, trans_b, xpu_ctx);
   } else {
     MatMulXPUFunction<T, int16_t>(a, b, out, trans_a, trans_b, xpu_ctx);
...
@@ -68,7 +68,7 @@ void BmmKernel(const Context& dev_ctx,
     MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, xpu_ctx);
   } else if (fccal_type == XPUFCCalcType::FC_FLOAT) {
     MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_INT_WITH_LL) {
+  } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) {
     MatMulXPUFunction<T, int_with_ll_t>(x, y, out, trans_x, trans_y, xpu_ctx);
   } else {
     MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, xpu_ctx);
...
@@ -30,7 +30,7 @@ enum XPUFCCalcType {
   FC_INT16 = 0,
   FC_INT32,
   FC_FLOAT,
-  FC_INT_WITH_LL,
+  FC_INT32_WITH_LL,
 };
 
 template <typename T>
@@ -42,8 +42,8 @@ XPUFCCalcType FCCalcType() {
     return XPUFCCalcType::FC_INT32;
   } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
     return XPUFCCalcType::FC_FLOAT;
-  } else if (std::getenv("XPU_PADDLE_FC_INT_WITH_LL") != nullptr) {
-    return XPUFCCalcType::FC_INT_WITH_LL;
+  } else if (std::getenv("XPU_PADDLE_FC_INT32_WITH_LL") != nullptr) {
+    return XPUFCCalcType::FC_INT32_WITH_LL;
   }
   return XPUFCCalcType::FC_INT16;
 }
...
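
Note: the enum value and the environment variable that select the int32-with-long-long FC compute path are renamed together (`FC_INT_WITH_LL` → `FC_INT32_WITH_LL`, `XPU_PADDLE_FC_INT_WITH_LL` → `XPU_PADDLE_FC_INT32_WITH_LL`), so launch scripts have to switch to the new spelling. A small sketch of opting into that path from Python:

```python
import os

# Old name (no longer read by FCCalcType): XPU_PADDLE_FC_INT_WITH_LL
# New name after this patch:               XPU_PADDLE_FC_INT32_WITH_LL
# Set it before running any XPU matmul/bmm so the kernel dispatch sees it.
os.environ["XPU_PADDLE_FC_INT32_WITH_LL"] = "1"
```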
@@ -47,7 +47,7 @@ from paddle.distributed.fleet.launch_utils import check_backend
 # (TODO: GhostScreaming) It will be removed later.
 from paddle.framework import _set_expected_place
 from paddle.framework import base as imperative_base
-from paddle.framework import core, in_dygraph_mode, to_variable
+from paddle.framework import core, in_dygraph_mode
 from paddle.nn.layer import layers
 from paddle.utils import deprecated
@@ -117,21 +117,6 @@ def _split_tensors(coalesced_grads_and_grad_vars):
             assert g_var.shape == g_shape
 
 
-def scale_loss(loss):
-    # TODO(liuyuhui) Currently only for xpu. Will be removed in the future.
-    if not paddle.distributed.ParallelEnv().world_size > 1:
-        return loss
-
-    loss_scale = to_variable(
-        np.array([paddle.distributed.ParallelEnv().world_size]).astype(
-            "float32"
-        )
-    )
-    loss_scale.stop_gradient = True
-    scaled_loss = loss / loss_scale
-    return scaled_loss
-
-
 @imperative_base.no_grad
 @framework.dygraph_only
 def build_groups(vars, group_size):
...
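
Note: the deleted `scale_loss` helper divided the loss by `ParallelEnv().world_size` on multi-card runs (and was only ever applied on XPU); the now-unused `to_variable` import goes with it. A workflow that still wants that behavior can scale explicitly. A hedged sketch of such a replacement (the helper name is hypothetical, not a Paddle API):

```python
import paddle.distributed as dist

def scale_loss_manually(loss):
    # Hypothetical user-side replacement for the removed helper: divide the
    # loss by the number of ranks so per-rank gradients end up averaged.
    # Only needed if gradients are not already averaged by the reducer.
    world_size = dist.get_world_size()
    return loss / world_size if world_size > 1 else loss
```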
@@ -275,8 +275,6 @@ def monkey_patch_tensor():
                 # 4: [5000.]
         """
 
-        from paddle.distributed.parallel import scale_loss
-
         if framework._non_static_mode():
             if in_profiler_mode():
                 record_event = profiler.RecordEvent(
@@ -306,30 +304,15 @@ def monkey_patch_tensor():
             if _grad_scalar:
                 # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
                 self = _grad_scalar.scale(self)
-            if paddle.is_compiled_with_xpu():
-                # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
-                scaled_loss = scale_loss(self)
-                if framework.global_var._in_eager_mode_:
-                    core.eager.run_backward(
-                        [scaled_loss], grad_tensor, retain_graph
-                    )
-                else:
-                    core.dygraph_run_backward(
-                        [scaled_loss],
-                        [grad_tensor],
-                        retain_graph,
-                        framework._dygraph_tracer(),
-                    )
+            if framework.global_var._in_eager_mode_:
+                core.eager.run_backward([self], grad_tensor, retain_graph)
             else:
-                if framework.global_var._in_eager_mode_:
-                    core.eager.run_backward([self], grad_tensor, retain_graph)
-                else:
-                    core.dygraph_run_backward(
-                        [self],
-                        [grad_tensor],
-                        retain_graph,
-                        framework._dygraph_tracer(),
-                    )
+                core.dygraph_run_backward(
+                    [self],
+                    [grad_tensor],
+                    retain_graph,
+                    framework._dygraph_tracer(),
+                )
             if in_profiler_mode():
                 record_event.end()
         else:
...