fix gradient accumulator bug. test=kunlun (#39127)

* fix gradient accumulator bug. test=kunlun
* fix typo. test=kunlun
* fix typo. test=kunlun
* fix unit tests. test=kunlun
* using TensorCopySync. test=kunlun
* only fix for xpu place. test=kunlun

fix gradient accumulator bug. test=kunlun (#39127)
* fix gradient accumulator bug. test=kunlun
* fix typo. test=kunlun
* fix typo. test=kunlun
* fix unit tests. test=kunlun
* using TensorCopySync. test=kunlun
* only fix for xpu place. test=kunlun
b1a458ac · houj04 · GitHub · 02d3f232 · b1a458ac · b1a458ac
2 changed files
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -243,6 +243,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
                        "should be equal, Otherwise, the calculation results "
                        "will be incorrect."));
+#ifdef PADDLE_WITH_XPU
+  // if src and dst are in different place, copy dst to src's place
+  if (dst_tensor->place() != place) {
+    paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor);
+  }
+#endif
 #define PADDLE_TENSOR_ADD(cpp_type)                                  \
  if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
    TensorAddFunctor<cpp_type> func(                                 \

--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -15,6 +15,7 @@
 #include <memory>
 #include <type_traits>
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
@@ -29,8 +30,8 @@ namespace imperative {
 void TensorAdd(const framework::Variable& src, framework::Variable* dst);
-template <typename Place, typename T>
+template <typename Place1, typename Place2, typename T>
-int TensorddTest(Place place, T t1, T t2) {
+int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) {
  framework::Variable var1;
  framework::Variable var2;
  std::vector<T> src_data(10, t1);
@@ -46,18 +47,25 @@ int TensorddTest(Place place, T t1, T t2) {
  auto* dst = var2.GetMutable<framework::LoDTensor>();
  src->Resize(framework::make_ddim(dims));
  dst->Resize(framework::make_ddim(dims));
-  auto* src_mutable = src->mutable_data<T>(place);
+  auto* src_mutable = src->mutable_data<T>(place1);
-  auto* dst_mutable = dst->mutable_data<T>(place);
+  auto* dst_mutable = dst->mutable_data<T>(place2);
-  if (!std::is_same<Place, platform::CUDAPlace>::value) {
-    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+  if (!std::is_same<Place1, platform::CUDAPlace>::value) {
+    paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(),
                         sizeof(T) * src_data.size());
-    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
-                         sizeof(T) * dst_data.size());
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  } else {
-    paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
+    paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(),
                         sizeof(T) * src_data.size(), 0);
-    paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
+#endif
+  }
+  if (!std::is_same<Place2, platform::CUDAPlace>::value) {
+    paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(),
+                         sizeof(T) * dst_data.size());
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  } else {
+    paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(),
                         sizeof(T) * dst_data.size(), 0);
 #endif
  }
@@ -80,25 +88,64 @@ TEST(test_add_functor, add_functor) {
  platform::CPUPlace cpu_place;
  int cpu_res = 1;
-  cpu_res = TensorddTest(cpu_place, 1.0, 0.0);
-  EXPECT_EQ(cpu_res, 0);
+  // float32
-  cpu_res = TensorddTest(cpu_place, static_cast<double>(1.0),
+  cpu_res = TensorddTest(cpu_place, cpu_place, static_cast<float>(1.0),
-                         static_cast<double>(2.0));
+                         static_cast<float>(2.0));
  EXPECT_EQ(cpu_res, 0);
-  cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
+  // float16
+  cpu_res =
+      TensorddTest(cpu_place, cpu_place, static_cast<platform::float16>(1.0),
                   static_cast<platform::float16>(2.0));
  EXPECT_EQ(cpu_res, 0);
+#ifndef PADDLE_WITH_XPU
+  // does not support double when compiled using xpu
+  cpu_res = TensorddTest(cpu_place, cpu_place, static_cast<double>(1.0),
+                         static_cast<double>(2.0));
+  EXPECT_EQ(cpu_res, 0);
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  int gpu_res = 1;
-  gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
+  gpu_res = TensorddTest(gpu_place, gpu_place, 1.0, 0.0);
  EXPECT_EQ(gpu_res, 0);
-  gpu_res = TensorddTest(gpu_place, static_cast<double>(1.0),
+  gpu_res = TensorddTest(gpu_place, gpu_place, static_cast<double>(1.0),
                         static_cast<double>(2.0));
  EXPECT_EQ(gpu_res, 0);
-  gpu_res = TensorddTest(gpu_place, static_cast<platform::float16>(1.0),
+  gpu_res =
+      TensorddTest(gpu_place, gpu_place, static_cast<platform::float16>(1.0),
                   static_cast<platform::float16>(2.0));
  EXPECT_EQ(gpu_res, 0);
 #endif
+#ifdef PADDLE_WITH_XPU
+  platform::XPUPlace xpu_place(0);
+  int xpu_res = 1;
+  // normal
+  xpu_res = TensorddTest(xpu_place, xpu_place, static_cast<float>(1.0),
+                         static_cast<float>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  xpu_res =
+      TensorddTest(xpu_place, xpu_place, static_cast<platform::float16>(1.0),
+                   static_cast<platform::float16>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  // different places
+  xpu_res = TensorddTest(cpu_place, xpu_place, static_cast<float>(1.0),
+                         static_cast<float>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  xpu_res = TensorddTest(xpu_place, cpu_place, static_cast<float>(1.0),
+                         static_cast<float>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  xpu_res =
+      TensorddTest(cpu_place, xpu_place, static_cast<platform::float16>(1.0),
+                   static_cast<platform::float16>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  xpu_res =
+      TensorddTest(xpu_place, cpu_place, static_cast<platform::float16>(1.0),
+                   static_cast<platform::float16>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+#endif
 }
 TEST(test_add_functor, execption) {
@@ -106,10 +153,11 @@ TEST(test_add_functor, execption) {
  platform::CUDAPlace cuda_place(0);
  platform::CPUPlace cpu_place;
-  ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
+  ASSERT_ANY_THROW(TensorddTest(cpu_place, cpu_place, 1, 0));
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
+  ASSERT_ANY_THROW(
-  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
+      TensorddTest(cuda_pinned_place, cuda_pinned_place, 1.0, 0.0));
+  ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, cuda_pinned_place,
                                static_cast<platform::float16>(1.0),
                                static_cast<platform::float16>(2.0)));
 #endif