Fix reduce_sum bug when src_dtype != dst_dtype and reduce_num == 1 (#36123)

d5268a6e · Guoxia Wang · GitHub · ad128144 · d5268a6e
隐藏空白更改
内联并排

Showing 1 changed file with 11 additions and 2 deletions

paddle/fluid/operators/reduce_ops/reduce_op.cu.h +11 -2

未找到文件。
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -34,6 +34,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/fast_divmod.h"
@@ -705,8 +706,16 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
  if (config.reduce_num == 1) {
    auto out_dims = y->dims();
-    framework::TensorCopy(x, y->place(), y);
+    if (x.type() == y->type()) {
-    y->Resize(out_dims);
+      framework::TensorCopy(x, y->place(), y);
+      y->Resize(out_dims);
+    } else {
+      auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(x.place()));
+      framework::VisitDataType(
+          static_cast<framework::proto::VarType::Type>(y->type()),
+          CastOpFunctor<platform::CUDADeviceContext, Tx>(&x, y, *dev_ctx));
+    }
    return;
  }