Unverified · Commit ff559938, authored by Leo Chen, committed by GitHub

fix dtype error of compare op, test=develop (#25059) (#25234)

Parent: 07a65aa3
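Compare ops such as not_equal take float or integer inputs but produce bool outputs. Before this change, the common-broadcast CUDA path hard-coded the output pointer to the input element type T, so a broadcast comparison on GPU wrote its result with the wrong dtype. The diff below threads an extra OutType template parameter, defaulting to T so arithmetic ops are unaffected, from CommonElementwiseBroadcastForward down into the CUDA kernel. The standalone C++ sketch that follows only illustrates the same templating pattern; it is a simplified CPU-side example with made-up names (NotEqualFunctor, ElementwiseApply), not the Paddle code.

// Simplified sketch of the "separate output type" pattern used by this fix.
// NotEqualFunctor and ElementwiseApply are illustrative names, not Paddle APIs.
#include <iostream>
#include <vector>

template <typename T, typename OutType = T>
struct NotEqualFunctor {
  OutType operator()(const T& a, const T& b) const { return a != b; }
};

// OutType defaults to T, so existing arithmetic callers need no change;
// a compare functor instantiates it with OutType = bool.
template <typename Functor, typename T, typename OutType = T>
void ElementwiseApply(const std::vector<T>& x, const std::vector<T>& y,
                      std::vector<OutType>* out, Functor func) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = func(x[i], y[i]);
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y = {1.f, 0.f, 3.f};
  std::vector<bool> out;
  ElementwiseApply<NotEqualFunctor<float, bool>, float, bool>(
      x, y, &out, NotEqualFunctor<float, bool>());
  for (bool v : out) std::cout << v << " ";  // prints: 0 1 0
  std::cout << "\n";
  return 0;
}
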
@@ -15,10 +15,12 @@ limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <algorithm>
#include <functional> // for multiplies
#include <iterator>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
@@ -30,6 +32,7 @@ limitations under the License. */
#ifdef __NVCC__
#include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
@@ -194,11 +197,11 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
}
#ifdef __NVCC__
template <typename Functor, typename T>
template <typename Functor, typename T, typename OutType = T>
__global__ void CommonForwardBroadcastCUDAKernel(
const int *x_strides_array, const int *y_strides_array,
const int *out_dims_array, const T *x, const T *y, T *out, int out_size,
int max_dim, Functor func, const bool is_xsize_larger) {
const int *out_dims_array, const T *x, const T *y, OutType *out,
int out_size, int max_dim, Functor func, const bool is_xsize_larger) {
for (int out_index = blockIdx.x * blockDim.x + threadIdx.x;
out_index < out_size; out_index += blockDim.x * gridDim.x) {
int x_index = 0;
@@ -220,7 +223,7 @@ __global__ void CommonForwardBroadcastCUDAKernel(
}
}
template <typename Functor, typename T>
template <typename Functor, typename T, typename OutType = T>
void CommonForwardBroadcastCUDA(
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z, int *x_dims_array, int *y_dims_array,
@@ -230,7 +233,7 @@ void CommonForwardBroadcastCUDA(
auto cplace = platform::CPUPlace();
const T *x_data = x->data<T>();
const T *y_data = y->data<T>();
T *out_data = z->mutable_data<T>(ctx.GetPlace());
OutType *out_data = z->mutable_data<OutType>(ctx.GetPlace());
std::vector<int> x_strides_array(max_dim);
std::vector<int> y_strides_array(max_dim);
@@ -268,7 +271,7 @@ void CommonForwardBroadcastCUDA(
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
CommonForwardBroadcastCUDAKernel<
Functor, T><<<gird_size, block_size, 0, ctx.stream()>>>(
Functor, T, OutType><<<gird_size, block_size, 0, ctx.stream()>>>(
x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, x_data,
y_data, out_data, out_size, max_dim, func, is_xsize_larger);
}
@@ -1796,7 +1799,7 @@ void CommonElementwiseBroadcastForward(
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__
CommonForwardBroadcastCUDA<Functor, T>(
CommonForwardBroadcastCUDA<Functor, T, OutType>(
x, y, z, x_dims_array.data(), y_dims_array.data(),
out_dims_array.data(), max_dim,
ctx.template device_context<platform::CUDADeviceContext>(), func,
......
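With OutType plumbed through the broadcast path, an op whose output element type differs from its input type only needs to instantiate the elementwise helper with that output type. The sketch below shows what such a call site could look like for a compare kernel; the helper and functor names (ElementwiseComputeEx, GreaterThanFunctor) are assumptions about the surrounding Paddle code and are not part of this diff.

// Hypothetical compare-op call site: OutType is bool while T stays the input type.
ElementwiseComputeEx<GreaterThanFunctor<T>, DeviceContext, T, bool>(
    context, x, y, axis, GreaterThanFunctor<T>(), z);
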
@@ -237,6 +237,16 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
str1 = "just test"
self.assertTrue(str1[var1] == 's')
def test_conpare_op_broadcast(self):
a_np = np.random.uniform(-1, 1, [10, 1, 10]).astype(self.dtype)
b_np = np.random.uniform(-1, 1, [1, 1, 10]).astype(self.dtype)
with fluid.dygraph.guard():
a = fluid.dygraph.to_variable(a_np)
b = fluid.dygraph.to_variable(b_np)
self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL)
self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np))
if __name__ == '__main__':
unittest.main()