diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index b3d4d9ab8f1e8823a15214dedb0579ad49472a9e..364fe773c712389e79d9c3280cf68535c18ffc9c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <functional>  // for multiplies
 #include <iterator>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -30,6 +32,7 @@ limitations under the License. */
 #ifdef __NVCC__
 #include <cuda.h>
 #include <cuda_fp16.h>
+
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
@@ -194,11 +197,11 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
 }
 
 #ifdef __NVCC__
-template <typename Functor, typename T>
+template <typename Functor, typename T, typename OutType = T>
 __global__ void CommonForwardBroadcastCUDAKernel(
     const int *x_strides_array, const int *y_strides_array,
-    const int *out_dims_array, const T *x, const T *y, T *out, int out_size,
-    int max_dim, Functor func, const bool is_xsize_larger) {
+    const int *out_dims_array, const T *x, const T *y, OutType *out,
+    int out_size, int max_dim, Functor func, const bool is_xsize_larger) {
   for (int out_index = blockIdx.x * blockDim.x + threadIdx.x;
        out_index < out_size; out_index += blockDim.x * gridDim.x) {
     int x_index = 0;
@@ -220,7 +223,7 @@ __global__ void CommonForwardBroadcastCUDAKernel(
   }
 }
 
-template <typename Functor, typename T>
+template <typename Functor, typename T, typename OutType = T>
 void CommonForwardBroadcastCUDA(
     const framework::Tensor *x, const framework::Tensor *y,
     framework::Tensor *z, int *x_dims_array, int *y_dims_array,
@@ -230,7 +233,7 @@ void CommonForwardBroadcastCUDA(
   auto cplace = platform::CPUPlace();
   const T *x_data = x->data<T>();
   const T *y_data = y->data<T>();
-  T *out_data = z->mutable_data<T>(ctx.GetPlace());
+  OutType *out_data = z->mutable_data<OutType>(ctx.GetPlace());
 
   std::vector<int> x_strides_array(max_dim);
   std::vector<int> y_strides_array(max_dim);
@@ -268,7 +271,7 @@ void CommonForwardBroadcastCUDA(
   dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
 
   CommonForwardBroadcastCUDAKernel<
-      Functor, T><<<gird_size, block_size, 0, ctx.stream()>>>(
+      Functor, T, OutType><<<gird_size, block_size, 0, ctx.stream()>>>(
       x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, x_data,
       y_data, out_data, out_size, max_dim, func, is_xsize_larger);
 }
@@ -1796,7 +1799,7 @@ void CommonElementwiseBroadcastForward(
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef __NVCC__
-    CommonForwardBroadcastCUDA<Functor, T>(
+    CommonForwardBroadcastCUDA<Functor, T, OutType>(
         x, y, z, x_dims_array.data(), y_dims_array.data(),
         out_dims_array.data(), max_dim,
         ctx.template device_context<platform::CUDADeviceContext>(), func,
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 14c3fb7c8bf06908bd8eabffbe46887a6546f6d2..d9327c9d710ace9ff5273b61deabecfa47e67b5b 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -273,6 +273,16 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertTrue(np.array_equal(res1.numpy(), res2.numpy()))
         self.assertTrue(np.array_equal(res1.numpy(), res3.numpy()))
 
+    def test_compare_op_broadcast(self):
+        a_np = np.random.uniform(-1, 1, [10, 1, 10]).astype(self.dtype)
+        b_np = np.random.uniform(-1, 1, [1, 1, 10]).astype(self.dtype)
+        with fluid.dygraph.guard():
+            a = fluid.dygraph.to_variable(a_np)
+            b = fluid.dygraph.to_variable(b_np)
+
+            self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL)
+            self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np))
+
 
 if __name__ == '__main__':
     unittest.main()
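
Illustrative usage (not part of the patch): a minimal dygraph sketch of the behavior the new test exercises, assuming the fluid 1.x API (fluid.dygraph.guard, fluid.dygraph.to_variable). With the OutType change wired through the broadcast path, a comparison between broadcastable VarBase inputs yields a BOOL tensor of the broadcast shape:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        a = fluid.dygraph.to_variable(
            np.random.rand(10, 1, 10).astype('float32'))
        b = fluid.dygraph.to_variable(
            np.random.rand(1, 1, 10).astype('float32'))
        mask = a != b  # elementwise compare with NumPy-style broadcasting
        print(mask.dtype)          # VarDesc.VarType.BOOL
        print(mask.numpy().shape)  # (10, 1, 10)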