diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
index 6c2c685601b9412acf76e7603376ef7623b877ba..798794025b12f3e12ebd5641d9803656b68a812f 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
@@ -202,7 +202,8 @@ void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T
 template <>
 void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
                   cudaStream_t stream) {
-  if (nums % 2 == 0) {
+  // For half2, `>` returns true iff both half results are true, so fall back to half for these ops.
+  if (nums % 2 == 0 && op != BROADCAST_TYPE_MINIMUM && op != BROADCAST_TYPE_MAXIMUM && op != BROADCAST_TYPE_ABSGRAD) {
     ElewiseArithKernel<half2>(nums / 2, op, reinterpret_cast<const half2 *>(x0), reinterpret_cast<const half2 *>(x1),
                               reinterpret_cast<half2 *>(y), stream);
   } else {
diff --git a/tests/st/ops/gpu/test_broadcast_op.py b/tests/st/ops/gpu/test_broadcast_op.py
index 53b3fd14c964d35f73fbacaf9f7dc6906c3348b3..4681b608b50f0ceb42b478cbf15b4978b09eeda5 100644
--- a/tests/st/ops/gpu/test_broadcast_op.py
+++ b/tests/st/ops/gpu/test_broadcast_op.py
@@ -68,6 +68,48 @@ def test_nobroadcast():
     assert np.allclose(output_ms.asnumpy(), output_np)
 
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_nobroadcast_fp16():
+    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
+
+    x1_np = np.random.rand(10, 20).astype(np.float16)
+    x2_np = np.random.rand(10, 20).astype(np.float16)
+
+    output_ms = P.Minimum()(Tensor(x1_np), Tensor(x2_np))
+    output_np = np.minimum(x1_np, x2_np)
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Maximum()(Tensor(x1_np), Tensor(x2_np))
+    output_np = np.maximum(x1_np, x2_np)
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Greater()(Tensor(x1_np), Tensor(x2_np))
+    output_np = x1_np > x2_np
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Less()(Tensor(x1_np), Tensor(x2_np))
+    output_np = x1_np < x2_np
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
+    output_np = np.power(x1_np, x2_np)
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
+    output_np = x1_np / x2_np
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
+    output_np = x1_np * x2_np
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
+    output_np = x1_np - x2_np
+    assert np.allclose(output_ms.asnumpy(), output_np)
+
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
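Background on why the half2 fast path is excluded for Minimum/Maximum/AbsGrad: CUDA's `__hbgt2` intrinsic (which half2 `>` comparisons map to) collapses the two lane-wise comparisons into a single bool that is true only when *both* lanes compare true, so a ternary written against whole half2 values picks one entire half2 and corrupts the lane where the comparison disagrees. Below is a minimal standalone sketch demonstrating this; it is not MindSpore code, the kernel name `half2_max_demo` is made up for illustration, and it assumes a GPU with compute capability >= 5.3 (required for half comparison intrinsics, e.g. `nvcc -arch=sm_53`).

```cuda
// Sketch: why "vectorized" half2 max via a single comparison is wrong.
#include <cuda_fp16.h>
#include <cstdio>

__global__ void half2_max_demo(const __half2 *x0, const __half2 *x1, __half2 *wrong, __half2 *right) {
  __half2 a = x0[0];
  __half2 b = x1[0];
  // Broken path: __hbgt2 yields ONE bool, true iff BOTH lanes of a > b,
  // so the ternary copies a whole half2 and one lane can get the wrong value.
  *wrong = __hbgt2(a, b) ? a : b;
  // Correct path: compare each half lane separately, as the scalar
  // half fallback in the patched kernel effectively does.
  __half lo = __hgt(__low2half(a), __low2half(b)) ? __low2half(a) : __low2half(b);
  __half hi = __hgt(__high2half(a), __high2half(b)) ? __high2half(a) : __high2half(b);
  *right = __halves2half2(lo, hi);
}

int main() {
  __half2 *x0, *x1, *wrong, *right;
  cudaMallocManaged(&x0, sizeof(__half2));
  cudaMallocManaged(&x1, sizeof(__half2));
  cudaMallocManaged(&wrong, sizeof(__half2));
  cudaMallocManaged(&right, sizeof(__half2));
  // Lanes disagree: low lane 1.0 > 0.5, but high lane 0.0 < 0.5.
  *x0 = __floats2half2_rn(1.0f, 0.0f);
  *x1 = __floats2half2_rn(0.5f, 0.5f);
  half2_max_demo<<<1, 1>>>(x0, x1, wrong, right);
  cudaDeviceSynchronize();
  // Expected: broken max = (0.5, 0.5), lane-wise max = (1.0, 0.5).
  printf("broken max:    (%f, %f)\n", __low2float(*wrong), __high2float(*wrong));
  printf("lane-wise max: (%f, %f)\n", __low2float(*right), __high2float(*right));
  cudaFree(x0); cudaFree(x1); cudaFree(wrong); cudaFree(right);
  return 0;
}
```

Falling back to the scalar half kernel for these ops, as the patch does, trades the half2 throughput win for correctness; an alternative would be a per-lane select inside the half2 kernel, but the simple guard keeps the fix local to the dispatch condition.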