Commit a3d0ddb4 authored by mindspore-ci-bot, committed by Gitee

!5779 tensoradd profiling

Merge pull request !5779 from chenweifeng/broadcast-refactor
@@ -15,110 +15,216 @@
 */
#include <vector>
#include <iostream>
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
// Basic function
template <typename T>
struct GreaterFunc {
  __device__ __host__ __forceinline__ bool operator()(const T &lhs, const T &rhs) { return lhs > rhs ? true : false; }
};

template <typename T>
struct LessFunc {
  __device__ __host__ __forceinline__ bool operator()(const T &lhs, const T &rhs) { return lhs < rhs ? true : false; }
};

template <typename T>
struct MinimumFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs < rhs ? lhs : rhs; }
};

template <typename T>
struct MaximumFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
};

template <typename T>
struct PowerFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
};

template <>
struct PowerFunc<half> {
  __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
    return __float2half(pow(__half2float(lhs), __half2float(rhs)));
  }
};

template <>
struct PowerFunc<half2> {
  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    float2 base = __half22float2(lhs);
    float2 index = __half22float2(rhs);
    base.x = pow(base.x, index.x);
    base.y = pow(base.y, index.y);
    return __float22half2_rn(base);
  }
};

template <typename T>
struct RealDivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T>
struct DivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T>
struct MulFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
};

template <typename T>
struct SubFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs - rhs); }
};

template <typename T>
struct AddFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs + rhs); }
};

// convert to float to fix accuracy issue
template <typename T>
struct FloorDivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
    return floorf(static_cast<float>(lhs) / static_cast<float>(rhs));
  }
};

template <>
struct FloorDivFunc<half> {
  __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
    return floorf(__half2float(lhs) / __half2float(rhs));
  }
};

template <>
struct FloorDivFunc<half2> {
  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    float2 l = __half22float2(lhs);
    float2 r = __half22float2(rhs);
    l.x = floorf(l.x / r.x);
    l.y = floorf(l.y / r.y);
    return __float22half2_rn(l);
  }
};

template <typename T>
struct AbsGradFunc {
  __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
    T zero = 0.0;
    return lhs < zero ? -rhs : rhs;
  }
};

template <>
struct AbsGradFunc<half2> {
  __device__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    half2 zero(0.0, 0.0);
    return lhs < zero ? -rhs : rhs;
  }
};
// Element-wise Comparison
template <typename T, typename Func>
__global__ void ElewiseCmpKernel(const int nums, const T *x0, const T *x1, bool *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
y[pos] = Func()(x0[pos], x1[pos]);
}
}
template <typename T>
void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream) {
switch (op) {
case BROADCAST_TYPE_GREATER:
return ElewiseCmpKernel<T, GreaterFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_LESS:
return ElewiseCmpKernel<T, LessFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
default:
break;
}
}
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y,
cudaStream_t stream);
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y,
cudaStream_t stream);
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, bool *y,
cudaStream_t stream);
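
For reference, a minimal host-side sketch of driving ElewiseCmp directly (a hypothetical caller, not part of this patch; assumes broadcast_impl.cuh is included and error checking is elided). Each launch rounds the element count up to whole 256-thread blocks via (nums + 255) / 256, and the grid-stride loop inside the kernel covers any remainder:

#include <cuda_runtime.h>
#include <vector>

void GreaterExample(cudaStream_t stream) {
  const int nums = 1 << 20;
  std::vector<float> h_x0(nums, 1.0f), h_x1(nums, 2.0f);
  float *d_x0 = nullptr, *d_x1 = nullptr;
  bool *d_y = nullptr;
  cudaMalloc(&d_x0, nums * sizeof(float));
  cudaMalloc(&d_x1, nums * sizeof(float));
  cudaMalloc(&d_y, nums * sizeof(bool));
  cudaMemcpyAsync(d_x0, h_x0.data(), nums * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_x1, h_x1.data(), nums * sizeof(float), cudaMemcpyHostToDevice, stream);
  // y[i] = (x0[i] > x1[i]) for every element, via the float instantiation above.
  ElewiseCmp(nums, BROADCAST_TYPE_GREATER, d_x0, d_x1, d_y, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_x0);
  cudaFree(d_x1);
  cudaFree(d_y);
}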
// Element-wise Arithmetic
template <typename T, typename Func>
__global__ void ElewiseArithKernel(const int nums, const T *x0, const T *x1, T *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
y[pos] = Func()(x0[pos], x1[pos]);
}
}
template <typename T>
void ElewiseArithKernel(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
switch (op) {
case BROADCAST_TYPE_MINIMUM:
return ElewiseArithKernel<T, MinimumFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_MAXIMUM:
return ElewiseArithKernel<T, MaximumFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_POWER:
return ElewiseArithKernel<T, PowerFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_REALDIV:
return ElewiseArithKernel<T, RealDivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_MUL:
return ElewiseArithKernel<T, MulFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_SUB:
return ElewiseArithKernel<T, SubFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_ADD:
return ElewiseArithKernel<T, AddFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_FLOORDIV:
return ElewiseArithKernel<T, FloorDivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_ABSGRAD:
return ElewiseArithKernel<T, AbsGradFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_DIV:
return ElewiseArithKernel<T, DivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
default:
break;
}
}
template <typename T>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
return ElewiseArithKernel(nums, op, x0, x1, y, stream);
}
template <>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
cudaStream_t stream) {
if (nums % 2 == 0) {
ElewiseArithKernel<half2>(nums / 2, op, reinterpret_cast<const half2 *>(x0), reinterpret_cast<const half2 *>(x1),
reinterpret_cast<half2 *>(y), stream);
} else {
return ElewiseArithKernel(nums, op, x0, x1, y, stream);
}
}
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, int *y,
cudaStream_t stream);
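
The half specialization above is the vectorization trick of this refactor: when the element count is even, the half buffers are reinterpreted as half2 so each thread loads, computes, and stores two fp16 values at once. An isolated sketch of the same idea (illustrative names HalfAdd/HalfAddKernel; assumes cuda_fp16.h and a GPU with native fp16 arithmetic, e.g. sm_53 or newer):

#include <cuda_fp16.h>

__global__ void HalfAddKernel(const int n2, const half2 *a, const half2 *b, half2 *c) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n2; i += blockDim.x * gridDim.x) {
    c[i] = __hadd2(a[i], b[i]);  // one instruction adds both packed halves
  }
}

void HalfAdd(const int nums, const half *a, const half *b, half *c, cudaStream_t stream) {
  if (nums % 2 == 0) {
    // Reinterpret the contiguous half buffers as half2 and launch half as many threads.
    HalfAddKernel<<<(nums / 2 + 255) / 256, 256, 0, stream>>>(
      nums / 2, reinterpret_cast<const half2 *>(a), reinterpret_cast<const half2 *>(b),
      reinterpret_cast<half2 *>(c));
  }
  // An odd count would fall back to the scalar half path, as ElewiseArith does above.
}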
// Broadcast Comparison
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
template <typename T, typename Func>
__global__ void BroadcastCmpKernel(const int l0, const int l1, const int l2, const int l3, const int l4, const int l5,
                                   const int l6, const int r0, const int r1, const int r2, const int r3, const int r4,
                                   const int r5, const int r6, const int d0, const int d1, const int d2, const int d3,
                                   const int d4, const int d5, const int d6, const T *x0, const T *x1, bool *y) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6;
       pos += blockDim.x * gridDim.x) {
    int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
@@ -143,115 +249,152 @@
    r_index += Index(m, r4) * r5 * r6;
    r_index += Index(n, r5) * r6;
    r_index += Index(o, r6);
    y[pos] = Func()(x0[l_index], x1[r_index]);
  }
}
template <typename T>
void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                  enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream) {
  int size = 1;
  for (auto d : y_dims) {
    size *= d;
  }
  switch (op) {
    case BROADCAST_TYPE_GREATER:
      return BroadcastCmpKernel<T, GreaterFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_LESS:
      return BroadcastCmpKernel<T, LessFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    default:
      break;
  }
}

template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0, const float *x1,
                           bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
                           bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
                           bool *y, cudaStream_t stream);
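
Shape vectors passed to BroadcastCmp are expected to be right-aligned and padded with leading 1s to exactly seven entries (MAX_DIMS in the kernel header is 7). A hypothetical direct call, mirroring the fp16 test shapes at the end of this change (device allocation elided):

#include <vector>

// d_x0 holds 3*5 = 15 floats, d_x1 holds 4*6 = 24 floats, and d_y holds
// 3*4*5*6 = 360 bools, all in device memory.
void LessExample(const float *d_x0, const float *d_x1, bool *d_y, cudaStream_t stream) {
  std::vector<int> x0_dims{1, 1, 1, 3, 1, 5, 1};  // logical shape (3, 1, 5, 1)
  std::vector<int> x1_dims{1, 1, 1, 1, 4, 1, 6};  // logical shape (1, 4, 1, 6)
  std::vector<int> y_dims{1, 1, 1, 3, 4, 5, 6};   // broadcast result (3, 4, 5, 6)
  BroadcastCmp(x0_dims, x1_dims, y_dims, BROADCAST_TYPE_LESS, d_x0, d_x1, d_y, stream);
}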
// Broadcast Arithmetic
template <typename T, typename Func>
__global__ void BroadcastArithKernel(const int l0, const int l1, const int l2, const int l3, const int l4,
                                     const int l5, const int l6, const int r0, const int r1, const int r2,
                                     const int r3, const int r4, const int r5, const int r6, const int d0,
                                     const int d1, const int d2, const int d3, const int d4, const int d5,
                                     const int d6, const T *x0, const T *x1, T *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6;
pos += blockDim.x * gridDim.x) {
int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
int j = pos / (d2 * d3 * d4 * d5 * d6) % d1;
int k = pos / (d3 * d4 * d5 * d6) % d2;
int l = pos / (d4 * d5 * d6) % d3;
int m = pos / (d5 * d6) % d4;
int n = pos / d6 % d5;
int o = pos % d6;
int l_index = Index(i, l0) * l1 * l2 * l3 * l4 * l5 * l6;
l_index += Index(j, l1) * l2 * l3 * l4 * l5 * l6;
l_index += Index(k, l2) * l3 * l4 * l5 * l6;
l_index += Index(l, l3) * l4 * l5 * l6;
l_index += Index(m, l4) * l5 * l6;
l_index += Index(n, l5) * l6;
l_index += Index(o, l6);
int r_index = Index(i, r0) * r1 * r2 * r3 * r4 * r5 * r6;
r_index += Index(j, r1) * r2 * r3 * r4 * r5 * r6;
r_index += Index(k, r2) * r3 * r4 * r5 * r6;
r_index += Index(l, r3) * r4 * r5 * r6;
r_index += Index(m, r4) * r5 * r6;
r_index += Index(n, r5) * r6;
r_index += Index(o, r6);
y[pos] = Func()(x0[l_index], x1[r_index]);
  }
}
template <typename T>
void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                    enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
int size = 1;
for (auto d : y_dims) {
size *= d;
}
  switch (op) {
    case BROADCAST_TYPE_MAXIMUM:
      return BroadcastArithKernel<T, MaximumFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_MINIMUM:
      return BroadcastArithKernel<T, MinimumFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_POWER:
      return BroadcastArithKernel<T, PowerFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_REALDIV:
      return BroadcastArithKernel<T, RealDivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_MUL:
      return BroadcastArithKernel<T, MulFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_SUB:
      return BroadcastArithKernel<T, SubFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_ADD:
      return BroadcastArithKernel<T, AddFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_FLOORDIV:
      return BroadcastArithKernel<T, FloorDivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_ABSGRAD:
      return BroadcastArithKernel<T, AbsGradFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_DIV:
      return BroadcastArithKernel<T, DivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    default:
      break;
  }
}
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0,
                             const float *x1, float *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
                             half *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
                             int *y, cudaStream_t stream);
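
The Index(i, dim) helper is the whole broadcasting rule: a size-1 axis contributes offset 0, so both operands can be addressed from a single output coordinate. A host-side sketch of the same 7-D offset computation, written in Horner form but equivalent to the l_index/r_index arithmetic in the kernels above (hypothetical names, for illustration only):

#include <array>
#include <cstdio>

inline int Idx(const int index, const int dim) { return dim == 1 ? 0 : index; }

// Flatten an output coordinate into an operand offset, clamping broadcast axes to 0.
int FlatIndex(const std::array<int, 7> &shape, const std::array<int, 7> &coord) {
  int idx = 0;
  for (int a = 0; a < 7; ++a) {
    idx = idx * shape[a] + Idx(coord[a], shape[a]);
  }
  return idx;
}

int main() {
  const std::array<int, 7> x0{1, 1, 1, 3, 1, 5, 1};      // padded (3, 1, 5, 1)
  const std::array<int, 7> x1{1, 1, 1, 1, 4, 1, 6};      // padded (1, 4, 1, 6)
  const std::array<int, 7> coord{0, 0, 0, 2, 3, 4, 5};   // a point in the (3, 4, 5, 6) output
  std::printf("x0 offset %d, x1 offset %d\n", FlatIndex(x0, coord), FlatIndex(x1, coord));
  return 0;
}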
// BroadcastTo
template <typename T>
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
                                  const int o2, const int o3, const T *input_addr, T *output_addr) {
@@ -274,36 +417,6 @@
    output_addr);
}

template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                          const int &o2, const int &o3, const float *input_addr, float *output_addr,
                          cudaStream_t stream);
...
@@ -36,17 +36,21 @@ enum BroadcastOpType {
  BROADCAST_TYPE_INVALID = 0xffffffff,
};
template <typename T>
void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

template <typename T>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);

template <typename T>
void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                  enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

template <typename T>
void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                    enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                 const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
...
@@ -58,8 +58,8 @@ class AddNGpuFwdKernel : public GpuKernel {
    for (size_t i = 0; i < IntToSize(num_input_); i++) {
      T *input_addr = GetDeviceAddress<T>(inputs, i);
      if (cudnn_data_type_ == CUDNN_DATA_INT32) {
        ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, output_addr, output_addr,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        CHECK_CUDNN_RET_WITH_EXCEPT(cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
                                                   &(i > 0 ? alpha : beta), input_descriptor_, output_addr),
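
With this change the INT32 path of AddN folds each input into the output through the element-wise add kernel, since cudnnAddTensor has no INT32 support. A schematic of that accumulation (a sketch with illustrative names; the cudaMemsetAsync stands in for however the first addend is initialized):

#include <cuda_runtime.h>
#include <vector>

// Accumulate n device buffers into output: output = x0 + x1 + ... element-wise.
void AddNInt32Sketch(const std::vector<int *> &inputs, int *output, const int nums, cudaStream_t stream) {
  cudaMemsetAsync(output, 0, nums * sizeof(int), stream);  // start from zero
  for (int *input : inputs) {
    // In-place fold: output = input + output, exactly as the kernel above does.
    ElewiseArith(nums, BROADCAST_TYPE_ADD, input, output, output, stream);
  }
}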
...
@@ -19,119 +19,119 @@
namespace mindspore {
namespace kernel {
// fp32
MS_REG_GPU_KERNEL_ONE(
  Greater,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Maximum,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Minimum,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
// fp16
MS_REG_GPU_KERNEL_ONE(
  Greater,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Maximum,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Minimum,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
// int32
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Minimum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Maximum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
} // namespace kernel
} // namespace mindspore
...
@@ -28,11 +28,16 @@
namespace mindspore {
namespace kernel {
constexpr int MAX_DIMS = 7;

template <typename T>
class BroadcastOpGpuKernel : public GpuKernel {
 public:
  BroadcastOpGpuKernel()
      : op_type_(BROADCAST_TYPE_INVALID),
        need_broadcast_(false),
        is_comp_op_(false),
        input1_num_(1),
        input2_num_(1),
        output_num_(1) {}
  ~BroadcastOpGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -43,13 +48,23 @@ class BroadcastOpGpuKernel : public GpuKernel {
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *lhs = GetDeviceAddress<T>(inputs, 0);
    T *rhs = GetDeviceAddress<T>(inputs, 1);

    if (is_comp_op_) {
      bool *output = GetDeviceAddress<bool>(outputs, 0);
      if (need_broadcast_) {
        BroadcastCmp(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ElewiseCmp(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    } else {
      T *output = GetDeviceAddress<T>(outputs, 0);
      if (need_broadcast_) {
        BroadcastArith(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output,
                       reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ElewiseArith(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    }
    return true;
@@ -91,26 +106,42 @@ class BroadcastOpGpuKernel : public GpuKernel {
  void InitSizeLists() override {
    input_size_list_.push_back(input1_num_ * sizeof(T));
    input_size_list_.push_back(input2_num_ * sizeof(T));

    auto unit_size = is_comp_op_ ? sizeof(bool) : sizeof(T);
    output_size_list_.push_back(output_num_ * unit_size);
  }

 private:
  void GetOpType(const CNodePtr &kernel_node) {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);

    static std::map<std::string, BroadcastOpType> kBroadcastCmpTypeMap = {
      {"Greater", BROADCAST_TYPE_GREATER},
      {"Less", BROADCAST_TYPE_LESS},
    };

    auto iter = kBroadcastCmpTypeMap.find(kernel_name);
    if (iter != kBroadcastCmpTypeMap.end()) {
      op_type_ = iter->second;
      is_comp_op_ = true;
      return;
    }

    static std::map<std::string, BroadcastOpType> kBroadcastArithmetricTypeMap = {
      {"Maximum", BROADCAST_TYPE_MAXIMUM},  {"Minimum", BROADCAST_TYPE_MINIMUM},   {"Pow", BROADCAST_TYPE_POWER},
      {"RealDiv", BROADCAST_TYPE_REALDIV},  {"Mul", BROADCAST_TYPE_MUL},           {"Sub", BROADCAST_TYPE_SUB},
      {"TensorAdd", BROADCAST_TYPE_ADD},    {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
      {"Div", BROADCAST_TYPE_DIV},
    };

    iter = kBroadcastArithmetricTypeMap.find(kernel_name);
    if (iter != kBroadcastArithmetricTypeMap.end()) {
      op_type_ = iter->second;
      is_comp_op_ = false;
      return;
    }

    MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
  }
  bool IsBroadcast(const std::vector<size_t> &lhs, const std::vector<size_t> &rhs) {
@@ -127,6 +158,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
  BroadcastOpType op_type_;
  bool need_broadcast_;
  bool is_comp_op_;
  int input1_num_;
  int input2_num_;
  int output_num_;
@@ -137,7 +169,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
...
@@ -160,3 +160,45 @@ def test_broadcast_diff_dims():
    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_broadcast_fp16():
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

    x1_np = np.random.rand(3, 1, 5, 1).astype(np.float16)
    x2_np = np.random.rand(1, 4, 1, 6).astype(np.float16)

    output_ms = P.Minimum()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.minimum(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Maximum()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.maximum(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Greater()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np > x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Less()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np < x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.power(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np / x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np * x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)