Commit a3d0ddb4 authored by mindspore-ci-bot, committed by Gitee

!5779 tensoradd profiling

Merge pull request !5779 from chenweifeng/broadcast-refactor
@@ -15,110 +15,216 @@
 */
#include <vector>
#include <iostream>
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
// Basic function
template <typename T>
struct GreaterFunc {
  __device__ __host__ __forceinline__ bool operator()(const T &lhs, const T &rhs) { return lhs > rhs ? true : false; }
};

template <typename T>
struct LessFunc {
  __device__ __host__ __forceinline__ bool operator()(const T &lhs, const T &rhs) { return lhs < rhs ? true : false; }
};

template <typename T>
struct MinimumFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs < rhs ? lhs : rhs; }
};

template <typename T>
struct MaximumFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
};

template <typename T>
struct PowerFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
};

template <>
struct PowerFunc<half> {
  __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
    return __float2half(pow(__half2float(lhs), __half2float(rhs)));
  }
};

template <>
struct PowerFunc<half2> {
  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    float2 base = __half22float2(lhs);
    float2 index = __half22float2(rhs);
    base.x = pow(base.x, index.x);
    base.y = pow(base.y, index.y);
    return __float22half2_rn(base);
  }
};

template <typename T>
struct RealDivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T>
struct DivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs / rhs); }
};

template <typename T>
struct MulFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs * rhs); }
};

template <typename T>
struct SubFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs - rhs); }
};

template <typename T>
struct AddFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return (lhs + rhs); }
};

// convert to float to fix accuracy issue
template <typename T>
struct FloorDivFunc {
  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
    return floorf(static_cast<float>(lhs) / static_cast<float>(rhs));
  }
};

template <>
struct FloorDivFunc<half> {
  __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
    return floorf(__half2float(lhs) / __half2float(rhs));
  }
};

template <>
struct FloorDivFunc<half2> {
  __device__ __host__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    float2 l = __half22float2(lhs);
    float2 r = __half22float2(rhs);
    l.x = floorf(l.x / r.x);
    l.y = floorf(l.y / r.y);
    return __float22half2_rn(l);
  }
};

template <typename T>
struct AbsGradFunc {
  __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
    T zero = 0.0;
    return lhs < zero ? -rhs : rhs;
  }
};

template <>
struct AbsGradFunc<half2> {
  __device__ __forceinline__ half2 operator()(const half2 &lhs, const half2 &rhs) {
    half2 zero(0.0, 0.0);
    return lhs < zero ? -rhs : rhs;
  }
};
// Element-wise Comparison
template <typename T, typename Func>
__global__ void ElewiseCmpKernel(const int nums, const T *x0, const T *x1, bool *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
y[pos] = Func()(x0[pos], x1[pos]);
}
}
template <typename T>
void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream) {
switch (op) {
case BROADCAST_TYPE_GREATER:
return ElewiseCmpKernel<T, GreaterFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_LESS:
return ElewiseCmpKernel<T, LessFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
default:
break;
}
}
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, bool *y,
cudaStream_t stream);
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, bool *y,
cudaStream_t stream);
template void ElewiseCmp(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, bool *y,
cudaStream_t stream);
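
For reference, a minimal host-side sketch of driving ElewiseCmp directly (a hypothetical caller, not part of this patch; assumes broadcast_impl.cuh is included and error checking is elided). Each launch rounds the element count up to whole 256-thread blocks via (nums + 255) / 256, and the grid-stride loop inside the kernel covers any remainder:

#include <cuda_runtime.h>
#include <vector>

void GreaterExample(cudaStream_t stream) {
  const int nums = 1 << 20;
  std::vector<float> h_x0(nums, 1.0f), h_x1(nums, 2.0f);
  float *d_x0 = nullptr, *d_x1 = nullptr;
  bool *d_y = nullptr;
  cudaMalloc(&d_x0, nums * sizeof(float));
  cudaMalloc(&d_x1, nums * sizeof(float));
  cudaMalloc(&d_y, nums * sizeof(bool));
  cudaMemcpyAsync(d_x0, h_x0.data(), nums * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_x1, h_x1.data(), nums * sizeof(float), cudaMemcpyHostToDevice, stream);
  // y[i] = (x0[i] > x1[i]) for every element, via the float instantiation above.
  ElewiseCmp(nums, BROADCAST_TYPE_GREATER, d_x0, d_x1, d_y, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_x0);
  cudaFree(d_x1);
  cudaFree(d_y);
}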
// Element-wise Arithmetic
template <typename T, typename Func>
__global__ void ElewiseArithKernel(const int nums, const T *x0, const T *x1, T *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < nums; pos += blockDim.x * gridDim.x) {
y[pos] = Func()(x0[pos], x1[pos]);
}
}
template <typename T>
void ElewiseArithKernel(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
switch (op) {
case BROADCAST_TYPE_MINIMUM:
return ElewiseArithKernel<T, MinimumFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_MAXIMUM:
return ElewiseArithKernel<T, MaximumFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_POWER:
return ElewiseArithKernel<T, PowerFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_REALDIV:
return ElewiseArithKernel<T, RealDivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_MUL:
return ElewiseArithKernel<T, MulFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_SUB:
return ElewiseArithKernel<T, SubFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_ADD:
return ElewiseArithKernel<T, AddFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_FLOORDIV:
return ElewiseArithKernel<T, FloorDivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_ABSGRAD:
return ElewiseArithKernel<T, AbsGradFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
case BROADCAST_TYPE_DIV:
return ElewiseArithKernel<T, DivFunc<T>><<<(nums + 255) / 256, 256, 0, stream>>>(nums, x0, x1, y);
default:
break;
}
}
template <typename T>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
return ElewiseArithKernel(nums, op, x0, x1, y, stream);
}
template <>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
cudaStream_t stream) {
if (nums % 2 == 0) {
ElewiseArithKernel<half2>(nums / 2, op, reinterpret_cast<const half2 *>(x0), reinterpret_cast<const half2 *>(x1),
reinterpret_cast<half2 *>(y), stream);
} else {
return ElewiseArithKernel(nums, op, x0, x1, y, stream);
}
}
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const float *x0, const float *x1, float *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const half *x0, const half *x1, half *y,
cudaStream_t stream);
template void ElewiseArith(const int &nums, enum BroadcastOpType op, const int *x0, const int *x1, int *y,
cudaStream_t stream);
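
The half specialization above is the vectorization trick of this refactor: when the element count is even, the half buffers are reinterpreted as half2 so each thread loads, computes, and stores two fp16 values at once. An isolated sketch of the same idea (illustrative names HalfAdd/HalfAddKernel; assumes cuda_fp16.h and a GPU with native fp16 arithmetic, e.g. sm_53 or newer):

#include <cuda_fp16.h>

__global__ void HalfAddKernel(const int n2, const half2 *a, const half2 *b, half2 *c) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n2; i += blockDim.x * gridDim.x) {
    c[i] = __hadd2(a[i], b[i]);  // one instruction adds both packed halves
  }
}

void HalfAdd(const int nums, const half *a, const half *b, half *c, cudaStream_t stream) {
  if (nums % 2 == 0) {
    // Reinterpret the contiguous half buffers as half2 and launch half as many threads.
    HalfAddKernel<<<(nums / 2 + 255) / 256, 256, 0, stream>>>(
      nums / 2, reinterpret_cast<const half2 *>(a), reinterpret_cast<const half2 *>(b),
      reinterpret_cast<half2 *>(c));
  }
  // An odd count would fall back to the scalar half path, as ElewiseArith does above.
}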
// Broadcast Comparison
__device__ __forceinline__ int Index(const int &index, const int &dim) { return dim == 1 ? 0 : index; }
template <typename T, typename Func>
__global__ void BroadcastCmpKernel(const int l0, const int l1, const int l2, const int l3, const int l4, const int l5,
                                   const int l6, const int r0, const int r1, const int r2, const int r3, const int r4,
                                   const int r5, const int r6, const int d0, const int d1, const int d2, const int d3,
                                   const int d4, const int d5, const int d6, const T *x0, const T *x1, bool *y) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6;
       pos += blockDim.x * gridDim.x) {
    int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
@@ -143,115 +249,152 @@
    r_index += Index(m, r4) * r5 * r6;
    r_index += Index(n, r5) * r6;
    r_index += Index(o, r6);
    y[pos] = Func()(x0[l_index], x1[r_index]);
  }
}
template <typename T>
void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                  enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream) {
  int size = 1;
  for (auto d : y_dims) {
    size *= d;
  }
  switch (op) {
    case BROADCAST_TYPE_GREATER:
      return BroadcastCmpKernel<T, GreaterFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_LESS:
      return BroadcastCmpKernel<T, LessFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    default:
      break;
  }
}

template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0, const float *x1,
                           bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
                           bool *y, cudaStream_t stream);
template void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                           const std::vector<int> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
                           bool *y, cudaStream_t stream);
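
Shape vectors passed to BroadcastCmp are expected to be right-aligned and padded with leading 1s to exactly seven entries (MAX_DIMS in the kernel header is 7). A hypothetical direct call, mirroring the fp16 test shapes at the end of this change (device allocation elided):

#include <vector>

// d_x0 holds 3*5 = 15 floats, d_x1 holds 4*6 = 24 floats, and d_y holds
// 3*4*5*6 = 360 bools, all in device memory.
void LessExample(const float *d_x0, const float *d_x1, bool *d_y, cudaStream_t stream) {
  std::vector<int> x0_dims{1, 1, 1, 3, 1, 5, 1};  // logical shape (3, 1, 5, 1)
  std::vector<int> x1_dims{1, 1, 1, 1, 4, 1, 6};  // logical shape (1, 4, 1, 6)
  std::vector<int> y_dims{1, 1, 1, 3, 4, 5, 6};   // broadcast result (3, 4, 5, 6)
  BroadcastCmp(x0_dims, x1_dims, y_dims, BROADCAST_TYPE_LESS, d_x0, d_x1, d_y, stream);
}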
// Broadcast Arithmetic
template <typename T, typename Func>
__global__ void BroadcastArithKernel(const int l0, const int l1, const int l2, const int l3, const int l4,
                                     const int l5, const int l6, const int r0, const int r1, const int r2,
                                     const int r3, const int r4, const int r5, const int r6, const int d0,
                                     const int d1, const int d2, const int d3, const int d4, const int d5,
                                     const int d6, const T *x0, const T *x1, T *y) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < d0 * d1 * d2 * d3 * d4 * d5 * d6;
pos += blockDim.x * gridDim.x) {
int i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
int j = pos / (d2 * d3 * d4 * d5 * d6) % d1;
int k = pos / (d3 * d4 * d5 * d6) % d2;
int l = pos / (d4 * d5 * d6) % d3;
int m = pos / (d5 * d6) % d4;
int n = pos / d6 % d5;
int o = pos % d6;
int l_index = Index(i, l0) * l1 * l2 * l3 * l4 * l5 * l6;
l_index += Index(j, l1) * l2 * l3 * l4 * l5 * l6;
l_index += Index(k, l2) * l3 * l4 * l5 * l6;
l_index += Index(l, l3) * l4 * l5 * l6;
l_index += Index(m, l4) * l5 * l6;
l_index += Index(n, l5) * l6;
l_index += Index(o, l6);
int r_index = Index(i, r0) * r1 * r2 * r3 * r4 * r5 * r6;
r_index += Index(j, r1) * r2 * r3 * r4 * r5 * r6;
r_index += Index(k, r2) * r3 * r4 * r5 * r6;
r_index += Index(l, r3) * r4 * r5 * r6;
r_index += Index(m, r4) * r5 * r6;
r_index += Index(n, r5) * r6;
r_index += Index(o, r6);
y[pos] = Func()(x0[l_index], x1[r_index]);
  }
}
template <typename T>
void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                    enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream) {
int size = 1;
for (auto d : y_dims) {
size *= d;
}
  switch (op) {
    case BROADCAST_TYPE_MAXIMUM:
      return BroadcastArithKernel<T, MaximumFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_MINIMUM:
      return BroadcastArithKernel<T, MinimumFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_POWER:
      return BroadcastArithKernel<T, PowerFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_REALDIV:
      return BroadcastArithKernel<T, RealDivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_MUL:
      return BroadcastArithKernel<T, MulFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_SUB:
      return BroadcastArithKernel<T, SubFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_ADD:
      return BroadcastArithKernel<T, AddFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_FLOORDIV:
      return BroadcastArithKernel<T, FloorDivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_ABSGRAD:
      return BroadcastArithKernel<T, AbsGradFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    case BROADCAST_TYPE_DIV:
      return BroadcastArithKernel<T, DivFunc<T>><<<(size + 255) / 256, 256, 0, stream>>>(
        x0_dims[0], x0_dims[1], x0_dims[2], x0_dims[3], x0_dims[4], x0_dims[5], x0_dims[6], x1_dims[0], x1_dims[1],
        x1_dims[2], x1_dims[3], x1_dims[4], x1_dims[5], x1_dims[6], y_dims[0], y_dims[1], y_dims[2], y_dims[3],
        y_dims[4], y_dims[5], y_dims[6], x0, x1, y);
    default:
      break;
  }
}
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const float *x0,
                             const float *x1, float *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const half *x0, const half *x1,
                             half *y, cudaStream_t stream);
template void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims,
                             const std::vector<int> &y_dims, enum BroadcastOpType op, const int *x0, const int *x1,
                             int *y, cudaStream_t stream);
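
The Index(i, dim) helper is the whole broadcasting rule: a size-1 axis contributes offset 0, so both operands can be addressed from a single output coordinate. A host-side sketch of the same 7-D offset computation, written in Horner form but equivalent to the l_index/r_index arithmetic in the kernels above (hypothetical names, for illustration only):

#include <array>
#include <cstdio>

inline int Idx(const int index, const int dim) { return dim == 1 ? 0 : index; }

// Flatten an output coordinate into an operand offset, clamping broadcast axes to 0.
int FlatIndex(const std::array<int, 7> &shape, const std::array<int, 7> &coord) {
  int idx = 0;
  for (int a = 0; a < 7; ++a) {
    idx = idx * shape[a] + Idx(coord[a], shape[a]);
  }
  return idx;
}

int main() {
  const std::array<int, 7> x0{1, 1, 1, 3, 1, 5, 1};      // padded (3, 1, 5, 1)
  const std::array<int, 7> x1{1, 1, 1, 1, 4, 1, 6};      // padded (1, 4, 1, 6)
  const std::array<int, 7> coord{0, 0, 0, 2, 3, 4, 5};   // a point in the (3, 4, 5, 6) output
  std::printf("x0 offset %d, x1 offset %d\n", FlatIndex(x0, coord), FlatIndex(x1, coord));
  return 0;
}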
// BroadcastTo
template <typename T>
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, const int o1,
                                  const int o2, const int o3, const T *input_addr, T *output_addr) {
@@ -274,36 +417,6 @@
    output_addr);
}

template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                          const int &o2, const int &o3, const float *input_addr, float *output_addr,
                          cudaStream_t stream);
...
@@ -36,17 +36,21 @@ enum BroadcastOpType {
  BROADCAST_TYPE_INVALID = 0xffffffff,
};
template <typename T>
void ElewiseCmp(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

template <typename T>
void ElewiseArith(const int &nums, enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);

template <typename T>
void BroadcastCmp(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                  enum BroadcastOpType op, const T *x0, const T *x1, bool *y, cudaStream_t stream);

template <typename T>
void BroadcastArith(const std::vector<int> &x0_dims, const std::vector<int> &x1_dims, const std::vector<int> &y_dims,
                    enum BroadcastOpType op, const T *x0, const T *x1, T *y, cudaStream_t stream);
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
                 const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
...
@@ -58,8 +58,8 @@ class AddNGpuFwdKernel : public GpuKernel {
    for (size_t i = 0; i < IntToSize(num_input_); i++) {
      T *input_addr = GetDeviceAddress<T>(inputs, i);
      if (cudnn_data_type_ == CUDNN_DATA_INT32) {
        ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, output_addr, output_addr,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        CHECK_CUDNN_RET_WITH_EXCEPT(cudnnAddTensor(cudnn_handle_, &alpha, input_descriptor_, input_addr,
                                                   &(i > 0 ? alpha : beta), input_descriptor_, output_addr),
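
With this change the INT32 path of AddN folds each input into the output through the element-wise add kernel, since cudnnAddTensor has no INT32 support. A schematic of that accumulation (a sketch with illustrative names; the cudaMemsetAsync stands in for however the first addend is initialized):

#include <cuda_runtime.h>
#include <vector>

// Accumulate n device buffers into output: output = x0 + x1 + ... element-wise.
void AddNInt32Sketch(const std::vector<int *> &inputs, int *output, const int nums, cudaStream_t stream) {
  cudaMemsetAsync(output, 0, nums * sizeof(int), stream);  // start from zero
  for (int *input : inputs) {
    // In-place fold: output = input + output, exactly as the kernel above does.
    ElewiseArith(nums, BROADCAST_TYPE_ADD, input, output, output, stream);
  }
}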
...
@@ -19,119 +19,119 @@
namespace mindspore {
namespace kernel {
// fp32
MS_REG_GPU_KERNEL_ONE(
  Greater,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Maximum,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Minimum,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float)
// fp16
MS_REG_GPU_KERNEL_ONE(
  Greater,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Maximum,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Minimum,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half)
// int32
MS_REG_GPU_KERNEL_ONE(
  Less, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  TensorAdd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Minimum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Maximum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  FloorDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  AbsGrad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
MS_REG_GPU_KERNEL_ONE(
  Div, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
  BroadcastOpGpuKernel, int)
} // namespace kernel
} // namespace mindspore
...
@@ -28,11 +28,16 @@
namespace mindspore {
namespace kernel {
constexpr int MAX_DIMS = 7;

template <typename T>
class BroadcastOpGpuKernel : public GpuKernel {
 public:
  BroadcastOpGpuKernel()
      : op_type_(BROADCAST_TYPE_INVALID),
        need_broadcast_(false),
        is_comp_op_(false),
        input1_num_(1),
        input2_num_(1),
        output_num_(1) {}
  ~BroadcastOpGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -43,13 +48,23 @@ class BroadcastOpGpuKernel : public GpuKernel {
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *lhs = GetDeviceAddress<T>(inputs, 0);
    T *rhs = GetDeviceAddress<T>(inputs, 1);

    if (is_comp_op_) {
      bool *output = GetDeviceAddress<bool>(outputs, 0);
      if (need_broadcast_) {
        BroadcastCmp(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ElewiseCmp(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    } else {
      T *output = GetDeviceAddress<T>(outputs, 0);
      if (need_broadcast_) {
        BroadcastArith(lhs_shape_, rhs_shape_, output_shape_, op_type_, lhs, rhs, output,
                       reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ElewiseArith(output_num_, op_type_, lhs, rhs, output, reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    }
    return true;
@@ -91,26 +106,42 @@ class BroadcastOpGpuKernel : public GpuKernel {
  void InitSizeLists() override {
    input_size_list_.push_back(input1_num_ * sizeof(T));
    input_size_list_.push_back(input2_num_ * sizeof(T));

    auto unit_size = is_comp_op_ ? sizeof(bool) : sizeof(T);
    output_size_list_.push_back(output_num_ * unit_size);
  }

 private:
  void GetOpType(const CNodePtr &kernel_node) {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);

    static std::map<std::string, BroadcastOpType> kBroadcastCmpTypeMap = {
      {"Greater", BROADCAST_TYPE_GREATER},
      {"Less", BROADCAST_TYPE_LESS},
    };

    auto iter = kBroadcastCmpTypeMap.find(kernel_name);
    if (iter != kBroadcastCmpTypeMap.end()) {
      op_type_ = iter->second;
      is_comp_op_ = true;
      return;
    }

    static std::map<std::string, BroadcastOpType> kBroadcastArithmetricTypeMap = {
      {"Maximum", BROADCAST_TYPE_MAXIMUM},  {"Minimum", BROADCAST_TYPE_MINIMUM},   {"Pow", BROADCAST_TYPE_POWER},
      {"RealDiv", BROADCAST_TYPE_REALDIV},  {"Mul", BROADCAST_TYPE_MUL},           {"Sub", BROADCAST_TYPE_SUB},
      {"TensorAdd", BROADCAST_TYPE_ADD},    {"FloorDiv", BROADCAST_TYPE_FLOORDIV}, {"AbsGrad", BROADCAST_TYPE_ABSGRAD},
      {"Div", BROADCAST_TYPE_DIV},
    };

    iter = kBroadcastArithmetricTypeMap.find(kernel_name);
    if (iter != kBroadcastArithmetricTypeMap.end()) {
      op_type_ = iter->second;
      is_comp_op_ = false;
      return;
    }

    MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
  }
  bool IsBroadcast(const std::vector<size_t> &lhs, const std::vector<size_t> &rhs) {
@@ -127,6 +158,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
  BroadcastOpType op_type_;
  bool need_broadcast_;
  bool is_comp_op_;
  int input1_num_;
  int input2_num_;
  int output_num_;
@@ -137,7 +169,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
...
@@ -160,3 +160,45 @@ def test_broadcast_diff_dims():
    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_broadcast_fp16():
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

    x1_np = np.random.rand(3, 1, 5, 1).astype(np.float16)
    x2_np = np.random.rand(1, 4, 1, 6).astype(np.float16)

    output_ms = P.Minimum()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.minimum(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Maximum()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.maximum(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Greater()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np > x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Less()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np < x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.power(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np / x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np * x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)