Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
a3d0ddb4
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a3d0ddb4
编写于
9月 07, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
9月 07, 2020
浏览文件
操作
浏览文件
下载
差异文件
!5779 tenoradd profiling
Merge pull request !5779 from chenweifeng/broadcast-refactor
上级
72f77bde
6ebe132c
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed file
with
438 addition
and
247 deletion
+438
-247
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
...c/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
+268
-155
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
.../backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
+12
-8
mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
.../ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
+2
-2
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
.../backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
+64
-64
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
...c/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
+50
-18
tests/st/ops/gpu/test_broadcast_op.py
tests/st/ops/gpu/test_broadcast_op.py
+42
-0
未找到文件。
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
浏览文件 @
a3d0ddb4
此差异已折叠。
点击以展开。
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
浏览文件 @
a3d0ddb4
...
...
@@ -36,17 +36,21 @@ enum BroadcastOpType {
BROADCAST_TYPE_INVALID
=
0xffffffff
,
};
template
<
typename
T
,
typename
S
>
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
);
template
<
typename
T
>
void
ElewiseCmp
(
const
int
&
nums
,
enum
BroadcastOpType
op
,
const
T
*
x0
,
const
T
*
x1
,
bool
*
y
,
cudaStream_t
stream
);
template
<
typename
T
>
void
ElewiseArith
(
const
int
&
nums
,
enum
BroadcastOpType
op
,
const
T
*
x0
,
const
T
*
x1
,
T
*
y
,
cudaStream_t
stream
);
template
<
typename
T
,
typename
S
>
void
NoBroadcast
(
const
int
&
size
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
);
template
<
typename
T
>
void
BroadcastCmp
(
const
std
::
vector
<
int
>
&
x0_dims
,
const
std
::
vector
<
int
>
&
x1_dims
,
const
std
::
vector
<
int
>
&
y_dims
,
enum
BroadcastOpType
op
,
const
T
*
x0
,
const
T
*
x1
,
bool
*
y
,
cudaStream_t
stream
);
template
<
typename
T
>
void
BroadcastArith
(
const
std
::
vector
<
int
>
&
x0_dims
,
const
std
::
vector
<
int
>
&
x1_dims
,
const
std
::
vector
<
int
>
&
y_dims
,
enum
BroadcastOpType
op
,
const
T
*
x0
,
const
T
*
x1
,
T
*
y
,
cudaStream_t
stream
);
template
<
typename
T
>
void
BroadcastTo
(
const
int
&
i0
,
const
int
&
i1
,
const
int
&
i2
,
const
int
&
i3
,
const
int
&
o0
,
const
int
&
o1
,
const
int
&
o2
,
const
int
&
o3
,
const
T
*
input_addr
,
T
*
output_addr
,
cudaStream_t
stream
);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
浏览文件 @
a3d0ddb4
...
...
@@ -58,8 +58,8 @@ class AddNGpuFwdKernel : public GpuKernel {
for
(
size_t
i
=
0
;
i
<
IntToSize
(
num_input_
);
i
++
)
{
T
*
input_addr
=
GetDeviceAddress
<
T
>
(
inputs
,
i
);
if
(
cudnn_data_type_
==
CUDNN_DATA_INT32
)
{
NoBroadcast
(
outputs
[
0
]
->
size
/
sizeof
(
T
),
BROADCAST_TYPE_ADD
,
input_addr
,
output_addr
,
output_addr
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
ElewiseArith
(
outputs
[
0
]
->
size
/
sizeof
(
T
),
BROADCAST_TYPE_ADD
,
input_addr
,
output_addr
,
output_addr
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
else
{
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnAddTensor
(
cudnn_handle_
,
&
alpha
,
input_descriptor_
,
input_addr
,
&
(
i
>
0
?
alpha
:
beta
),
input_descriptor_
,
output_addr
),
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.cc
浏览文件 @
a3d0ddb4
...
...
@@ -19,119 +19,119 @@
namespace
mindspore
{
namespace
kernel
{
// fp32
MS_REG_GPU_KERNEL_
TWO
(
MS_REG_GPU_KERNEL_
ONE
(
Greater
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeBool
),
BroadcastOpGpuKernel
,
float
,
bool
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Less
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeBool
),
BroadcastOpGpuKernel
,
float
,
bool
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Maximum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Minimum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Pow
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
RealDiv
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Mul
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Sub
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
TensorAdd
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
FloorDiv
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
AbsGrad
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
float
)
MS_REG_GPU_KERNEL_
ONE
(
Div
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
BroadcastOpGpuKernel
,
float
,
float
)
BroadcastOpGpuKernel
,
float
)
// fp16
MS_REG_GPU_KERNEL_
TWO
(
MS_REG_GPU_KERNEL_
ONE
(
Greater
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeBool
),
BroadcastOpGpuKernel
,
half
,
bool
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Less
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeBool
),
BroadcastOpGpuKernel
,
half
,
bool
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Maximum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Minimum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Pow
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
RealDiv
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Mul
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Sub
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
TensorAdd
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
FloorDiv
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
AbsGrad
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
half
)
MS_REG_GPU_KERNEL_
ONE
(
Div
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
BroadcastOpGpuKernel
,
half
,
half
)
BroadcastOpGpuKernel
,
half
)
// int32
MS_REG_GPU_KERNEL_
TWO
(
MS_REG_GPU_KERNEL_
ONE
(
Less
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeBool
),
BroadcastOpGpuKernel
,
int
,
bool
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
TensorAdd
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
Minimum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
Maximum
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
Mul
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
FloorDiv
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
AbsGrad
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
MS_REG_GPU_KERNEL_
TWO
(
BroadcastOpGpuKernel
,
int
)
MS_REG_GPU_KERNEL_
ONE
(
Div
,
KernelAttr
().
AddInputAttr
(
kNumberTypeInt32
).
AddInputAttr
(
kNumberTypeInt32
).
AddOutputAttr
(
kNumberTypeInt32
),
BroadcastOpGpuKernel
,
int
,
int
)
BroadcastOpGpuKernel
,
int
)
}
// namespace kernel
}
// namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
浏览文件 @
a3d0ddb4
...
...
@@ -28,11 +28,16 @@
namespace
mindspore
{
namespace
kernel
{
constexpr
int
MAX_DIMS
=
7
;
template
<
typename
T
,
typename
S
>
template
<
typename
T
>
class
BroadcastOpGpuKernel
:
public
GpuKernel
{
public:
BroadcastOpGpuKernel
()
:
op_type_
(
BROADCAST_TYPE_INVALID
),
need_broadcast_
(
false
),
input1_num_
(
1
),
input2_num_
(
1
),
output_num_
(
1
)
{}
:
op_type_
(
BROADCAST_TYPE_INVALID
),
need_broadcast_
(
false
),
is_comp_op_
(
false
),
input1_num_
(
1
),
input2_num_
(
1
),
output_num_
(
1
)
{}
~
BroadcastOpGpuKernel
()
override
=
default
;
const
std
::
vector
<
size_t
>
&
GetInputSizeList
()
const
override
{
return
input_size_list_
;
}
...
...
@@ -43,13 +48,23 @@ class BroadcastOpGpuKernel : public GpuKernel {
const
std
::
vector
<
AddressPtr
>
&
outputs
,
void
*
stream_ptr
)
override
{
T
*
lhs
=
GetDeviceAddress
<
T
>
(
inputs
,
0
);
T
*
rhs
=
GetDeviceAddress
<
T
>
(
inputs
,
1
);
S
*
output
=
GetDeviceAddress
<
S
>
(
outputs
,
0
);
if
(
need_broadcast_
)
{
Broadcast
(
lhs_shape_
,
rhs_shape_
,
output_shape_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
if
(
is_comp_op_
)
{
bool
*
output
=
GetDeviceAddress
<
bool
>
(
outputs
,
0
);
if
(
need_broadcast_
)
{
BroadcastCmp
(
lhs_shape_
,
rhs_shape_
,
output_shape_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
else
{
ElewiseCmp
(
output_num_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
}
else
{
NoBroadcast
(
output_num_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
T
*
output
=
GetDeviceAddress
<
T
>
(
outputs
,
0
);
if
(
need_broadcast_
)
{
BroadcastArith
(
lhs_shape_
,
rhs_shape_
,
output_shape_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
else
{
ElewiseArith
(
output_num_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
}
return
true
;
...
...
@@ -91,26 +106,42 @@ class BroadcastOpGpuKernel : public GpuKernel {
void
InitSizeLists
()
override
{
input_size_list_
.
push_back
(
input1_num_
*
sizeof
(
T
));
input_size_list_
.
push_back
(
input2_num_
*
sizeof
(
T
));
output_size_list_
.
push_back
(
output_num_
*
sizeof
(
S
));
auto
unit_size
=
is_comp_op_
?
sizeof
(
bool
)
:
sizeof
(
T
);
output_size_list_
.
push_back
(
output_num_
*
unit_size
);
}
private:
void
GetOpType
(
const
CNodePtr
&
kernel_node
)
{
std
::
string
kernel_name
=
AnfAlgo
::
GetCNodeName
(
kernel_node
);
static
std
::
map
<
std
::
string
,
BroadcastOpType
>
kBroadcastTypeMap
=
{
{
"Greater"
,
BROADCAST_TYPE_GREATER
},
{
"Less"
,
BROADCAST_TYPE_LESS
},
{
"Maximum"
,
BROADCAST_TYPE_MAXIMUM
},
{
"Minimum"
,
BROADCAST_TYPE_MINIMUM
},
{
"Pow"
,
BROADCAST_TYPE_POWER
},
{
"RealDiv"
,
BROADCAST_TYPE_REALDIV
},
{
"Mul"
,
BROADCAST_TYPE_MUL
},
{
"Sub"
,
BROADCAST_TYPE_SUB
},
{
"TensorAdd"
,
BROADCAST_TYPE_ADD
},
{
"FloorDiv"
,
BROADCAST_TYPE_FLOORDIV
},
{
"AbsGrad"
,
BROADCAST_TYPE_ABSGRAD
},
{
"Div"
,
BROADCAST_TYPE_DIV
},
static
std
::
map
<
std
::
string
,
BroadcastOpType
>
kBroadcastCmpTypeMap
=
{
{
"Greater"
,
BROADCAST_TYPE_GREATER
},
{
"Less"
,
BROADCAST_TYPE_LESS
},
};
auto
iter
=
kBroadcastTypeMap
.
find
(
kernel_name
);
if
(
iter
==
kBroadcastTypeMap
.
end
())
{
MS_LOG
(
EXCEPTION
)
<<
"operation "
<<
kernel_name
<<
" is not supported."
;
}
else
{
auto
iter
=
kBroadcastCmpTypeMap
.
find
(
kernel_name
);
if
(
iter
!=
kBroadcastCmpTypeMap
.
end
())
{
op_type_
=
iter
->
second
;
is_comp_op_
=
true
;
return
;
}
static
std
::
map
<
std
::
string
,
BroadcastOpType
>
kBroadcastArithmetricTypeMap
=
{
{
"Maximum"
,
BROADCAST_TYPE_MAXIMUM
},
{
"Minimum"
,
BROADCAST_TYPE_MINIMUM
},
{
"Pow"
,
BROADCAST_TYPE_POWER
},
{
"RealDiv"
,
BROADCAST_TYPE_REALDIV
},
{
"Mul"
,
BROADCAST_TYPE_MUL
},
{
"Sub"
,
BROADCAST_TYPE_SUB
},
{
"TensorAdd"
,
BROADCAST_TYPE_ADD
},
{
"FloorDiv"
,
BROADCAST_TYPE_FLOORDIV
},
{
"AbsGrad"
,
BROADCAST_TYPE_ABSGRAD
},
{
"Div"
,
BROADCAST_TYPE_DIV
},
};
iter
=
kBroadcastArithmetricTypeMap
.
find
(
kernel_name
);
if
(
iter
!=
kBroadcastArithmetricTypeMap
.
end
())
{
op_type_
=
iter
->
second
;
is_comp_op_
=
false
;
return
;
}
MS_LOG
(
EXCEPTION
)
<<
"operation "
<<
kernel_name
<<
" is not supported."
;
}
bool
IsBroadcast
(
const
std
::
vector
<
size_t
>
&
lhs
,
const
std
::
vector
<
size_t
>
&
rhs
)
{
...
...
@@ -127,6 +158,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
BroadcastOpType
op_type_
;
bool
need_broadcast_
;
bool
is_comp_op_
;
int
input1_num_
;
int
input2_num_
;
int
output_num_
;
...
...
@@ -137,7 +169,7 @@ class BroadcastOpGpuKernel : public GpuKernel {
std
::
vector
<
size_t
>
input_size_list_
;
std
::
vector
<
size_t
>
output_size_list_
;
std
::
vector
<
size_t
>
workspace_size_list_
;
};
};
// namespace kernel
}
// namespace kernel
}
// namespace mindspore
...
...
tests/st/ops/gpu/test_broadcast_op.py
浏览文件 @
a3d0ddb4
...
...
@@ -160,3 +160,45 @@ def test_broadcast_diff_dims():
output_ms
=
P
.
Sub
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
-
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
@
pytest
.
mark
.
level0
@
pytest
.
mark
.
platform_x86_gpu_training
@
pytest
.
mark
.
env_onecard
def
test_broadcast_fp16
():
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
,
device_target
=
'GPU'
)
x1_np
=
np
.
random
.
rand
(
3
,
1
,
5
,
1
).
astype
(
np
.
float16
)
x2_np
=
np
.
random
.
rand
(
1
,
4
,
1
,
6
).
astype
(
np
.
float16
)
output_ms
=
P
.
Minimum
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
np
.
minimum
(
x1_np
,
x2_np
)
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Maximum
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
np
.
maximum
(
x1_np
,
x2_np
)
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Greater
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
>
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Less
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
<
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Pow
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
np
.
power
(
x1_np
,
x2_np
)
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
RealDiv
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
/
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Mul
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
*
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
output_ms
=
P
.
Sub
()(
Tensor
(
x1_np
),
Tensor
(
x2_np
))
output_np
=
x1_np
-
x2_np
assert
np
.
allclose
(
output_ms
.
asnumpy
(),
output_np
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录