magicwindyyd/mindspore (fork of MindSpore/mindspore)

Commit 2d0ee054
Authored May 26, 2020 by z00478463

for pylint 2nd

Parent: 648501da
Showing 12 changed files with 2,056 additions and 1,592 deletions (+2056 −1592).
mindspore/ops/_op_impl/_custom_op/__init__.py                            +0    -11
mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py                   +87   -76
mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py                  +36   -30
mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py                 +551  -313
mindspore/ops/_op_impl/_custom_op/img2col_impl.py                        +870  -771
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py         +102  -80
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py        +92   -63
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py    +83   -75
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py    +32   -32
mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py                    +51   -44
mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py                 +24   -19
mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py                 +128  -78
mindspore/ops/_op_impl/_custom_op/__init__.py

@@ -14,14 +14,3 @@
 # ============================================================================
 """custom ops"""
-from .batch_matmul_impl import CusBatchMatMul
-from .cholesky_trsm_impl import CusCholeskyTrsm
-from .fused_abs_max1_impl import CusFusedAbsMax1
-from .img2col_impl import CusImg2Col
-from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft
-from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight
-from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast
-from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul
-from .matmul_cube_impl import CusMatMulCube
-from .matrix_combine_impl import CusMatrixCombine
-from .transpose02314_impl import CusTranspose02314
mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py

@@ -14,29 +14,31 @@
# ============================================================================
"""batch_matmul_impl"""
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("batchmatmul.so") \
    .compute_cost(10) \
    .kernel_name("CusBatchMatMul") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()
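Note: the three DataType.F32_Default arguments to dtype_format map positionally onto the declared ports (input x1, input x2, output y), so the op is registered for the all-float32 case only.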
def _get_flattern_shape(shape):
    flattern_shape = 1
    for dim in shape:
        flattern_shape *= dim
    return (flattern_shape,)
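For reference, the helper just collapses a shape into a one-element tuple holding the total element count; the kernel then addresses its global-memory tensors as flat 1-D buffers. A pure-Python sanity check (illustrative, not part of the diff):

    # _get_flattern_shape reduces a shape to its flat element count
    assert _get_flattern_shape((8, 128, 128)) == (131072,)  # 8 * 128 * 128
    assert _get_flattern_shape((1, 64, 64)) == (4096,)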
def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
    t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
...
@@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_
                           matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 64],
                               matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)


def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
    input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf)
    tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0)
    with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
        input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB",
                                               scope=tik.scope_ubuf)
        t_1_local_UB = input_2_local_UB
        matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB",
...
@@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu
                           1, 1, 1, 8)
        tik_instance.data_move(res[res_index + thread_idx2 * 32],
                               matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0)
@op_info_register(cus_batchmatmul_op_info)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    if util.get_product_version() == util.VERSION_MINI:
...
@@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                 dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # if not transpose_a and transpose_b:
    batch, m, k = x1_shape
    _, n, _ = x2_shape
    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)
    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                      res, res_index)
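The index arithmetic above spreads the 36 batch matrices over 18 AI cores: each block_idx owns two 128x128 matrices (32768 = 2 * 128 * 128 float32 elements), cc0 picks the matrix within the pair, and cc1 * 128 steps through its rows. A small pure-Python model (illustrative only) confirms the tiling visits every row start exactly once:

    rows = set()
    for block_idx in range(18):
        for cc0 in range(2):
            for cc1 in range(128):
                rows.add(block_idx * 32768 + cc0 * 16384 + cc1 * 128)
    assert len(rows) == 36 * 128              # one entry per row, no collisions
    assert max(rows) + 128 == 36 * 128 * 128  # exactly spans the (36, 128, 128) buffer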
    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB,
                                               input1[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 +
                                                      cc1_db * 256 + thread_idx * 128],
                                               0, 1, 16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64],
                                               0, 64, 1, 1, 16, 0)
...
@@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                                scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
                                                                             name="matmul_hybrid_f_t_local_UB",
                                                                             scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(
                                dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192],
                                                   0, 1, 1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB,
                                              128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64],
                                              64, 1, 1, 1, 16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 +
                                                       cc1_db * 256 + thread_idx * 128 + thread_idx2 * 64],
                                                   matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
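Functionally, the vmul/vcadd pairs above evaluate a row-times-matrix product with transpose_b=True, i.e. res[b, i, j] = sum_k x1[b, i, k] * x2[b, j, k]. A plain-Python reference of that contraction (illustrative, assuming nested-list inputs):

    def batchmatmul_ref(x1, x2):
        # res[b][i][j] = sum_k x1[b][i][k] * x2[b][j][k]   (transpose_a=False, transpose_b=True)
        return [[[sum(a_row[k] * b_row[k] for k in range(len(a_row)))
                  for b_row in x2[b]]
                 for a_row in x1[b]]
                for b in range(len(x1))]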
    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
...
@@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index,
                                             input2, input2_index, res, res_index)
    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)
                        ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                  res, res_index)
    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
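Given the .get("shape")/.get("dtype") accesses, the op evidently takes dict-style TBE descriptors; a hypothetical invocation (the argument dicts are assumptions, not from this diff) would look like:

    x1 = {"shape": (8, 128, 128), "dtype": "float32"}  # hypothetical descriptor
    x2 = {"shape": (8, 128, 128), "dtype": "float32"}
    y = {"shape": (8, 128, 128), "dtype": "float32"}
    tik_instance = CusBatchMatMul(x1, x2, y, transpose_a=False, transpose_b=True,
                                  kernel_name="batchmatmul")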
mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py

@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusCholeskyTrsm"""
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("choleskytrsm.so") \
    .compute_cost(10) \
    .kernel_name("CusCholeskyTrsm") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


@op_info_register(cus_cholesky_trsm_op_info)
def CusCholeskyTrsm(input_x, output, kernel_name):
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
...
@@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
    input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub",
                                         scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(input_x_ub[i, 0],
                                   input_x[block_index * split_dim + i, block_index * split_dim],
                                   0, 1, vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
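The vln/vmuls/vexp sequence is a vectorized reciprocal square root, using exp(-0.5 * ln(x)) = x^(-1/2); scalar2 therefore ends up holding 1/sqrt of the i-th diagonal element. In plain Python:

    import math
    x = 4.0
    assert math.isclose(math.exp(-0.5 * math.log(x)), 1.0 / math.sqrt(x))  # both equal 0.5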
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2,
                               vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3,
                                   vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)
        chol_diag_element_final = tik_instance.Scalar("float32")
...
@@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop,
                                   vector_repeat_times, 1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,
                                  1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,
                              1, 1, 1, 8, 8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,
                               vector_repeat_times, 1, 1, 8, 8)
        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
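Reading the two phases together with the op name (one interpretation of the code above, not stated in the diff): per 128x128 diagonal block A, the first loop builds a triangular factor with rsqrt-normalized rows and the second performs the triangular back-substitution, so that roughly

$$A = L L^{\top}, \qquad \text{res} \approx L^{-1}.$$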
mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py

(diff collapsed, not shown)

mindspore/ops/_op_impl/_custom_op/img2col_impl.py

(diff collapsed, not shown)
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py

@@ -17,17 +17,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from te import tik
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
...
@@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \
    .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
...
@@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if m_shape != 1:
        if n_shape == 1:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
            elif km_shape % k_block_size != 0:
                raise RuntimeError("input shape K1 should be multiple of %d" % cce.BLOCK_IN)
        else:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
    if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
        raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
...
...
@@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if
len
(
shape_bias
):
if
len
(
shape_bias
)
==
1
:
if
is_gevm
or
is_gemv
:
if
shape_bias
[
0
]
!=
m_shape
*
n_shape
:
if
shape_bias
[
0
]
!=
m_shape
*
n_shape
:
raise
RuntimeError
(
"broadcast case shape bias for gemv must be equal m*n"
)
else
:
if
shape_bias
[
0
]
!=
n_shape
:
...
...
@@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    else:
        raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return shape_bias
    else:
        bias_length = (bias_length // 16) * 16 + 16
        shape_bias = []
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
    res = []
    if dim_a % 16 != 0:
        dim_a = (dim_a // 16) * 16 + 16
        res.append(dim_a)
    else:
        res.append(dim_a)
    if dim_b % 16 != 0:
        dim_b = (dim_b // 16) * 16 + 16
        res.append(dim_b)
    else:
        res.append(dim_b)
    return res
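Both helpers round odd dimensions up to the next multiple of 16, the cube tile width. The same rule in one line (illustrative):

    def _round_up_16(dim):
        # mirrors the branches in _get_input_shape above
        return dim if dim % 16 == 0 else (dim // 16) * 16 + 16

    assert _round_up_16(63) == 64 and _round_up_16(64) == 64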
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
...
@@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
    try:
        trans_a_f = bool(1 - trans_a)
        if src_dtype == "float32" or src_dtype == "int32":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
...
@@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str)
@op_info_register(matmul_cube_dense_left_op_info)
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                           kernel_name="matmulcube"):
    """
    calculating matrix multiplication with bias, C = A*B + bias, support input
    data with fractal format.
...
@@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
...
@@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
    format_a = "FRACTAL_NZ"
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
    format_b = "FRACTAL_NZ"
    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
...
@@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
                               dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)
    if len(shape_bias) > 0:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype)
    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 8, 128, 55 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256],
                                          0, 8, 8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096],
                                              input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 +
                                                     core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 7, 112, 56 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256],
                                          0, 7, 7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096],
                                              input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 +
                                                     core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul])
        return tik_instance
    else:
        print("come into tbe, shape is error!")
        result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
                                    format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
        with tvm.target.cce():
            schedule = generic.auto_schedule(result)
        tensor_list = [tensor_a, tensor_b, result]
        if len(shape_bias) > 0:
            tensor_list = [tensor_a, tensor_b, tensor_bias, result]
        config = {"print_ir": False,
                  "name": kernel_name,
                  "tensor_list": tensor_list}
        te.lang.cce.cce_build_code(schedule, config)
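The hand-tiled branch is gated on shape_a_temp == (63, 63, 16, 16) with a (128, 63, ...) right operand, which follows from the fractal blocking computed above (shape_a_temp = (m_shape // block_in, km_shape // block_reduce, 16, 16)); with 16x16 blocks a 1008 x 1008 left matrix yields 1008 // 16 = 63 blocks per axis. All other shapes fall through to the generic te.lang.cce.matmul path. A quick check with a hypothetical helper (not from the diff):

    def to_fractal(m, k, block=16):
        # mirrors shape_a_temp = (m // block_in, k // block_reduce, block_in, block_reduce)
        return (m // block, k // block, block, block)

    assert to_fractal(1008, 1008) == (63, 63, 16, 16)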
mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py

@@ -18,15 +18,10 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .fusion_type("OPAQUE") \
...
@@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
    .input(2, "x3", False, "required", "all") \
    .input(3, "x4", False, "optional", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default,
                  DataType.F32_FracNZ) \
    .get_op_info()


@op_info_register(matmul_cube_dense_right_op_info)
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False,
                            trans_b=False, kernel_name="matmulcube"):
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1,)
    support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
...
@@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf,
                                                      name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])
            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf,
                                                      name="resMatmul_local_UB1")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc,
                                                                name="resMatmul_local_UB_local_L0C1")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")
            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                              name="input_2_local_L1_local_L0B1")
            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                           0, 8, 128, 1920, 0)
                    tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096],
                                           0, 8, 256, 752, 0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048],
                                              input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048],
                                              input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128, 128, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar,
                                       255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)
                    tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB,
                                           0, 8, 512, 0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096],
                                       0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256],
                                          0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256],
                                          0, 8, 16, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B, 256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                   matrix_max_scalar, 2, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB,
                                       0, 8, 512, 0, 1504)
                tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288],
                                       0, 8, 240, 768, 0)
                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048],
                                          input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048],
                                          input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar,
                                   255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64],
                                   matrix_max_scalar, 225, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1,
                                       0, 8, 480, 0, 1536)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3],
                              outputs=[resMatmul])
        return tik_instance
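Net effect of the vmuls passes over every result tile: the kernel writes the matmul rescaled by the scalar streamed in through matrix_max (as read via matrix_max_local_UB above), i.e.

$$\text{res} = s \cdot (X_1 X_2), \qquad s = \text{matrix\_max}[0].$$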
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py

@@ -17,11 +17,12 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.platform.cce_params as cce
from te import tvm
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
...
@@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \
    .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
...
@@ -137,6 +139,7 @@ src_dtype: str
    else:
        raise RuntimeError("unsupport input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
...
@@ -147,6 +150,7 @@ def _get_bias(shape_bias):
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
...
@@ -164,6 +168,7 @@ def _get_input_shape(shape_x):
        res.append(dim_b)
    return res


def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
...
@@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
            if trans_a_f and trans_b and shape_b[1] == 1:
                return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
            if trans_a:
                m_shape = shape_a[1]
                k_shape = shape_a[0]
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_fracz_left_cast_op_info)
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
...
@@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
...
@@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
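Swapping both dimensions of an operand and inverting its transpose flag leaves the logical product unchanged, which is what lets the code above reorder the shapes and flip `trans_a`/`trans_b` for the fractal layout. A small NumPy check of that identity (shapes are arbitrary examples):

```python
import numpy as np

a = np.random.rand(32, 48).astype(np.float16)
b = np.random.rand(48, 16).astype(np.float16)

ref = a @ b           # reference product, trans_a=False
a_swapped = a.T       # what a [k, m]-ordered layout effectively stores
out = a_swapped.T @ b # transpose flag flipped back inside the kernel
assert np.allclose(ref, out)
```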
@@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
@@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
                         diag_opt=diag_opt, diag_size=DIAG_SIZE)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul])
    return tik_instance


def get_cus_tile_info(input_x1, input_x2, diag_size):
    tile_map = {((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16),
@@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
                ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16),
                ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9),
                ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4),
                ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4),
                ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1),
                ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16),
                ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1),
                ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2),
                ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8),
                ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8),
@@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
                }
    shape_info = (tuple(input_x1.shape), tuple(input_x2.shape))
    diag_opt = False
    if input_x1.shape[0] * input_x1.shape[3] > diag_size:
        diag_opt = True
    if shape_info not in tile_map:
        raise ValueError("shape %s is not supported" % str(shape_info))
    mo_tile, ko_tile, no_tile = tile_map[shape_info]
    return mo_tile, ko_tile, no_tile, diag_opt
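The tiling table keys on the fractal shapes of both operands. A minimal sketch of a lookup, using a stand-in object with a `.shape` attribute (the shapes are ones that appear in the table above):

```python
from collections import namedtuple

# Stand-in for the TIK tensor handles passed in; only .shape is needed here.
FakeTensor = namedtuple("FakeTensor", ["shape"])

x1 = FakeTensor(shape=(32, 32, 16, 16))
x2 = FakeTensor(shape=(128, 32, 16, 16))
# Returns (mo_tile, ko_tile, no_tile, diag_opt); this pair maps to (8, 8, 16),
# and diag_opt is True because 32 * 16 = 512 exceeds the 128 diagonal size.
print(get_cus_tile_info(x1, x2, diag_size=128))
```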
def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
                         res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128):
    ko, mo, mi, ki = input_x1.shape
@@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
        raise ValueError("shape of input_x1 or input_x2 is not supported!")
    if not trans_a or not trans_b:
        raise ValueError("only trans_a=False and trans_b=False are supported!")
    core_m_num = mo // mo_tile
    loop_n_num = no // no_tile
    if loop_n_num * core_m_num <= maxblocknum:
@@ -432,7 +440,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
    else:
        raise ValueError("Does not support this scenario!")
    block_num = core_m_num * core_n_num
    loop_k_num = ko // ko_tile
    if diag_opt:
        loop_k_num = diag_outer // ko_tile
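When `diag_opt` is set, the k-loop count shrinks from `ko // ko_tile` to `diag_outer // ko_tile`, which suggests only a diagonal band of the K axis is traversed instead of the whole reduction. A small arithmetic sketch of the two loop counts, assuming `diag_outer` counts the fractal blocks inside the diagonal band (all values hypothetical):

```python
# Hypothetical sizes: full K has 128 outer fractal blocks, tile covers 8,
# and the diagonal band spans 16 outer blocks.
ko, ko_tile, diag_outer = 128, 8, 16

loop_k_full = ko // ko_tile          # 16 iterations over the whole K axis
loop_k_diag = diag_outer // ko_tile  # 2 iterations over the diagonal band only
print(loop_k_full, loop_k_diag)
```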
@@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
            core_n = block_idx % core_n_num
            with tik_instance.for_range(0, loop_n_num) as cc_n:
                res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
                                              name="resMatmul_L0C", scope=tik.scope_cc)
                with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
                    # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1
                    input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_ub",
@@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
                        input_x2_cast_ub[count * repeate_times_max * vectorfp32_size],
                        input_x2_ub[count * repeate_times_max * vectorfp32_size],
                        repeate_num, 1, 1, 4, 8)
                    input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1,
                                           no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0)
                    # input_x1 -> input_x1_L1
                    input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0],
                                                      name="input_x1_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x1_L1, input_x1[k_idx, core_m * mo_tile, 0, 0], 0,
                                           ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize,
                                           (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0)
                    # input_x2_L1 -> input_x2_L0B
                    input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0],
                                                       name="input_x2_L0B", scope=tik.scope_cb)
                    with tik_instance.for_range(0, ko_tile_inner) as cc2:
                        tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0,
                                              no_tile, ko_tile_inner, 0, True)
                    # input_x1_L1 -> input_x1_L0A
                    input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0],
                                                       name="input_x1_L0A", scope=tik.scope_ca)
                    with tik_instance.for_range(0, mo_tile) as cc1:
                        tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0,
                                              ko_tile_inner, mo_tile, 0, False)
                    with tik_instance.if_scope(thread_idx_k == 0):
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 0)
                    with tik_instance.else_scope():
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1)
                res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0],
                                             name="resMatmul_ub", scope=tik.scope_ubuf)
                tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1)
                tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0],
                                       res_ub, 0, no_tile, mo_tile * c0 * c0 * fp16_size // blocksize, 0,
                                       (mo - mo_tile) * c0 * c0 * fp16_size // blocksize)
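The hunk above stages the fp32 right-hand operand through UB, casts it to fp16 with `vconv` (hence the 4/8 block strides, since fp16 packs twice as many elements per 32-byte block), and only then feeds the cube unit. A NumPy sketch of the numeric effect of that cast-before-multiply, on plain 2-D stand-ins with hypothetical shapes:

```python
import numpy as np

# Fractal-free stand-ins for one (mo_tile, ko_tile) x (ko_tile, no_tile) step.
x1 = np.random.rand(128, 64).astype(np.float16)   # left operand already fp16
x2_fp32 = np.random.rand(64, 32).astype(np.float32)

# fp32 -> fp16 cast performed by the vconv stage before the cube matmul.
x2_fp16 = x2_fp32.astype(np.float16)
out = x1 @ x2_fp16   # the cube unit multiplies fp16 inputs
print(out.dtype, out.shape)
```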
mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
@@ -18,37 +18,35 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tik
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("matmulcubefraczrightmul.so") \
    .compute_cost(10) \
    .kernel_name("CusMatMulCubeFraczRightMul") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .input(2, "x3", False, "required", "all") \
    .input(3, "x4", False, "optional", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default,
                  DataType.F16_Default, DataType.F32_FracZ) \
    .get_op_info()
@op_info_register(cus_matmul_cube_fracz_right_mul_op_info)
def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False,
                               trans_b=False, kernel_name="matmulcube"):
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
@@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
    input_x3_shape = input_x3.get("shape")
    input_x3_dtype = input_x3.get("dtype").lower()
    output_shape = output_y.get("shape")
    Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"),
                 ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"),
                 ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
                 ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'),
                 ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'),
                 ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'),
@@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
                 ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
                 ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')]
    input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype,
                   tuple(input_x3_shape), input_x3_dtype)
    if input_shape not in Supported:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))
@@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
    return tik_instance
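Semantically, the kernel built above multiplies the two fractal operands and then scales every element of the product by the single fp32 scalar carried in `x3`. A NumPy sketch of that contract on plain 2-D stand-ins (fractal layout and tiling omitted; shapes hypothetical):

```python
import numpy as np

x1 = np.random.rand(128, 512).astype(np.float16)
x2 = np.random.rand(512, 512).astype(np.float16)
matrix_max = np.float32(0.25)  # the single fp32 scalar held in x3

# matmul on the cube unit, then a vmuls-style scalar multiply in fp32
res = (x1.astype(np.float32) @ x2.astype(np.float32)) * matrix_max
print(res.shape, res.dtype)
```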


def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res):
    diag_size = 128
@@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                                              name="resMatmul_L0C", scope=tik.scope_cc)
                with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
                    if diag_opt:
                        k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner
                    else:
                        k_idx = thread_idx_k * ko_tile_inner
                    # input_x1 -> input_x1_L1
@@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                    input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
                                                      name="input_x2_L1", scope=tik.scope_cbuf)
                    tik_instance.data_move(input_x2_L1,
                                           input_x2[(core_n * loop_n_num + cc_n) * no_tile, k_idx, 0, 0],
                                           0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize,
                                           (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0)
@@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                        tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B,
                                          mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1)
                res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
                                             name="resMatmul_ub", scope=tik.scope_ubuf)
                tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0)
                input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf,
                                                       name="input_3_local_UB")
                tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0)
                matrix_max_scalar = tik_instance.Scalar("float32")
@@ -236,7 +236,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
                            res_ub[count * repeate_times_max * vectorfp32_size],
                            res_ub[count * repeate_times_max * vectorfp32_size],
                            matrix_max_scalar, repeate_num, 1, 1, 8, 8)
                tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile,
                                           (core_m * loop_m_num + cc_m) * mo_tile, 0, 0],
                                       res_ub, 0, no_tile,
mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py
@@ -18,13 +18,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import

import te.lang.cce
import te.platform.cce_params as cce
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tvm
from topi import generic
from topi.cce import util

# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
@@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
    .compute_cost(10) \
    .kernel_name("CusMatMulCube") \
    .partial_flag(True) \
    .attr("transpose_a", "required", "bool", "all") \
    .attr("transpose_b", "required", "bool", "all") \
    .input(0, "x1", False, "required", "all") \
    .input(1, "x2", False, "required", "all") \
    .input(2, "x3", False, "optional", "all") \
@@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
    .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \
    .get_op_info()


# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    """
@@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if m_shape != 1:
        if n_shape == 1:
            if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
                raise RuntimeError("input shape K1 should be multiple of %d"
                                   % (cce.BLOCK_IN * cce.BLOCK_IN))
        elif km_shape % k_block_size != 0:
            raise RuntimeError("input shape K1 should be multiple of %d" % cce.BLOCK_IN)
    else:
        if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
            raise RuntimeError("input shape K1 should be multiple of %d"
                               % (cce.BLOCK_IN * cce.BLOCK_IN))
    if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
        raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
@@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
    if len(shape_bias):
        if len(shape_bias) == 1:
            if is_gevm or is_gemv:
                if shape_bias[0] != m_shape * n_shape:
                    raise RuntimeError("broadcast case shape bias for gemv must be equal m*n")
            else:
                if shape_bias[0] != n_shape:
@@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
        else:
            raise RuntimeError("unsupported input shape now for batch bias case")


def _get_bias(shape_bias):
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return shape_bias
    else:
        bias_length = (bias_length // 16) * 16 + 16
        shape_bias = []
        shape_bias.append(bias_length)
        return shape_bias


def _get_input_shape(shape_x):
    dim_a = shape_x[0]
    dim_b = shape_x[1]
    res = []
    if dim_a % 16 != 0:
        dim_a = (dim_a // 16) * 16 + 16
        res.append(dim_a)
    else:
        res.append(dim_a)
    if dim_b % 16 != 0:
        dim_b = (dim_b // 16) * 16 + 16
        res.append(dim_b)
    else:
        res.append(dim_b)
    return res
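Both helpers round a dimension up to the next multiple of 16 (the cube block size) via `(x // 16) * 16 + 16`. A quick illustration:

```python
# _get_input_shape pads each dimension that is not already 16-aligned:
#   100 -> (100 // 16) * 16 + 16 = 112,   30 -> 32,   64 stays 64
print(_get_input_shape([100, 30]))  # [112, 32]
print(_get_input_shape([64, 64]))   # [64, 64]
```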
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                    kernel_name="matmulcube"):
    shape_a = input_x1.get("shape")
    shape_b = input_x2.get("shape")
@@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
    try:
        trans_a_f = bool(1 - trans_a)
        if src_dtype == "float32" or src_dtype == "int32":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
@@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
                return False
            elif shape_a[1] != shape_b[0]:
                return False
        if trans_a_f and trans_b and shape_b[1] == 1:
            return False
        if src_dtype == "float16":
            if len(shape_a) != 2 and len(shape_b) != 2:
                return False
@@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
            else:
                m_shape = shape_a[0]
                k_shape = shape_a[1]
            if trans_b:
                n_shape = shape_b[0]
                k_b_shape = shape_b[1]
            else:
                n_shape = shape_b[1]
                k_b_shape = shape_b[0]
            if k_shape != k_b_shape:
                return False
            if m_shape == 1 or n_shape == 1:
                if k_shape % 256 != 0:
                    return False
    except RuntimeError as e:
        return False
    return True


# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_op_info)
def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
@@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
"""
shape_a
=
input_x1
.
get
(
"ori_shape"
)
shape_b
=
input_x2
.
get
(
"ori_shape"
)
if
shape_a
is
not
None
:
if
len
(
shape_a
)
<
2
:
shape_a
=
input_x1
.
get
(
"shape"
)
if
shape_b
is
not
None
:
if
len
(
shape_b
)
<
2
:
shape_b
=
input_x2
.
get
(
"shape"
)
shape_a
=
list
(
shape_a
)
shape_b
=
list
(
shape_b
)
if
input_x1
.
get
(
"format"
)
==
"FRACTAL_NZ"
:
shape_a
=
_get_input_shape
(
shape_a
)
shape_b
=
_get_input_shape
(
shape_b
)
...
...
@@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = [shape_a[1], shape_a[0]]
        trans_a = bool(1 - trans_a)
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_b = [shape_b[1], shape_b[0]]
        trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    if src_dtype == "float32" or src_dtype == "int32":
@@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    if input_x1.get("format") == "FORMAT_FRACTAL_Z":
        shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
        format_a = "fractal"
@@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    else:
        shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1])
        format_a = "ND"
    if input_x2.get("format") == "FORMAT_FRACTAL_Z":
        shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
        format_b = "fractal"
@@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
    else:
        shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1])
        format_b = "ND"
    tensor_bias = None
    tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)
    if len(shape_bias) > 0:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype)
    result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
                                format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
    with tvm.target.cce():
        schedule = generic.auto_schedule(result)
    tensor_list = [tensor_a, tensor_b, result]
    if len(shape_bias) > 0:
        tensor_list = [tensor_a, tensor_b, tensor_bias, result]
    config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)
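For float16 inputs the ND shape is refolded into fractal blocks of `block_in` by `block_reduce` (both 16 here), so an (m, k) = (32, 64) left operand with trans_a=False becomes `(32 // 16, 64 // 16, 16, 16) = (2, 4, 16, 16)`. The same arithmetic in a couple of lines:

```python
BLOCK_IN = BLOCK_REDUCE = 16  # cce block sizes for float16

m_shape, km_shape = 32, 64
shape_a_temp = (m_shape // BLOCK_IN, km_shape // BLOCK_REDUCE, BLOCK_IN, BLOCK_REDUCE)
print(shape_a_temp)  # (2, 4, 16, 16)
```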
mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusMatrixCombine"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util

cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("matrixcombine.so") \
    .compute_cost(10) \
    .kernel_name("CusMatrixCombine") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


@op_info_register(cus_matrix_combine_op_info)
def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
@@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
    blocks = 32
    matrix_dim = input_x_shape[0] * input_x_shape[1]
    if input_x_shape[0] == 1 and input_x_shape[1] == 64:
        tiling_dim = 2
        bs = 1
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0)
            tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0)
    else:
        tiling_dim = 4
        bs = input_x_shape[0]
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            zero = tik_instance.Scalar("float32")
            zero.set_as(0.0)
            with tik_instance.for_range(0, bs) as i:
@@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
                tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8)
                tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8)
                with tik_instance.for_range(0, tiling_dim) as j:
                    tik_instance.data_move(input_x_ub[j, split_dim * i],
                                           input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0)
                tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub,
                                       0, 1, tiling_dim * matrix_dim * 4 // 32, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
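Read as a whole, CusMatrixCombine zero-fills a large square matrix and copies each (split_dim x split_dim) input block onto its diagonal. A NumPy reference of that assembly, assuming that reading of the data moves above (names are illustrative):

```python
import numpy as np

def matrix_combine_reference(blocks_in):
    """blocks_in: (bs, d, d) blocks -> one block-diagonal (bs*d, bs*d) matrix."""
    bs, d, _ = blocks_in.shape
    out = np.zeros((bs * d, bs * d), dtype=blocks_in.dtype)
    for i in range(bs):
        out[i * d:(i + 1) * d, i * d:(i + 1) * d] = blocks_in[i]
    return out

blocks = np.random.rand(4, 128, 128).astype(np.float32)
combined = matrix_combine_reference(blocks)
print(combined.shape)  # (512, 512)
```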
mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
(diff collapsed, not shown)