Commit 2d0ee054 authored by: Z z00478463

for pylint 2nd

Parent 648501da
......@@ -14,14 +14,3 @@
# ============================================================================
"""custom ops"""
from .batch_matmul_impl import CusBatchMatMul
from .cholesky_trsm_impl import CusCholeskyTrsm
from .fused_abs_max1_impl import CusFusedAbsMax1
from .img2col_impl import CusImg2Col
from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft
from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight
from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast
from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul
from .matmul_cube_impl import CusMatMulCube
from .matrix_combine_impl import CusMatrixCombine
from .transpose02314_impl import CusTranspose02314
......@@ -14,29 +14,31 @@
# ============================================================================
"""batch_matmul_impl"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("batchmatmul.so") \
.compute_cost(10) \
.kernel_name("CusBatchMatMul") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.input(1, "x2", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("batchmatmul.so") \
.compute_cost(10) \
.kernel_name("CusBatchMatMul") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.input(1, "x2", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
def _get_flattern_shape(shape):
flattern_shape = 1
for dim in shape:
flattern_shape *= dim
return (flattern_shape,)
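
# --- Editor's illustration (not part of this commit) --------------------------------
# _get_flattern_shape collapses an N-D shape into a one-element tuple so that the GM
# tensors below can be declared flat. A minimal self-check for two of the supported
# batch shapes:
def _example_flatten_shape_check():
    assert _get_flattern_shape((8, 128, 128)) == (8 * 128 * 128,)  # (131072,)
    assert _get_flattern_shape((1, 64, 64)) == (64 * 64,)          # (4096,)
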
def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
......@@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_
matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
tik_instance.data_move(res[res_index + thread_idx2 * 64],
matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
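
# --- Editor's illustration (not part of this commit) --------------------------------
# A rough NumPy equivalent of what one _inner_matmul_new call appears to compute for
# the supported (batch, 128, 128) x (batch, 128, 128), transpose_b=True shapes: one
# 128-element row of the left matrix times the transposed right matrix, written back
# as a 128-element output row in two 64-element halves. The flat buffers a_flat,
# b_flat and out_flat are illustrative stand-ins for the GM tensors, not names from
# the kernel.
import numpy as np

def _inner_matmul_reference(a_flat, a_index, b_flat, b_index, out_flat, out_index):
    a_row = a_flat[a_index:a_index + 128]                          # input_1_local_UB
    b_mat = b_flat[b_index:b_index + 128 * 128].reshape(128, 128)  # one batch of input2
    out_flat[out_index:out_index + 128] = a_row @ b_mat.T          # row . B^T
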
def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf)
tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0)
with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
input_2_local_UB = tik_instance.Tensor(dtype, [32*64], name="input_2_local_UB",
input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB",
scope=tik.scope_ubuf)
t_1_local_UB = input_2_local_UB
matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB",
......@@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu
1, 1, 1, 8)
tik_instance.data_move(res[res_index + thread_idx2 * 32],
matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0)
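
# --- Editor's illustration (not part of this commit) --------------------------------
# _inner_matmul_new_1_64_32_64 is the narrower variant of the reference sketch above:
# a 64-element row of the left matrix times a transposed 64 x 64 right matrix, with
# the 64 outputs written back in two 32-element halves (one per thread_idx2).
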
@op_info_register(cus_batchmatmul_op_info)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
if util.get_product_version() == util.VERSION_MINI:
......@@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
dtype, input_x2.get("dtype").lower()))
input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
((36, 128, 128), (36, 128, 128), "float32", False, True),
((5, 128, 128), (5, 128, 128), "float32", False, True),
((18, 128, 128), (18, 128, 128), "float32", False, True),
((16, 128, 128), (16, 128, 128), "float32", False, True),
((9, 128, 128), (9, 128, 128), "float32", False, True),
((1, 64, 64), (1, 64, 64), "float32", False, True),
((1, 128, 128), (1, 128, 128), "float32", False, True),
((4, 128, 128), (4, 128, 128), "float32", False, True),
((2, 128, 128), (2, 128, 128), "float32", False, True)]
((36, 128, 128), (36, 128, 128), "float32", False, True),
((5, 128, 128), (5, 128, 128), "float32", False, True),
((18, 128, 128), (18, 128, 128), "float32", False, True),
((16, 128, 128), (16, 128, 128), "float32", False, True),
((9, 128, 128), (9, 128, 128), "float32", False, True),
((1, 64, 64), (1, 64, 64), "float32", False, True),
((1, 128, 128), (1, 128, 128), "float32", False, True),
((4, 128, 128), (4, 128, 128), "float32", False, True),
((2, 128, 128), (2, 128, 128), "float32", False, True)]
if input_shape not in support_shape:
raise RuntimeError("input_shape %s is not supported" % str(input_shape))
# if not transpose_a and transpose_b:
batch, m, k = x1_shape
_, n, _ = x2_shape
input1_shape = _get_flattern_shape(x1_shape)
input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
input2_shape = _get_flattern_shape(x2_shape)
input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
output_shape = x1_shape
res_shape = _get_flattern_shape(output_shape)
res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)
if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
with tik_instance.for_range(0, 18, block_num=18) as block_idx:
with tik_instance.for_range(0, 2) as cc0:
with tik_instance.for_range(0, 128, thread_num=2) as cc1:
input1_index = block_idx * 32768 + cc0*16384 + cc1 * 128
input2_index = block_idx * 32768 + cc0*16384
res_index = block_idx*32768 + cc0*16384 + cc1*128
input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
input2_index = block_idx * 32768 + cc0 * 16384
res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
_inner_matmul_new(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
input1, input1_index,
input2, input2_index,
res, res_index)
if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
with tik_instance.for_range(0, 30, block_num=30) as block_idx:
with tik_instance.for_range(0, 11) as cc1_db:
with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
tik_instance.data_move(input_1_local_UB, input1[(block_idx//6)*16384 + (block_idx % 6)*2816 + cc1_db * 256 + thread_idx*128], 0, 1, 16, 0, 0)
input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
scope=tik.scope_ubuf)
t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
scope=tik.scope_ubuf)
tik_instance.data_move(input_1_local_UB, input1[
(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1,
16, 0, 0)
with tik_instance.for_range(0, 2) as vec_i:
tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0,
64, 1, 1, 16, 0)
......@@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
scope=tik.scope_ubuf)
t_1_local_UB = input_2_local_UB
bisec_last_axis_local_UB = input_2_local_UB
matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB",
matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
name="matmul_hybrid_f_t_local_UB",
scope=tik.scope_ubuf)
matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64],
name="matmul_hybrid_f_t_local_UB_dst_tmp",
scope=tik.scope_ubuf)
tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
tik_instance.data_move(input_2_local_UB, input2[(block_idx//6) * 16384 + thread_idx2*8192], 0, 1,
tik_instance.data_move(input_2_local_UB,
input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1,
1024, 0, 0)
tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8)
tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1,
16, 16, 16)
tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
with tik_instance.for_range(0, 64) as cc6:
tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6*128],
tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
bisec_last_axis_local_UB[cc6 * 128],
1, 1, 1, 8)
tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
tik_instance.data_move(res[(block_idx//6)*16384 + (block_idx%6)*2816 + cc1_db*256 +
thread_idx*128 + thread_idx2*64],
matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
tik_instance.data_move(
res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 +
thread_idx * 128 + thread_idx2 * 64],
matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)
if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
with tik_instance.for_range(0, 18, block_num=18) as block_idx:
with tik_instance.for_range(0, 128, thread_num=2) as cc0:
input1_index = block_idx * 16384 + cc0 * 128
input2_index = block_idx * 16384
res_index = block_idx*16384 + cc0*128
res_index = block_idx * 16384 + cc0 * 128
_inner_matmul_new(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
input1, input1_index,
input2, input2_index,
res, res_index)
if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
with tik_instance.for_range(0, 27, block_num=27) as block_idx:
with tik_instance.for_range(0, 42, thread_num=2) as cc0:
input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0 * 128
input2_index = (block_idx//3) * 16384
res_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0*128
input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
input2_index = (block_idx // 3) * 16384
res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
_inner_matmul_new(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
input1, input1_index,
input2, input2_index,
res, res_index)
with tik_instance.if_scope((block_idx % 3) < 2):
input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + 42*128
input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
input2_index = (block_idx // 3) * 16384
res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42*128
res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
_inner_matmul_new(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
input1, input1_index,
input2, input2_index,
res, res_index)
if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
with tik_instance.for_range(0, 32, block_num=32) as block_idx:
with tik_instance.for_range(0, 2, thread_num=2) as cc0:
......@@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr
input2_index = 0
res_index = block_idx * 128 + cc0 * 64
_inner_matmul_new_1_64_32_64(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
input1, input1_index,
input2, input2_index,
res, res_index)
input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
((2, 128, 128), (2, 128, 128), "float32", False, True),
((4, 128, 128), (4, 128, 128), "float32", False, True),
((8, 128, 128), (8, 128, 128), "float32", False, True),
((16, 128, 128), (16, 128, 128), "float32", False, True)
]
]
if input_shape in input_shape_list:
block_num = 32
input1_unit_size = 128
input2_unint_size = 128*128
input2_unint_size = 128 * 128
with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
block_process_ele_num = (batch * m * k) // block_num
loop_time = (batch*m*k)//block_num//input1_unit_size
loop_time = (batch * m * k) // block_num // input1_unit_size
thread_num = 2
with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
input1_index = block_idx*block_process_ele_num + cc0*input1_unit_size
input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
if batch > 1:
input2_index = block_idx//(block_num//batch) * input2_unint_size
input2_index = block_idx // (block_num // batch) * input2_unint_size
else:
input2_index = 0
res_index = block_idx*block_process_ele_num + cc0*input1_unit_size
res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
_inner_matmul_new(tik_instance, dtype,
input1, input1_index,
input2, input2_index,
res, res_index)
tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
return tik_instance
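
# --- Editor's illustration (not part of this commit) --------------------------------
# CusBatchMatMul is a registered TBE kernel and, as the .get("shape")/.get("dtype")
# calls above show, receives its tensors as plain dicts. A hypothetical invocation for
# one of the supported shapes might look like this (building the kernel requires the
# Ascend TBE toolchain, so the call itself is shown as a comment):
def _example_batchmatmul_args(batch=8, n=128):
    desc = {"shape": (batch, n, n), "dtype": "float32"}
    return desc, desc, desc
# x1, x2, y = _example_batchmatmul_args()
# CusBatchMatMul(x1, x2, y, transpose_a=False, transpose_b=True, kernel_name="batchmatmul")
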
......@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusCholeskyTrsm"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("choleskytrsm.so") \
.compute_cost(10) \
.kernel_name("CusCholeskyTrsm") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("choleskytrsm.so") \
.compute_cost(10) \
.kernel_name("CusCholeskyTrsm") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
@op_info_register(cus_cholesky_trsm_op_info)
def CusCholeskyTrsm(input_x,output, kernel_name):
def CusCholeskyTrsm(input_x, output, kernel_name):
input_x_shape = input_x.get("shape")
output_shape = output.get("shape")
split_dim = 128
......@@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
with tik_instance.for_range(0,blocks,block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="input_x_ub", scope=tik.scope_ubuf)
temp_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="temp_ub", scope=tik.scope_ubuf)
with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub", scope=tik.scope_ubuf)
temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf)
assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf)
assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf)
with tik_instance.for_range(0,split_dim) as i:
tik_instance.data_move(input_x_ub[i,0], input_x[block_index * split_dim + i, block_index * split_dim], 0, 1, vector_repeat_times * 8, 0, 0)
scalar1 = tik_instance.Scalar("float32", init_value = -0.5)
with tik_instance.for_range(0, split_dim) as i:
tik_instance.data_move(input_x_ub[i, 0], input_x[block_index * split_dim + i, block_index * split_dim], 0,
1, vector_repeat_times * 8, 0, 0)
scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
with tik_instance.for_range(0, split_dim) as i:
scalar2= tik_instance.Scalar("float32")
tik_instance.vln(64, assist_1_ub[0], input_x_ub[i,0], vector_repeat_times, 1, 1, 8, 8)
scalar2 = tik_instance.Scalar("float32")
tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8)
tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8)
tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8)
scalar2.set_as(assist_1_ub[i])
tik_instance.vmuls(64, input_x_ub[i,0], input_x_ub[i,0], scalar2, vector_repeat_times, 1, 1, 8, 8)
tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8)
with tik_instance.for_range(i + 1, split_dim) as j:
scalar3= tik_instance.Scalar("float32")
scalar3 = tik_instance.Scalar("float32")
scalar3.set_as(input_x_ub[i, j])
tik_instance.vmuls(64,temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8)
tik_instance.vsub(64,input_x_ub[i+1,0], input_x_ub[i+1,0], temp_ub[i+1,0], (split_dim-1-i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8)
tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0],
(split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
zero = tik_instance.Scalar("float32")
zero.set_as(0.0)
one = tik_instance.Scalar("float32")
one.set_as(1.0)
with tik_instance.for_range(0, split_dim) as i:
tik_instance.vector_dup(64, temp_ub[i,0], zero, vector_repeat_times, 1, 8)
tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8)
temp_ub.__setitem__(i * split_dim + i, one)
chol_diag_element_final = tik_instance.Scalar("float32")
......@@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name):
with tik_instance.for_range(0, i) as j:
chol_diag_element_loop = tik_instance.Scalar("float32")
chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, vector_repeat_times,1,1,8,8)
tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,1,1,1,8,8,8)
tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop,
vector_repeat_times, 1, 1, 8, 8)
tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8)
temp_scalar = tik_instance.Scalar("float32")
temp_scalar.set_as(input_x_ub[index, index])
chol_diag_element = tik_instance.Scalar("float32")
chol_diag_element.set_as(1.0 / temp_scalar)
tik_instance.vsub(64,temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,1,1,1,8,8,8)
tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,vector_repeat_times,1,1,8,8)
tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8,
8)
tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element, vector_repeat_times, 1, 1,
8, 8)
tik_instance.data_move(res[block_index,0,0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim,0,0)
tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0)
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
return tik_instance
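
# --- Editor's illustration (not part of this commit) --------------------------------
# Per the op name and the loop structure above, each 128 x 128 diagonal block appears
# to be Cholesky-factorised and the triangular factor then inverted by substitution
# into temp_ub. A rough NumPy reference, up to the exact triangular orientation of the
# result:
import numpy as np

def _cholesky_trsm_reference(block):
    # block: one (split_dim, split_dim) symmetric positive-definite diagonal block.
    factor = np.linalg.cholesky(block)   # block = factor @ factor.T
    return np.linalg.inv(factor)
# Note: the vln -> vmuls(-0.5) -> vexp sequence in the first loop is exp(-0.5 * ln(x)),
# i.e. a vectorised x ** -0.5, used to scale each pivot row by 1 / sqrt(diagonal).
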
......@@ -17,17 +17,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import
import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from te import tik
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
......@@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \
.dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
.get_op_info()
# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
"""
......@@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if m_shape != 1:
if n_shape == 1:
if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0:
if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
raise RuntimeError("input shape K1 should be multiple of %d"
% (cce.BLOCK_IN*cce.BLOCK_IN))
elif km_shape%k_block_size != 0:
% (cce.BLOCK_IN * cce.BLOCK_IN))
elif km_shape % k_block_size != 0:
raise RuntimeError(
"input shape K1 should be multiple of %d" % cce.BLOCK_IN)
else:
if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0:
if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
raise RuntimeError("input shape K1 should be multiple of %d"
% (cce.BLOCK_IN*cce.BLOCK_IN))
% (cce.BLOCK_IN * cce.BLOCK_IN))
if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
......@@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if len(shape_bias):
if len(shape_bias) == 1:
if is_gevm or is_gemv:
if shape_bias[0] != m_shape*n_shape:
if shape_bias[0] != m_shape * n_shape:
raise RuntimeError("broadcast case shape bias for gemv must be equal m*n")
else:
if shape_bias[0] != n_shape:
......@@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
else:
raise RuntimeError("unsupport input shape now for batch bias case")
def _get_bias(shape_bias):
bias_length = shape_bias[0]
if bias_length % 16 ==0:
if bias_length % 16 == 0:
return shape_bias
else:
bias_length = (bias_length // 16)*16 + 16
bias_length = (bias_length // 16) * 16 + 16
shape_bias = []
shape_bias.append(bias_length)
return shape_bias
def _get_input_shape(shape_x):
dim_a = shape_x[0]
dim_b = shape_x[1]
res = []
if dim_a % 16 !=0:
dim_a = (dim_a // 16)*16 + 16
if dim_a % 16 != 0:
dim_a = (dim_a // 16) * 16 + 16
res.append(dim_a)
else:
res.append(dim_a)
if dim_b % 16 !=0:
dim_b = (dim_b // 16)*16 + 16
if dim_b % 16 != 0:
dim_b = (dim_b // 16) * 16 + 16
res.append(dim_b)
else:
res.append(dim_b)
return res
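
# --- Editor's illustration (not part of this commit) --------------------------------
# _get_bias and _get_input_shape both round sizes up to the next multiple of 16, the
# cube block size. A minimal self-check:
def _example_alignment_check():
    assert _get_input_shape([100, 130]) == [112, 144]
    assert _get_bias([20]) == [32]
    assert _get_input_shape([128, 64]) == [128, 64]  # already 16-aligned, unchanged
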
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
shape_a = input_x1.get("shape")
shape_b = input_x2.get("shape")
......@@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
if bias is not None and bool(bias):
shape_bias = bias.get("shape")
try:
trans_a_f = bool(1-trans_a)
trans_a_f = bool(1 - trans_a)
if src_dtype == "float32" or src_dtype == "int32":
if len(shape_a) != 2 and len(shape_b) != 2:
return False
......@@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
return False
elif shape_a[1] != shape_b[0]:
return False
if trans_a_f and trans_b and shape_b[1] == 1:
return False
if src_dtype == "float16":
if len(shape_a) != 2 and len(shape_b) != 2:
return False
if trans_a:
m_shape = shape_a[1]
k_shape = shape_a[0]
else:
m_shape = shape_a[0]
k_shape = shape_a[1]
if trans_b:
n_shape = shape_b[0]
k_b_shape = shape_b[1]
else:
n_shape = shape_b[1]
k_b_shape = shape_b[0]
if k_shape != k_b_shape:
return False
if m_shape == 1 or n_shape == 1:
if k_shape % 256 != 0:
return False
except RuntimeError as e:
return False
return True
# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str)
@op_info_register(matmul_cube_dense_left_op_info)
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
kernel_name="matmulcube"):
"""
calculating matrix multiplication with bias, C = A*B + bias, support input
data with fractal format.
......@@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
print(shape_a, shape_b)
print("============")
if input_x2.get("format") == "FRACTAL_Z":
n,c,h,w = shape_b
n, c, h, w = shape_b
c0 = 16
c1 = c // c0
if c1 == 0:
c1 = 1
shape_b = [n, c1 * h * w * c0]
shape_a = [n,n]
shape_a = [n, n]
if input_x1.get("format") == "FRACTAL_Z":
n,c,h,w = shape_a
n, c, h, w = shape_a
c0 = 16
c1 = c // c0
if c1 == 0:
c1 = 1
shape_a = [n, c1 * h * w * c0]
shape_b = [c1 * h * w * c0, c1 * h * w * c0]
if input_x2.get("format") == "FRACTAL_NZ":
shape_a = [shape_b[0], shape_b[0]]
shape_b = shape_b
if input_x1.get("format") == "FRACTAL_NZ":
shape_a = shape_a
shape_b = [shape_a[1], shape_a[1]]
shape_a = list(shape_a)
shape_b = list(shape_b)
shape_a = _get_input_shape(shape_a)
shape_b = _get_input_shape(shape_b)
util.check_kernel_name(kernel_name)
util.check_shape_rule(shape_a)
util.check_shape_rule(shape_b)
util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
shape_a = [shape_a[1], shape_a[0]]
trans_a = bool(1-trans_a)
trans_a = bool(1 - trans_a)
shape_b = [shape_b[1], shape_b[0]]
trans_b = bool(1-trans_b)
trans_b = bool(1 - trans_b)
shape_bias = ()
if bias is not None and bool(bias):
shape_bias = bias.get("shape")
shape_bias = list(shape_bias)
shape_bias = _get_bias(shape_bias)
src_dtype = input_x1.get("dtype").lower()
dst_dtype = output_y.get("dtype").lower()
_shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
m_shape = shape_a[len(shape_a) - 2]
km_shape = shape_a[len(shape_a) - 1]
kn_shape = shape_b[len(shape_a) - 2]
n_shape = shape_b[len(shape_a) - 1]
if src_dtype == "float16":
block_reduce = cce.BLOCK_REDUCE
block_in = cce.BLOCK_IN
block_out = cce.BLOCK_OUT
if trans_a and km_shape == 1:
block_in = cce.BLOCK_VECTOR
if not trans_a and m_shape == 1:
block_in = cce.BLOCK_VECTOR
if trans_b and kn_shape == 1:
block_out = cce.BLOCK_VECTOR
if not trans_b and n_shape == 1:
block_out = cce.BLOCK_VECTOR
if trans_a:
shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
else:
shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
if trans_b:
shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
else:
......@@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
format_a = "FRACTAL_NZ"
shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
format_b = "FRACTAL_NZ"
print("=======================================")
print(shape_a_temp, shape_b_temp)
print(format_a, format_b)
......@@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F
dtype=src_dtype)
tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b',
dtype=src_dtype)
if len(shape_bias) > 0:
tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias',
dtype=dst_dtype)
if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
if util.get_product_version() == util.VERSION_MINI:
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
else:
tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
with tik_instance.for_range(0,32,block_num=32) as block_index:
resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, name = "resMatmul_local_UB")
resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, name = "resMatmul_local_UB")
input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A")
input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, name = "input_2_local_L1")
input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, name = "input_1_local_L1")
input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B")
with tik_instance.for_range(0, 32, block_num=32) as block_index:
resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
name="resMatmul_local_UB")
resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
name="resMatmul_local_UB")
input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
name="input_1_local_L1_local_L0A")
input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
name="input_2_local_L1")
input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
name="input_1_local_L1")
input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
name="input_2_local_L1_local_L0B")
core_m_idx = block_index % 8
core_n_idx = block_index // 8
with tik_instance.if_scope(core_m_idx != 7):
tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, 55 * 16, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 128, 55 * 16, 0)
tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
55 * 16, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
32, 128, 55 * 16, 0)
with tik_instance.for_range(0, 8) as cc12:
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False)
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8,
8, 0, False)
with tik_instance.for_range(0, 2) as cc6:
with tik_instance.for_range(0, 8) as cc121:
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 128, 128, 256, 0)
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096],
input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
input_2_local_L1_local_L0B, 128, 128, 256, 0)
tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1)
tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 256 // 2 , 0, 55 * 16 * 2 // 2)
tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
with tik_instance.else_scope():
tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, 56 * 16, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 112, 56 * 16, 0)
tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
56 * 16, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
32, 112, 56 * 16, 0)
with tik_instance.for_range(0, 7) as cc10:
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False)
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7,
7, 0, False)
with tik_instance.for_range(0, 2) as cc5:
with tik_instance.for_range(0, 7) as cc101:
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 112, 112, 256, 0)
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096],
input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
input_2_local_L1_local_L0B, 112, 112, 256, 0)
tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1)
tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 224 // 2 , 0, 56 * 16 * 2 // 2)
tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul])
return tik_instance
else:
print("come into tbe, shape is error!")
result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
with tvm.target.cce():
schedule = generic.auto_schedule(result)
tensor_list = [tensor_a, tensor_b, result]
if len(shape_bias) > 0:
tensor_list = [tensor_a, tensor_b, tensor_bias, result]
config = {"print_ir": False,
"name": kernel_name,
"tensor_list": tensor_list}
te.lang.cce.cce_build_code(schedule, config)
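
# --- Editor's illustration (not part of this commit) --------------------------------
# For float16 inputs CusMatMulCubeDenseLeft folds each 2-D shape into the 4-D fractal
# layout used by the cube unit. A minimal reimplementation of that mapping for the
# non-transposed case (block_in = block_reduce = 16), for illustration only:
def _example_fractal_shape(m_shape, km_shape, block=16):
    return (m_shape // block, km_shape // block, block, block)
# e.g. _example_fractal_shape(1008, 1008) == (63, 63, 16, 16), which is exactly the
# left-matrix shape the hand-written TIK branch above is specialised for.
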
......@@ -18,15 +18,10 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import
import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from te import tik
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
.fusion_type("OPAQUE") \
......@@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \
.input(2, "x3", False, "required", "all") \
.input(3, "x4", False, "optional", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracNZ) \
.dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default,
DataType.F32_FracNZ) \
.get_op_info()
@op_info_register(matmul_cube_dense_right_op_info)
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
kernel_name="matmulcube"):
shape_a_temp = (128, 63, 16, 16)
shape_b_temp = (128, 128, 16, 16)
shape_output = output_y.get("shape")
matrix_max_shape = (1,)
support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),]
support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ]
shape_a_input = input_x1.get("shape")
shape_b_input = input_x2.get("shape")
matrix_max_input = input_x3.get("shape")
input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input))
if input_shape not in support_shape:
raise RuntimeError("input_shape %s is not supported" % str(input_shape))
if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128:
if util.get_product_version() == util.VERSION_MINI:
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
......@@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}
tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm)
input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm)
resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm)
with tik_instance.for_range(0, 32, block_num=32) as block_index:
core_m_idx = block_index // 16
core_n_idx = block_index % 16
matrix_max_scalar = tik_instance.Scalar("float32")
matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope = tik.scope_ubuf, name = "matrix_max_local_UB")
matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="matrix_max_local_UB")
tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0)
matrix_max_scalar.set_as(matrix_max_local_UB[0])
resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB")
resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB1")
resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C")
resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C1")
resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf,
name="resMatmul_local_UB")
resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf,
name="resMatmul_local_UB1")
resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc,
name="resMatmul_local_UB_local_L0C")
resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc,
name="resMatmul_local_UB_local_L0C1")
input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca,
name="input_1_local_L1_local_L0A")
input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
name="input_2_local_L1")
input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
name="input_2_local_L11")
input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf,
name="input_1_local_L1")
input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf,
name="input_1_local_L11")
input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
name="input_2_local_L1_local_L0B")
input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
name="input_2_local_L1_local_L0B1")
input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A")
input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L1")
input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L11")
input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L1")
input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L11")
input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B")
input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B1")
with tik_instance.if_scope(core_m_idx == 0):
with tik_instance.for_range(0, 2) as cc1:
tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0)
tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
128, 1920, 0)
tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752,
0)
with tik_instance.for_range(0, 8) as cc10:
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0,
8, 8, 0, True)
with tik_instance.for_range(0, 16) as cc101:
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0)
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256],
0, 8, 16, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
input_2_local_L1_local_L0B, 256, 128, 128, 0)
tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8)
tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504)
tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
matrix_max_scalar, 255, 1, 1, 8, 8)
tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
matrix_max_scalar, 2, 1, 1, 8, 8)
tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512,
0, 1504)
with tik_instance.else_scope():
tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0)
tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128,
1920, 0)
tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0)
with tik_instance.for_range(0, 8) as cc10:
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8,
8, 0, True)
with tik_instance.for_range(0, 16) as cc101:
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0)
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8,
16, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B,
256, 128, 128, 0)
tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8)
tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504)
tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0)
tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar,
255, 1, 1, 8, 8)
tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2,
1, 1, 8, 8)
tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0,
1504)
tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128,
1920, 0)
tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0)
with tik_instance.for_range(0, 8) as cc102:
tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True)
tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0,
8, 8, 0, True)
with tik_instance.for_range(0, 16) as cc103:
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B1, 240, 128, 128, 0)
tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0,
8, 15, 0, False)
tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A,
input_2_local_L1_local_L0B1, 240, 128, 128, 0)
tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0)
tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB1[255*64], resMatmul_local_UB1[255*64], matrix_max_scalar, 225,1,1,8,8)
tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255, 1, 1, 8, 8)
tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64], matrix_max_scalar,
225, 1, 1, 8, 8)
tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536)
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
return tik_instance
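
# --- Editor's illustration (not part of this commit) --------------------------------
# Ignoring the fractal tiling and the load2d transpose flags, CusMatMulCubeDenseRight
# multiplies the two matrices and then scales every element by the single value read
# from x3 (matrix_max), as the vmuls calls with matrix_max_scalar show. A rough NumPy
# equivalent on plain 2-D arrays:
import numpy as np

def _dense_right_reference(a, b, matrix_max):
    return (a @ b) * np.float32(matrix_max)
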
......@@ -17,11 +17,12 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import
import te.platform.cce_params as cce
from te import tvm
from topi.cce import util
from te import tik
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
......@@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \
.dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \
.get_op_info()
# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
"""
......@@ -137,6 +139,7 @@ src_dtype: str
else:
raise RuntimeError("unsupport input shape now for batch bias case")
def _get_bias(shape_bias):
bias_length = shape_bias[0]
if bias_length % 16 == 0:
......@@ -147,6 +150,7 @@ def _get_bias(shape_bias):
shape_bias.append(bias_length)
return shape_bias
def _get_input_shape(shape_x):
dim_a = shape_x[0]
dim_b = shape_x[1]
......@@ -164,6 +168,7 @@ def _get_input_shape(shape_x):
res.append(dim_b)
return res
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
shape_a = input_x1.get("shape")
shape_b = input_x2.get("shape")
......@@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
return False
elif shape_a[1] != shape_b[0]:
return False
if trans_a_f and trans_b and shape_b[1] == 1:
return False
if src_dtype == "float16":
if len(shape_a) != 2 and len(shape_b) != 2:
return False
if trans_a:
m_shape = shape_a[1]
k_shape = shape_a[0]
else:
m_shape = shape_a[0]
k_shape = shape_a[1]
if trans_b:
n_shape = shape_b[0]
k_b_shape = shape_b[1]
else:
n_shape = shape_b[1]
k_b_shape = shape_b[0]
if k_shape != k_b_shape:
return False
if m_shape == 1 or n_shape == 1:
if k_shape % 256 != 0:
return False
except RuntimeError as e:
return False
return True
# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_fracz_left_cast_op_info)
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
......@@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
c1 = 1
shape_b = [n, c1 * h * w * c0]
shape_a = [n, n]
if input_x1.get("format") == "FRACTAL_Z":
n, c, h, w = shape_a
c0 = 16
......@@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
if input_x2.get("format") == "FRACTAL_NZ":
shape_a = [shape_b[0], shape_b[0]]
shape_b = shape_b
if input_x1.get("format") == "FRACTAL_NZ":
shape_a = shape_a
shape_b = [shape_a[1], shape_a[1]]
shape_a = list(shape_a)
shape_b = list(shape_b)
shape_a = _get_input_shape(shape_a)
shape_b = _get_input_shape(shape_b)
util.check_kernel_name(kernel_name)
util.check_shape_rule(shape_a)
util.check_shape_rule(shape_b)
util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
shape_a = [shape_a[1], shape_a[0]]
trans_a = bool(1 - trans_a)
shape_b = [shape_b[1], shape_b[0]]
trans_b = bool(1 - trans_b)
......@@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
shape_bias = bias.get("shape")
shape_bias = list(shape_bias)
shape_bias = _get_bias(shape_bias)
src_dtype = input_x1.get("dtype").lower()
_shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
m_shape = shape_a[len(shape_a) - 2]
km_shape = shape_a[len(shape_a) - 1]
kn_shape = shape_b[len(shape_a) - 2]
n_shape = shape_b[len(shape_a) - 1]
if src_dtype == "float16":
block_reduce = cce.BLOCK_REDUCE
block_in = cce.BLOCK_IN
block_out = cce.BLOCK_OUT
if trans_a and km_shape == 1:
block_in = cce.BLOCK_VECTOR
if not trans_a and m_shape == 1:
block_in = cce.BLOCK_VECTOR
if trans_b and kn_shape == 1:
block_out = cce.BLOCK_VECTOR
if not trans_b and n_shape == 1:
block_out = cce.BLOCK_VECTOR
if trans_a:
shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
else:
shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
if trans_b:
shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
else:
shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
if util.get_product_version() == util.VERSION_MINI:
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
else:
......@@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans
diag_opt=diag_opt, diag_size=DIAG_SIZE)
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul])
return tik_instance
def get_cus_tile_info(input_x1, input_x2, diag_size):
tile_map = {
((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16),
......@@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16),
((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9),
((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4),
((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4),
((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1),
((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4),
((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1),
((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16),
((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1),
((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1),
((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2),
((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8),
((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8),
......@@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size):
}
shape_info = (tuple(input_x1.shape), tuple(input_x2.shape))
diag_opt = False
if input_x1.shape[0]*input_x1.shape[3] > diag_size:
if input_x1.shape[0] * input_x1.shape[3] > diag_size:
diag_opt = True
if shape_info not in tile_map:
raise ValueError("shape %s is not supported" % str(shape_info))
mo_tile, ko_tile, no_tile = tile_map[shape_info]
return mo_tile, ko_tile, no_tile, diag_opt
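
# --- Editor's illustration (not part of this commit) --------------------------------
# get_cus_tile_info is a table lookup plus a diagonal-optimisation flag. For the first
# tile_map entry, assuming diag_size = 128 (the default of cus_cube_matmul_cast below):
#     input_x1.shape == (32, 32, 16, 16), input_x2.shape == (128, 32, 16, 16)
#     -> mo_tile, ko_tile, no_tile == 8, 8, 16
#     -> diag_opt == (32 * 16 > 128) == True
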
def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128):
ko, mo, mi, ki = input_x1.shape
......@@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
raise ValueError("shape of input_x1 or input_x2 is not supported!")
if not trans_a or not trans_b:
raise ValueError("only trans_a=False and trans_b=False be supported!")
core_m_num = mo // mo_tile
loop_n_num = no // no_tile
if loop_n_num * core_m_num <= maxblocknum:
......@@ -432,7 +440,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
else:
raise ValueError("Does not support this scenario!")
block_num = core_m_num * core_n_num
loop_k_num = ko // ko_tile
if diag_opt:
loop_k_num = diag_outer // ko_tile
......@@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
core_n = block_idx % core_n_num
with tik_instance.for_range(0, loop_n_num) as cc_n:
res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
name="resMatmul_L0C", scope=tik.scope_cc)
name="resMatmul_L0C", scope=tik.scope_cc)
with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
# input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1
input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0], name="input_x2_ub",
......@@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
input_x2_cast_ub[count * repeate_times_max * vectorfp32_size],
input_x2_ub[count * repeate_times_max * vectorfp32_size], repeate_num,
1, 1, 4, 8)
input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
name="input_x2_L1", scope=tik.scope_cbuf)
tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1,
no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0)
# input_x1 -> input_x1_L1
input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0],
name="input_x1_L1", scope=tik.scope_cbuf)
tik_instance.data_move(input_x1_L1,
input_x1[k_idx,
core_m * mo_tile, 0, 0],
0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize,
(mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0)
# input_x2_L1 -> input_x2_L0B
input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0],
name="input_x2_L0B", scope=tik.scope_cb)
with tik_instance.for_range(0, ko_tile_inner) as cc2:
tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile,
ko_tile_inner,
0, True)
# input_x1_L1 -> input_x1_L0A
input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0],
name="input_x1_L0A", scope=tik.scope_ca)
with tik_instance.for_range(0, mo_tile) as cc1:
tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner,
mo_tile, 0, False)
with tik_instance.if_scope(thread_idx_k == 0):
tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0,
ko_tile_inner * c0, no_tile * c0, 0)
with tik_instance.else_scope():
tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0,
ko_tile_inner * c0, no_tile * c0, 1)
res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0],
name="resMatmul_ub", scope=tik.scope_ubuf)
tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1)
tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0],
res_ub, 0, no_tile,
mo_tile * c0 * c0 * fp16_size // blocksize, 0,
(mo - mo_tile) * c0 * c0 * fp16_size // blocksize)
input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
name="input_x2_L1", scope=tik.scope_cbuf)
tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1,
no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0)
# input_x1 -> input_x1_L1
input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0],
name="input_x1_L1", scope=tik.scope_cbuf)
tik_instance.data_move(input_x1_L1,
input_x1[k_idx,
core_m * mo_tile, 0, 0],
0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize,
(mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0)
# input_x2_L1 -> input_x2_L0B
input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0],
name="input_x2_L0B", scope=tik.scope_cb)
with tik_instance.for_range(0, ko_tile_inner) as cc2:
tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile,
ko_tile_inner,
0, True)
# input_x1_L1 -> input_x1_L0A
input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0],
name="input_x1_L0A", scope=tik.scope_ca)
with tik_instance.for_range(0, mo_tile) as cc1:
tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner,
mo_tile, 0, False)
with tik_instance.if_scope(thread_idx_k == 0):
tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0,
ko_tile_inner * c0, no_tile * c0, 0)
with tik_instance.else_scope():
tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0,
ko_tile_inner * c0, no_tile * c0, 1)
res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0],
name="resMatmul_ub", scope=tik.scope_ubuf)
tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1)
tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0],
res_ub, 0, no_tile,
mo_tile * c0 * c0 * fp16_size // blocksize, 0,
(mo - mo_tile) * c0 * c0 * fp16_size // blocksize)
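Before moving on to the next file: the fp32-to-fp16 cast in cus_cube_matmul_cast above (the vconv call indexed by count * repeate_times_max * vectorfp32_size) converts the tile in chunks, because one vector instruction covers at most repeate_times_max repeats of vectorfp32_size lanes. A minimal plain-Python sketch of that chunking arithmetic, assuming vectorfp32_size = 64 and repeate_times_max = 255 (both are defined in the elided part of this kernel; 255 is assumed here, not taken from the diff):

def split_vconv_chunks(total_elems, vectorfp32_size=64, repeate_times_max=255):
    # Hypothetical helper, not part of the kernel: how many full-size vconv calls
    # a tile of `total_elems` fp32 values needs, plus the repeats left for a tail call.
    chunk = vectorfp32_size * repeate_times_max
    return total_elems // chunk, (total_elems % chunk) // vectorfp32_size

# e.g. no_tile = ko_tile_inner = 8, c0 = 16 -> 8 * 8 * 16 * 16 = 16384 elements
print(split_vconv_chunks(8 * 8 * 16 * 16))   # (1, 1): one full call plus a 1-repeat tail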
......@@ -18,37 +18,35 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import
import te.lang.cce
import te.platform.cce_params as cce
from te.platform.fusion_manager import fusion_manager
from te import tvm
from topi import generic
from topi.cce import util
from te import tik
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("matmulcubefraczrightmul.so") \
.compute_cost(10) \
.kernel_name("CusMatMulCubeFraczRightMul") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.input(1, "x2", False, "required", "all") \
.input(2, "x3", False, "required", "all") \
.input(3, "x4", False, "optional", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracZ) \
.get_op_info()
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("matmulcubefraczrightmul.so") \
.compute_cost(10) \
.kernel_name("CusMatMulCubeFraczRightMul") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.input(1, "x2", False, "required", "all") \
.input(2, "x3", False, "required", "all") \
.input(3, "x4", False, "optional", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default,
DataType.F32_FracZ) \
.get_op_info()
@op_info_register(cus_matmul_cube_fracz_right_mul_op_info)
def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
kernel_name="matmulcube"):
if util.get_product_version() == util.VERSION_MINI:
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
else:
......@@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
input_x3_shape = input_x3.get("shape")
input_x3_dtype = input_x3.get("dtype").lower()
output_shape = output_y.get("shape")
Supported = [((72, 8, 16, 16),"float16", (72, 72, 16, 16), "float16", (1,), "float32"),
((32, 8, 16, 16),"float16", (32, 32, 16, 16), "float16", (1,), "float32"),
((8, 32, 16, 16),"float16", (8, 8, 16, 16), "float16", (1,), "float32"),
((4, 4, 16, 16),"float16", (4, 4, 16, 16), "float16", (1,), "float32"),
Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"),
((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"),
((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'),
((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'),
((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'),
......@@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')]
input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype)
input_shape = (
tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype)
if input_shape not in Supported:
raise RuntimeError("input_shape %s is not supported" % str(input_shape))
......@@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
return tik_instance
def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
res):
diag_size = 128
......@@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
name="resMatmul_L0C", scope=tik.scope_cc)
with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k:
if diag_opt:
k_idx = (core_n*loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner
k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner
else:
k_idx = thread_idx_k * ko_tile_inner
# input_x1 -> input_x1_L1
......@@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0],
name="input_x2_L1", scope=tik.scope_cbuf)
tik_instance.data_move(input_x2_L1,
input_x2[(core_n*loop_n_num + cc_n) * no_tile,
input_x2[(core_n * loop_n_num + cc_n) * no_tile,
k_idx, 0, 0],
0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize,
(ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0)
......@@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0,
ko_tile_inner * c0, no_tile * c0, 1)
res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0],
name="resMatmul_ub", scope=tik.scope_ubuf)
name="resMatmul_ub", scope=tik.scope_ubuf)
tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0)
input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="input_3_local_UB")
tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0)
matrix_max_scalar = tik_instance.Scalar("float32")
......@@ -236,7 +236,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
res_ub[count * repeate_times_max * vectorfp32_size],
res_ub[count * repeate_times_max * vectorfp32_size],
matrix_max_scalar, repeate_num, 1, 1, 8, 8)
tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile,
(core_m * loop_m_num + cc_m) * mo_tile, 0, 0],
res_ub, 0, no_tile,
......
......@@ -18,13 +18,15 @@ limitations under the License.
matmul
"""
from __future__ import absolute_import
import te.lang.cce
import te.platform.cce_params as cce
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tvm
from topi import generic
from topi.cce import util
from impl.matmul_vector import matmul_vector_cce
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
# General limitation of the size for input shape: 2**31
SHAPE_SIZE_LIMIT = 2147483648
NoneType = type(None)
......@@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
.compute_cost(10) \
.kernel_name("CusMatMulCube") \
.partial_flag(True) \
.attr("transpose_a", "required", "bool", "all")\
.attr("transpose_b", "required", "bool", "all")\
.attr("transpose_a", "required", "bool", "all") \
.attr("transpose_b", "required", "bool", "all") \
.input(0, "x1", False, "required", "all") \
.input(1, "x2", False, "required", "all") \
.input(2, "x3", False, "optional", "all") \
......@@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \
.dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \
.get_op_info()
# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals,
def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
"""
......@@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if m_shape != 1:
if n_shape == 1:
if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0:
if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
raise RuntimeError("input shape K1 should be multiple of %d"
% (cce.BLOCK_IN*cce.BLOCK_IN))
elif km_shape%k_block_size != 0:
% (cce.BLOCK_IN * cce.BLOCK_IN))
elif km_shape % k_block_size != 0:
raise RuntimeError(
"input shape K1 should be multiple of %d" % cce.BLOCK_IN)
else:
if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0:
if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0:
raise RuntimeError("input shape K1 should be multiple of %d"
% (cce.BLOCK_IN*cce.BLOCK_IN))
% (cce.BLOCK_IN * cce.BLOCK_IN))
if n_shape % cce.BLOCK_IN != 0 and n_shape != 1:
raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN)
......@@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
if len(shape_bias):
if len(shape_bias) == 1:
if is_gevm or is_gemv:
if shape_bias[0] != m_shape*n_shape:
if shape_bias[0] != m_shape * n_shape:
raise RuntimeError("broadcast case shape bias for gemv must be equal m*n")
else:
if shape_bias[0] != n_shape:
......@@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
else:
raise RuntimeError("unsupport input shape now for batch bias case")
def _get_bias(shape_bias):
bias_length = shape_bias[0]
if bias_length % 16 ==0:
if bias_length % 16 == 0:
return shape_bias
else:
bias_length = (bias_length // 16)*16 + 16
bias_length = (bias_length // 16) * 16 + 16
shape_bias = []
shape_bias.append(bias_length)
return shape_bias
def _get_input_shape(shape_x):
dim_a = shape_x[0]
dim_b = shape_x[1]
res = []
if dim_a % 16 !=0:
dim_a = (dim_a // 16)*16 + 16
if dim_a % 16 != 0:
dim_a = (dim_a // 16) * 16 + 16
res.append(dim_a)
else:
res.append(dim_a)
if dim_b % 16 !=0:
dim_b = (dim_b // 16)*16 + 16
if dim_b % 16 != 0:
dim_b = (dim_b // 16) * 16 + 16
res.append(dim_b)
else:
res.append(dim_b)
return res
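Both helpers above implement the same rounding: a dimension that is not a multiple of 16 (the cube block size used throughout these kernels) is padded up to the next multiple. A minimal standalone restatement of that arithmetic, with a hypothetical helper name and example values:

def _round_up_16(dim):
    # Same arithmetic as _get_bias / _get_input_shape above.
    return dim if dim % 16 == 0 else (dim // 16) * 16 + 16

assert _round_up_16(32) == 32   # already aligned -> unchanged
assert _round_up_16(33) == 48   # padded up to the next 16-element boundary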
def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
shape_a = input_x1.get("shape")
shape_b = input_x2.get("shape")
......@@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
if bias is not None and bool(bias):
shape_bias = bias.get("shape")
try:
trans_a_f = bool(1-trans_a)
trans_a_f = bool(1 - trans_a)
if src_dtype == "float32" or src_dtype == "int32":
if len(shape_a) != 2 and len(shape_b) != 2:
return False
......@@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
return False
elif shape_a[1] != shape_b[0]:
return False
if trans_a_f and trans_b and shape_b[1] == 1:
return False
if src_dtype == "float16":
if len(shape_a) != 2 and len(shape_b) != 2:
return False
......@@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t
else:
m_shape = shape_a[0]
k_shape = shape_a[1]
if trans_b:
n_shape = shape_b[0]
k_b_shape = shape_b[1]
else:
n_shape = shape_b[1]
k_b_shape = shape_b[0]
if k_shape != k_b_shape:
return False
if m_shape == 1 or n_shape == 1:
if k_shape % 256 != 0:
return False
except RuntimeError as e:
return False
return True
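As a usage illustration only (the framework, not the user, normally builds these arguments), check_supported consumes the same dict-style tensor descriptors that the implementations read via .get("shape") and .get("dtype"); the shapes below are hypothetical:

x1 = {"shape": (256, 256), "dtype": "float16", "ori_shape": (256, 256), "format": "ND"}
x2 = {"shape": (256, 256), "dtype": "float16", "ori_shape": (256, 256), "format": "ND"}
# A 256x256 float16 case with matching inner dimensions passes every check above.
print(check_supported(x1, x2, bias=None, output_y={"shape": (256, 256), "dtype": "float32"}))   # True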
# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements
@op_info_register(matmul_cube_op_info)
def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"):
......@@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
"""
shape_a = input_x1.get("ori_shape")
shape_b = input_x2.get("ori_shape")
if shape_a is not None:
if len(shape_a) < 2:
shape_a = input_x1.get("shape")
if shape_b is not None:
if len(shape_b) < 2:
shape_b = input_x2.get("shape")
shape_a = list(shape_a)
shape_b = list(shape_b)
if input_x1.get("format") == "FRACTAL_NZ":
shape_a = _get_input_shape(shape_a)
shape_b = _get_input_shape(shape_b)
......@@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
util.check_shape_rule(shape_b)
util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
if input_x1.get("format") == "FRACTAL_NZ":
shape_a = [shape_a[1], shape_a[0]]
trans_a = bool(1-trans_a)
trans_a = bool(1 - trans_a)
if input_x2.get("format") == "FRACTAL_NZ":
shape_b = [shape_b[1], shape_b[0]]
trans_b = bool(1-trans_b)
trans_b = bool(1 - trans_b)
shape_bias = ()
if bias is not None and bool(bias):
shape_bias = bias.get("shape")
shape_bias = list(shape_bias)
shape_bias = _get_bias(shape_bias)
src_dtype = input_x1.get("dtype").lower()
dst_dtype = output_y.get("dtype").lower()
if src_dtype == "float32" or src_dtype == "int32":
......@@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
else:
shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
if trans_b:
shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
else:
shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
if input_x1.get("format") == "FORMAT_FRACTAL_Z":
shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3])
format_a = "fractal"
......@@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
else:
shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1])
format_a = "ND"
if input_x2.get("format") == "FORMAT_FRACTAL_Z":
shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
format_b = "fractal"
......@@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
else:
shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1])
format_b = "ND"
tensor_bias = None
tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a',
dtype=src_dtype)
tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b',
dtype=src_dtype)
if len(shape_bias) > 0:
tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias',
dtype=dst_dtype)
result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
with tvm.target.cce():
schedule = generic.auto_schedule(result)
tensor_list = [tensor_a, tensor_b, result]
if len(shape_bias) > 0:
tensor_list = [tensor_a, tensor_b, tensor_bias, result]
config = {"print_ir": False,
"name": kernel_name,
"tensor_list": tensor_list}
te.lang.cce.cce_build_code(schedule, config)
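One detail of the body above worth spelling out: when an input arrives in FRACTAL_NZ layout, CusMatMulCube swaps the two outer dimensions and flips the corresponding transpose flag before the GEMM is built. A minimal plain-Python sketch of that adjustment, using a hypothetical helper name and hypothetical shapes:

def adjust_for_fractal_nz(shape, trans):
    # Hypothetical helper; same adjustment as the FRACTAL_NZ branches in
    # CusMatMulCube above: swap the two outer dims and flip the transpose flag.
    return [shape[1], shape[0]], bool(1 - trans)

print(adjust_for_fractal_nz([64, 32], False))   # ([32, 64], True)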
......@@ -13,24 +13,25 @@
# limitations under the License.
# ============================================================================
"""CusMatrixCombine"""
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
from te import tik
from topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("matrixcombine.so") \
.compute_cost(10) \
.kernel_name("CusMatrixCombine") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
.fusion_type("OPAQUE") \
.async_flag(False) \
.binfile_name("matrixcombine.so") \
.compute_cost(10) \
.kernel_name("CusMatrixCombine") \
.partial_flag(True) \
.input(0, "x1", False, "required", "all") \
.output(0, "y", False, "required", "all") \
.dtype_format(DataType.F32_Default, DataType.F32_Default) \
.get_op_info()
@op_info_register(cus_matrix_combine_op_info)
def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
input_x_shape = input_x.get("shape")
output_shape = output.get("shape")
split_dim = 128
......@@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
blocks = 32
matrix_dim = input_x_shape[0] * input_x_shape[1]
if input_x_shape[0] == 1 and input_x_shape[1] == 64 :
if input_x_shape[0] == 1 and input_x_shape[1] == 64:
tiling_dim = 2
bs = 1
with tik_instance.for_range(0,blocks,block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf)
with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
scope=tik.scope_ubuf)
tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0)
tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0)
else:
tiling_dim = 4
bs = input_x_shape[0]
with tik_instance.for_range(0,blocks,block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf)
with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
scope=tik.scope_ubuf)
zero = tik_instance.Scalar("float32")
zero.set_as(0.0)
with tik_instance.for_range(0, bs) as i:
......@@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"):
tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8)
tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8)
with tik_instance.for_range(0, tiling_dim) as j:
tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0)
tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, tiling_dim * matrix_dim *4 // 32, 0, 0)
tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0,
1, 16, 0, 0)
tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1,
tiling_dim * matrix_dim * 4 // 32, 0, 0)
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
return tik_instance
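A note on the burst lengths above: the final data_move computes its length as tiling_dim * matrix_dim * 4 // 32, i.e. the fp32 byte count divided into the 32-byte blocks that data_move transfers, and the hard-coded 16 in the first branch is the same formula evaluated for that branch's tile. A quick check with the first branch's values, assuming only what the code above shows:

tiling_dim, matrix_dim = 2, 64   # the input_x_shape[0] == 1 and input_x_shape[1] == 64 branch
burst = tiling_dim * matrix_dim * 4 // 32   # 4 bytes per float32, 32 bytes per block
print(burst)   # 16 -- matches the literal burst length in that branch's data_move calls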