Unverified commit 8fb829ba, authored by kangguangli, committed by GitHub

Remove fluid matmul (#47988)

* remove layers.matmul in nets.py

* remove layers.matmul in rnn_impl/test_quantization_pass/auto_parallel_gpt_model/test_auto_parallel_completion_gpt

* remove layers.matmul in other files

* fix

* fix

* remove layers.matmul itself

* remove ref in CMakeLists.txt and tools directory

* remove matmul in fluid.layers.nn.py

* remove matmul in fluid.dygraph.rnn.py && restore test_matmul_op.py

* replace matmul in fluid.dygraph.rnn.py && clean api_test in test_matmul_op.py

* fix error && restore empty test_auto_search_dist_matmul_op.py

* fix check in test_auto_parallel_partitioner.py

* fix test_dist_matmul && test_flags_mkldnn_ops_on_off

* fix test_fused_attention_op_xpu.py && test_matmul_op_xpu.py

* remove test_auto_search_dist_matmul_op.py

* remove layers.matmul in auto_parallel_gpt_model.py && fix doc in fluid/io.py

* fix for matmul_grad

* fix codestyle

* fix codestyle

* resolve conflicts error

* restore unit test file but not compiled it for later remove

* fix codestyle

* fix wrong unittest skip

* fix unittest delete

* fix scale cost

* fix scale cost

* resolve conflicts error

* resolve conflicts error
Co-authored-by: Njakpiase <jakpia21@gmail.com>
Parent 1976cc4b
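The whole change applies one migration pattern: fluid.layers.matmul(x, y, transpose_x, transpose_y, alpha) becomes paddle.matmul(x, y, transpose_x, transpose_y), and because paddle.matmul has no alpha argument the scaling is emitted as a separate op. A minimal sketch of that pattern; the shapes and the alpha value below are illustrative, not taken from this diff:

import paddle

# Illustrative tensors; any shapes valid for matmul behave the same way.
x = paddle.rand([2, 4, 8])
y = paddle.rand([2, 8, 16])
alpha = 0.125

# Before this PR:
#   out = fluid.layers.matmul(x, y, transpose_x=False, transpose_y=False, alpha=alpha)
out = paddle.matmul(x, y, transpose_x=False, transpose_y=False)
out = paddle.scale(out, scale=alpha)  # alpha is now an explicit scale op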
...@@ -101,9 +101,13 @@ void MatmulGradKernel(const Context &dev_ctx,
   if (x_dims.size() != ndims) {
     x_dims = ExtendDimsWithOnes(x_dims, ndims);
-  } else if (y_dims.size() != ndims) {
+  }
+  if (y_dims.size() != ndims) {
     y_dims = ExtendDimsWithOnes(y_dims, ndims);
   }
+  if (dout_dims.size() != ndims) {
+    dout_dims = ExtendDimsWithOnes(dout_dims, ndims);
+  }
   // in broadcasting scenario new memory is required because
   // reduce sum must be calculated upon broadcasted dims
...@@ -150,7 +154,9 @@ void MatmulGradKernel(const Context &dev_ctx,
   }
   dx->Resize(x.dims());
+  dx->set_mem_desc(x.mem_desc().reshape(vectorize(x.dims())));
   dy->Resize(y.dims());
+  dy->set_mem_desc(y.mem_desc().reshape(vectorize(y.dims())));
 }
 template <typename T, typename Context>
......
...@@ -151,7 +151,7 @@ class BasicGRUUnit(Layer):
     def forward(self, input, pre_hidden):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
+        gate_input = paddle.matmul(x=concat_input_hidden, y=self._gate_weight)
         gate_input = paddle.add(gate_input, self._gate_bias)
...@@ -160,7 +160,7 @@ class BasicGRUUnit(Layer):
         r_hidden = r * pre_hidden
-        candidate = layers.matmul(
+        candidate = paddle.matmul(
             layers.concat([input, r_hidden], 1), self._candidate_weight
         )
         candidate = paddle.add(candidate, self._candidate_bias)
...@@ -874,7 +874,7 @@ class BasicLSTMUnit(Layer):
     def forward(self, input, pre_hidden, pre_cell):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+        gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
         gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
......
...@@ -76,7 +76,7 @@ def residual_block(num, quant_skip_pattern=None):
     matmul_weight = paddle.create_parameter(
         shape=[1, 16, 32, 32], dtype='float32'
     )
-    hidden = fluid.layers.matmul(hidden, matmul_weight, True, True)
+    hidden = paddle.matmul(hidden, matmul_weight, True, True)
     if quant_skip_pattern:
         with fluid.name_scope(quant_skip_pattern):
             pool = fluid.layers.pool2d(
...@@ -724,7 +724,7 @@ def quant_dequant_residual_block(num, quant_skip_pattern=None):
     conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
     short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
     hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short))
-    hidden = fluid.layers.matmul(hidden, data2, True, True)
+    hidden = paddle.matmul(hidden, data2, True, True)
    if isinstance(quant_skip_pattern, str):
        with fluid.name_scope(quant_skip_pattern):
            pool1 = fluid.layers.pool2d(
......
...@@ -17,11 +17,11 @@ from . import Layer
 from ..layers import (
     concat,
     fill_constant,
-    matmul,
     elementwise_mul,
     split,
 )
 import copy
+import paddle
 __all__ = ['LSTMCell', 'GRUCell']
...@@ -215,11 +215,12 @@ class LSTMCell(Layer):
     def forward(self, input, pre_hidden, pre_cell):
         if self._use_cudnn_impl:
-            igates = matmul(input, y=self._weight_ih, transpose_y=True)
+            igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
             igates = paddle.add(igates, self._bias_ih)
-            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
+            hgates = paddle.matmul(
+                pre_hidden, self._weight_hh, transpose_y=True
+            )
             hgates = paddle.add(hgates, self._bias_hh)
             chunked_igates = split(igates, num_or_sections=4, dim=1)
             chunked_hgates = split(hgates, num_or_sections=4, dim=1)
...@@ -241,7 +242,7 @@ class LSTMCell(Layer):
         else:
             concat_input_hidden = concat([input, pre_hidden], 1)
-            gate_input = matmul(x=concat_input_hidden, y=self._weight)
+            gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
             gate_input = paddle.add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
...@@ -461,10 +462,11 @@ class GRUCell(Layer):
     def forward(self, input, pre_hidden):
         if self._use_cudnn_impl:
-            igates = matmul(input, y=self._weight_ih, transpose_y=True)
+            igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
             igates = paddle.add(igates, self._bias_ih)
-            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
+            hgates = paddle.matmul(
+                pre_hidden, self._weight_hh, transpose_y=True
+            )
             hgates = paddle.add(hgates, self._bias_hh)
             chunked_igates = split(igates, num_or_sections=3, dim=1)
...@@ -486,7 +488,9 @@ class GRUCell(Layer):
             concat_input_hidden = concat([input, pre_hidden], 1)
-            gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)
+            gate_input = paddle.matmul(
+                x=concat_input_hidden, y=self._gate_weight
+            )
             gate_input = paddle.add(gate_input, self._gate_bias)
             gate_input = self._gate_activation(gate_input)
...@@ -494,7 +498,7 @@ class GRUCell(Layer):
             r_hidden = r * pre_hidden
-            candidate = matmul(
+            candidate = paddle.matmul(
                 concat([input, r_hidden], 1), self._candidate_weight
             )
             candidate = paddle.add(candidate, self._candidate_bias)
......
...@@ -73,7 +73,6 @@ __all__ = [
     'dropout',
     'split',
     'l2_normalize',
-    'matmul',
     'row_conv',
     'layer_norm',
     'spectral_norm',
...@@ -2589,154 +2588,6 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     return out
@deprecated(since="2.0.0", update_to="paddle.matmul")
def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
"""
Applies matrix multiplication to two tensors.
Currently, the input tensors' rank can be any, but when the rank of any
inputs is bigger than 3, this two inputs' rank should be equal.
The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
- If a transpose flag is specified, the last two dimensions of the tensor
are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
:math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
:math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
:math:`[1, D]` in transposed form.
- After transpose, the two tensors are 2-D or n-D and matrix multiplication
performs in the following way.
- If both are 2-D, they are multiplied like conventional matrices.
- If either is n-D, it is treated as a stack of matrices residing in the
last two dimensions and a batched matrix multiply supporting broadcast
applies on the two tensors.
Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
nontransposed, the prepended or appended dimension :math:`1` will be
removed after matrix multiplication.
Args:
x (Variable): The input variable which is a Tensor or LoDTensor.
y (Variable): The input variable which is a Tensor or LoDTensor.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
alpha (float): The scale of output. Default 1.0.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The product Tensor (or LoDTensor) variable.
Examples:
.. code-block:: python
# Examples to clarify shapes of the inputs and output
# x: [B, ..., M, K], y: [B, ..., K, N]
# fluid.layers.matmul(x, y) # out: [B, ..., M, N]
# x: [B, M, K], y: [B, K, N]
# fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K, N]
# fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [M, K], y: [K, N]
# fluid.layers.matmul(x, y) # out: [M, N]
# x: [B, M, K], y: [K]
# fluid.layers.matmul(x, y) # out: [B, M]
# x: [K], y: [K]
# fluid.layers.matmul(x, y) # out: [1]
# x: [M], y: [N]
# fluid.layers.matmul(x, y, True, True) # out: [M, N]
import paddle
import paddle.fluid as fluid
paddle.enable_static()
x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32')
y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32')
out = fluid.layers.matmul(x, y, True, True)
"""
if _non_static_mode():
out = _varbase_creator(dtype=x.dtype)
_legacy_C_ops.matmul(
x,
y,
out,
'transpose_X',
transpose_x,
'transpose_Y',
transpose_y,
'alpha',
float(alpha),
)
return out
def __check_input(x, y):
var_names = {'x': x, 'y': y}
for name, val in var_names.items():
check_variable_and_dtype(
val, name, ['float16', 'float32', 'float64'], 'matmul'
)
x_shape = list(x.shape)
y_shape = list(y.shape)
if len(x_shape) == 1:
x_shape = [1] + x_shape
if len(y_shape) == 1:
y_shape = y_shape + [1]
# check the inner 2 dimensions
if transpose_x:
x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
if transpose_y:
y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
if x_shape[-1] != y_shape[-2]:
assert (x_shape[-1] == -1) or (y_shape[-2] == -1), (
"After performing an optional transpose, Input X's width should be "
"equal to Y's width for multiplication "
"prerequisites. But received X's shape: %s, Y's shape: %s\n"
% (x_shape, y_shape)
)
if len(y_shape) > 2 and len(x_shape) > 2:
for i, dim_x in enumerate(x_shape[:-2]):
# don't check neg shape
if dim_x < 0 or y_shape[i] < 0:
continue
if dim_x != y_shape[i]:
raise ValueError(
"When the matrix is larger than 2 dimensions, the higher "
"dimensional values of the two matrices need to be equal. "
"But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
"Y's shape: %s.\n" % (i, i, x_shape, y_shape)
)
attrs = {
'transpose_X': transpose_x,
'transpose_Y': transpose_y,
'alpha': float(alpha),
}
__check_input(x, y)
helper = LayerHelper('matmul', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='matmul',
inputs={'X': x, 'Y': y},
outputs={'Out': out},
attrs=attrs,
)
return out
 @templatedoc()
 def row_conv(input, future_context_size, param_attr=None, act=None):
     """
......
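The docstring removed above documents the shape rules (rank-1 promotion, batched broadcasting on the leading dimensions). paddle.matmul keeps the same broadcasting behavior, so callers only lose the alpha argument. A small sketch under that assumption, with arbitrary example shapes:

import paddle

x = paddle.rand([2, 3, 5])   # [B, M, K]
y = paddle.rand([5, 4])      # [K, N]
out = paddle.matmul(x, y)    # broadcasts to [B, M, N] -> [2, 3, 4]

vx = paddle.rand([5])
vy = paddle.rand([5])
dot = paddle.matmul(vx, vy)  # vector-vector dot product (scalar-like result)
print(out.shape, dot.shape)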
...@@ -621,7 +621,7 @@ def scaled_dot_product_attention(
     key_dim_per_head = keys.shape[-1] // num_heads
     scaled_q = paddle.scale(x=q, scale=key_dim_per_head**-0.5)
-    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+    product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
     x = paddle.reshape(x=product, shape=[-1, product.shape[-1]])
     x = paddle.nn.functional.softmax(x)
...@@ -631,5 +631,5 @@ def scaled_dot_product_attention(
     weights = layers.dropout(
         weights, dropout_prob=dropout_rate, is_test=False
     )
-    ctx_multiheads = layers.matmul(weights, v)
+    ctx_multiheads = paddle.matmul(weights, v)
     return __combine_heads(ctx_multiheads)
...@@ -84,9 +84,7 @@ def matmul_dp2mp2(init_x, init_y, trans_x, trans_y):
     y = init_y(trans_y)
     x.stop_gradient = False
     y.stop_gradient = False
-    out = paddle.fluid.layers.matmul(
-        x, y, transpose_x=trans_x, transpose_y=trans_y
-    )
+    out = paddle.matmul(x, y, transpose_x=trans_x, transpose_y=trans_y)
     loss = paddle.mean(out)
     return main_program, start_program, loss
...@@ -134,22 +132,22 @@ class TestDistMatmul(unittest.TestCase):
         # [0, -1] * [-1, 1] --> [0, 1]
         ref_ops = [
             "c_identity",
-            "matmul",
+            "matmul_v2",
             "reduce_mean",
             "fill_constant",
             "reduce_mean_grad",
-            "matmul_grad",
+            "matmul_v2_grad",
         ]
         ops = []
         block = main_program.global_block()
         for op in block.ops:
             ops.append(op.type)
-            if op.type == "matmul":
+            if op.type == "matmul_v2":
                 out_name = op.output('Out')[0]
                 out_var = block.vars[out_name]
                 op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
                 assert op_dist_attr.impl_idx == 0
-                assert op_dist_attr.impl_type == "matmul"
+                assert op_dist_attr.impl_type == "matmul_v2"
                 out_dims_mapping = op_dist_attr.get_output_dims_mapping(
                     out_name
                 )
...@@ -158,33 +156,33 @@ class TestDistMatmul(unittest.TestCase):
                     out_var
                 )
                 assert tensor_dist_attr.dims_mapping == [0, 1]
-            if op.type == "matmul_grad":
+            if op.type == "matmul_v2_grad":
                 op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
                 assert op_dist_attr.impl_idx == 0
-                assert op_dist_attr.impl_type == "matmul"
+                assert op_dist_attr.impl_type == "matmul_v2"
         assert ops == ref_ops
     def check_row_program(self, main_program, dist_ctx):
         # [0, -1, 1] * [1, -1] --> [0, -1, -1]
         ref_ops = [
-            "matmul",
+            "matmul_v2",
             "c_allreduce_sum",
             "reduce_mean",
             "fill_constant",
             "reduce_mean_grad",
-            "matmul_grad",
+            "matmul_v2_grad",
         ]
         ops = []
         block = main_program.global_block()
         for op in block.ops:
             ops.append(op.type)
-            if op.type == "matmul":
+            if op.type == "matmul_v2":
                 out_name = op.output('Out')[0]
                 out_var = block.vars[out_name]
                 op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
                 assert op_dist_attr.impl_idx == 1
-                assert op_dist_attr.impl_type == "matmul"
+                assert op_dist_attr.impl_type == "matmul_v2"
                 out_dims_mapping = op_dist_attr.get_output_dims_mapping(
                     out_name
                 )
...@@ -193,10 +191,10 @@ class TestDistMatmul(unittest.TestCase):
                     out_var
                 )
                 assert tensor_dist_attr.dims_mapping == [0, -1, -1]
-            if op.type == "matmul_grad":
+            if op.type == "matmul_v2_grad":
                 op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
                 assert op_dist_attr.impl_idx == 1
-                assert op_dist_attr.impl_type == "matmul"
+                assert op_dist_attr.impl_type == "matmul_v2"
         assert ops == ref_ops
......
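The assertions above rely on paddle.matmul lowering to the matmul_v2 operator in static graphs, which is why the reference op lists change. A standalone sketch (program and variable names here are made up) that prints the op list the same way the test inspects it:

import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name="x", shape=[4, 8], dtype="float32")
    y = paddle.static.data(name="y", shape=[8, 2], dtype="float32")
    out = paddle.matmul(x, y)

print([op.type for op in main.global_block().ops])
# expected to list 'matmul_v2' rather than the legacy 'matmul' op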
...@@ -168,9 +168,7 @@ class TestDistOpCost(unittest.TestCase):
                 auto.ProcessMesh([0, 1], dim_names=["x"]),
                 [None, "x"],
             )
-            out1 = paddle.fluid.layers.matmul(
-                out, param1
-            )  # [8, 8] [-1, -1]
+            out1 = paddle.matmul(out, param1)  # [8, 8] [-1, -1]
             tmp_param = paddle.create_parameter(
                 [8, 8], paddle.float32
             )  # [8, 8] [-1, -1]
...@@ -179,10 +177,8 @@ class TestDistOpCost(unittest.TestCase):
                 auto.ProcessMesh([0, 1], dim_names=["x"]),
                 [None, None],
             )
-            tmp_out = paddle.fluid.layers.matmul(out1, tmp_param)
-            out2 = paddle.fluid.layers.matmul(
-                tmp_out, param2
-            )  # [8, 4] [-1, 0]
+            tmp_out = paddle.matmul(out1, tmp_param)
+            out2 = paddle.matmul(tmp_out, param2)  # [8, 4] [-1, 0]
             out8 = paddle.transpose(out2, [1, 0])  # [4, 8] [0, -1]
......
...@@ -231,8 +231,10 @@ class MultiHeadAttention(nn.Layer):
             return self.Cache(key, value)
     def core_attn(self, q, k, v, attn_mask):
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        product = paddle.multiply(
+            product,
+            paddle.to_tensor(self.head_dim**-0.5, dtype=product.dtype),
         )
         if attn_mask is not None:
             product = product + attn_mask
......
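The hunk above applies the former alpha through paddle.multiply with an explicit scalar tensor, while most other call sites in this diff use paddle.scale; for a scalar factor the two are interchangeable. A small check sketch, where head_dim and the tensor shapes are assumptions rather than values from the model:

import paddle

head_dim = 64
q = paddle.rand([2, 8, 16, head_dim])
k = paddle.rand([2, 8, 16, head_dim])

product = paddle.matmul(x=q, y=k, transpose_y=True)
a = paddle.scale(product, scale=head_dim**-0.5)
b = paddle.multiply(
    product, paddle.to_tensor(head_dim**-0.5, dtype=product.dtype)
)
print(paddle.allclose(a, b))  # expected: True (same scalar scaling)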
...@@ -20,7 +20,6 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
 import paddle.distributed.fleet as fleet
-import paddle.fluid as fluid
 import paddle.nn as nn
 from paddle.distributed.fleet.meta_parallel import PipelineLayer
 from paddle.fluid.dygraph.layers import Layer
...@@ -54,7 +53,7 @@ class SimpleNet(Layer):
     def forward(self, x1, x2, y1):
         x_emb = self.word_embeddings(x1)
-        fc = fluid.layers.matmul(x_emb, self.softmax_weight)
+        fc = paddle.matmul(x_emb, self.softmax_weight)
         fc = paddle.add(fc, self.softmax_bias)
         projection = paddle.reshape(fc, shape=[-1, vocab_size])
         loss = paddle.nn.functional.softmax_with_cross_entropy(
...@@ -83,7 +82,7 @@ class MatmulNet(Layer):
     def forward(self, args):
         x1, x2 = args
-        fc = fluid.layers.matmul(x1, self.softmax_weight)
+        fc = paddle.matmul(x1, self.softmax_weight)
         return fc, x2
......
...@@ -24,7 +24,6 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 from paddle import framework
 from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
-from paddle.fluid import layers
 from paddle.fluid.dygraph.layers import Layer
...@@ -73,13 +72,12 @@ class TransformerNet(Layer):
         q = self.q_proj(x)
         k = self.k_proj(x)
         v = self.v_proj(x)
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=d_model**-0.5
-        )
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        product = paddle.scale(product, scale=d_model**-0.5)
         weights = F.softmax(product)
         weights = F.dropout(weights, 0.2)
-        tgt = layers.matmul(weights, v)
+        tgt = paddle.matmul(weights, v)
         residual = tgt
         tgt = self.norm1(tgt)
         tgt = residual + tgt
......
...@@ -23,7 +23,6 @@ import paddle.distributed.fleet as fleet
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
-from paddle.fluid import layers
 from paddle.fluid.dygraph.layers import Layer
...@@ -82,14 +81,13 @@ class TransformerNet(Layer):
         q = self.q_proj(x)
         k = self.k_proj(x)
         v = self.v_proj(x)
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=d_model**-0.5
-        )
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        product = paddle.scale(product, scale=d_model**-0.5)
         weights = F.softmax(product + mask)
         # TODO(shenliang03) For save/load in PipeLineParallel, can’t support dropout temporarily.
         # weights = F.dropout(weights, 0.2)
-        tgt = layers.matmul(weights, v)
+        tgt = paddle.matmul(weights, v)
         residual = tgt
         tgt = self.norm1(tgt)
         tgt = residual + tgt
......
...@@ -23,7 +23,6 @@ import paddle.distributed.fleet as fleet
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
-from paddle.fluid import layers
 from paddle.fluid.dygraph.layers import Layer
...@@ -83,12 +82,11 @@ class TransformerNet(Layer):
         q = self.q_proj(x)
         k = self.k_proj(x)
         v = self.v_proj(x)
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=d_model**-0.5
-        )
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        product = paddle.scale(product, scale=d_model**-0.5)
         weights = F.softmax(product + mask)
-        tgt = layers.matmul(weights, v)
+        tgt = paddle.matmul(weights, v)
         residual = tgt
         tgt = self.norm1(tgt)
         tgt = residual + tgt
......
...@@ -20,7 +20,6 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
 import paddle.distributed.fleet as fleet
-import paddle.fluid as fluid
 import paddle.nn as nn
 from paddle.distributed.fleet.meta_parallel import (
     LayerDesc,
...@@ -61,7 +60,7 @@ class SimpleNet(Layer):
     def forward(self, x1, x2, y1):
         x_emb = self.word_embeddings(x1)
-        fc = fluid.layers.matmul(x_emb, self.softmax_weight)
+        fc = paddle.matmul(x_emb, self.softmax_weight)
         fc = paddle.add(fc, self.softmax_bias)
         projection = paddle.reshape(fc, shape=[-1, vocab_size])
...@@ -97,7 +96,7 @@ class MatmulNet(Layer):
     def forward(self, args):
         x1, x2 = args
-        fc = fluid.layers.matmul(x1, self.softmax_weight)
+        fc = paddle.matmul(x1, self.softmax_weight)
         return fc, x2
......
...@@ -334,12 +334,12 @@ class MultiHeadAttentionLayer(Layer):
         transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
         # scale dot product attention
-        product = fluid.layers.matmul(
+        product = paddle.matmul(
             x=transpose_q,
             y=transpose_k,
             transpose_y=True,
-            alpha=self._d_model**-0.5,
         )
+        product = paddle.scale(product, scale=self._d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = paddle.nn.functional.softmax(product)
...@@ -350,9 +350,9 @@ class MultiHeadAttentionLayer(Layer):
                 seed=ModelHyperParams.dropout_seed,
                 is_test=False,
             )
-            out = fluid.layers.matmul(weights_droped, transpose_v)
+            out = paddle.matmul(weights_droped, transpose_v)
         else:
-            out = fluid.layers.matmul(weights, transpose_v)
+            out = paddle.matmul(weights, transpose_v)
         # combine heads
         if len(out.shape) != 4:
...@@ -839,7 +839,7 @@ class WrapDecoderLayer(Layer):
             )
         if self._weight_sharing:
-            predict = fluid.layers.matmul(
+            predict = paddle.matmul(
                 x=dec_output_reshape,
                 y=self._prepare_decoder_layer._input_emb.weight,
                 transpose_y=True,
......
...@@ -1174,7 +1174,7 @@ def multi_head_attention(
         Scaled Dot-Product Attention
         """
         scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
-        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
         if attn_bias:
             product += attn_bias
         weights = paddle.nn.functional.softmax(product)
...@@ -1185,7 +1185,7 @@ def multi_head_attention(
                 seed=ModelHyperParams.dropout_seed,
                 is_test=False,
             )
-        out = layers.matmul(weights, v)
+        out = paddle.matmul(weights, v)
         return out
     q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
...@@ -1701,7 +1701,7 @@ def wrap_decoder(
     )
     # Return logits for training and probs for inference.
     if weight_sharing:
-        predict = layers.matmul(
+        predict = paddle.matmul(
             x=dec_output,
             y=fluid.framework._get_var(word_emb_param_names[0]),
             transpose_y=True,
......
...@@ -272,7 +272,7 @@ class BertModelLayer(Layer):
         emb_out = self.pre_process_layer(emb_out)
-        self_attn_mask = fluid.layers.matmul(
+        self_attn_mask = paddle.matmul(
             x=input_mask, y=input_mask, transpose_y=True
         )
         self_attn_mask = paddle.scale(
...@@ -401,7 +401,7 @@ class PretrainModelLayer(Layer):
         mask_trans_feat = self.pre_process_layer(mask_trans_feat)
         if self._weight_sharing:
-            fc_out = fluid.layers.matmul(
+            fc_out = paddle.matmul(
                 x=mask_trans_feat,
                 y=self.bert_layer._src_emb._w,
                 transpose_y=True,
......
...@@ -70,7 +70,7 @@ class BasicLSTMUnit(Layer):
     def forward(self, input, pre_hidden, pre_cell):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+        gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
         gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
...@@ -697,14 +697,14 @@ class AttentionModel(fluid.dygraph.Layer):
     def attention(self, query, enc_output, mask=None):
         query = fluid.layers.unsqueeze(query, [1])
         memory = self.attn_fc(enc_output)
-        attn = fluid.layers.matmul(query, memory, transpose_y=True)
+        attn = paddle.matmul(query, memory, transpose_y=True)
         if mask is not None:
             attn = paddle.transpose(attn, [1, 0, 2])
             attn = paddle.add(attn, mask * 1000000000)
             attn = paddle.transpose(attn, [1, 0, 2])
         weight = paddle.nn.functional.softmax(attn)
-        weight_memory = fluid.layers.matmul(weight, memory)
+        weight_memory = paddle.matmul(weight, memory)
         return weight_memory
......
...@@ -282,7 +282,7 @@ class BMN(fluid.dygraph.Layer):
         # PEM
         xp = paddle.nn.functional.relu(self.p_conv1(x))
         # BM layer
-        xp = fluid.layers.matmul(xp, self.sample_mask)
+        xp = paddle.matmul(xp, self.sample_mask)
         xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])
         xp = self.p_conv3d1(xp)
......
...@@ -66,9 +66,9 @@ class SubNetWithDict(fluid.dygraph.Layer):
             v = 0.2 * cache_v + v
             cache["k"], cache["v"] = k, v
-        weight = fluid.layers.matmul(x=q, y=k, transpose_y=True)
+        weight = paddle.matmul(x=q, y=k, transpose_y=True)
         weight = paddle.nn.functional.softmax(weight)
-        out = fluid.layers.matmul(weight, v)
+        out = paddle.matmul(weight, v)
         return out
......
...@@ -42,7 +42,7 @@ np.random.seed(0)
 def simple_func(x, weight_numpy):
     x = fluid.dygraph.to_variable(x)
     w = fluid.dygraph.to_variable(weight_numpy)
-    y = fluid.layers.matmul(x, w)
+    y = paddle.matmul(x, w)
     z = paddle.mean(y)
     return z
...@@ -51,7 +51,7 @@ def simple_func(x, weight_numpy):
 def decorated_simple_func(x, weight_numpy):
     x = fluid.dygraph.to_variable(x)
     w = fluid.dygraph.to_variable(weight_numpy)
-    y = fluid.layers.matmul(x, w)
+    y = paddle.matmul(x, w)
     z = paddle.mean(y)
     return z
......
...@@ -94,7 +94,7 @@ class SimpleLSTMRNN(fluid.Layer):
             bias = self.bias_arr[k]
             nn = fluid.layers.concat([step_input, pre_hidden], 1)
-            gate_input = fluid.layers.matmul(x=nn, y=weight_1)
+            gate_input = paddle.matmul(x=nn, y=weight_1)
             gate_input = paddle.add(gate_input, bias)
             i, j, f, o = fluid.layers.split(
...@@ -213,7 +213,7 @@ class PtbModel(fluid.Layer):
             x_emb, init_h, init_c
         )
-        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
+        projection = paddle.matmul(rnn_out, self.softmax_weight)
         projection = paddle.add(projection, self.softmax_bias)
         loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -148,16 +148,14 @@ class MultiHeadAttention(Layer):
             v = layers.concat([cache_v, v], axis=2)
             cache["k"], cache["v"] = k, v
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5
-        )
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        product = paddle.scale(product, scale=self.d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = paddle.nn.functional.softmax(product)
         if self.dropout_rate:
             weights = layers.dropout(weights, dropout_prob=self.dropout_rate)
-        out = layers.matmul(weights, v)
+        out = paddle.matmul(weights, v)
         out = paddle.transpose(out, perm=[0, 2, 1, 3])
         out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
...@@ -524,7 +522,7 @@ class WrapDecoder(Layer):
             postprocess_cmd,
         )
         if share_input_output_embed:
-            self.linear = lambda x: layers.matmul(
+            self.linear = lambda x: paddle.matmul(
                 x=x, y=self.word_embedder.word_embedder.weight, transpose_y=True
             )
         else:
......
...@@ -44,7 +44,6 @@ class TestBase(IPUOpTest):
         self.attrs = {
             "transpose_x": False,
             "transpose_y": False,
-            "alpha": 1.0,
         }
     @IPUOpTest.static_graph
...@@ -56,7 +55,7 @@ class TestBase(IPUOpTest):
             name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32'
         )
-        out = paddle.fluid.layers.matmul(x, y, **self.attrs)
+        out = paddle.matmul(x, y, **self.attrs)
         self.fetch_list = [out.name]
     def run_model(self, exec_mode):
...@@ -75,7 +74,6 @@ class TestCase1(TestBase):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
-            "alpha": 1.0,
         }
...@@ -84,7 +82,6 @@ class TestCase2(TestBase):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
-            "alpha": 3.14,
         }
     def set_atol(self):
...@@ -141,7 +138,6 @@ class TestCase6_2(TestCase6):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
-            "alpha": 1.0,
         }
...@@ -154,7 +150,10 @@ class TestCase7(TestBase):
         self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
     def set_op_attrs(self):
-        self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125}
+        self.attrs = {
+            "transpose_x": False,
+            "transpose_y": True,
+        }
 class TestCase8(TestBase):
...@@ -179,7 +178,6 @@ class TestCase8_2(TestBase):
         self.attrs = {
             "transpose_x": True,
             "transpose_y": True,
-            "alpha": 1.0,
         }
......
...@@ -67,7 +67,7 @@ class TestWeightSharing(IPUOpTest):
                 input=y, size=768, param_attr=paddle.fluid.ParamAttr(name="fc")
             )
         with paddle.static.ipu_shard_guard(index=0, stage=2):
-            out = paddle.fluid.layers.matmul(
+            out = paddle.matmul(
                 x=z,
                 y=self.main_prog.global_block().var('word_embedding'),
                 transpose_y=True,
......
...@@ -37,7 +37,7 @@ class TestMKLDNNMatmulFuseOp(InferencePassTest):
             y = fluid.data(
                 name='y', shape=[-1] + self.shape_y, dtype=self.d_type
             )
-            out = fluid.layers.matmul(x, y)
+            out = paddle.matmul(x, y)
             out = paddle.transpose(out, perm=[0, 2, 1, 3])
             out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]])
...@@ -79,7 +79,7 @@ class TestMKLDNNMatmulOpNotFusedWrongTransposeAxis(TestMKLDNNMatmulFuseOp):
             y = fluid.data(
                 name='y', shape=[-1] + self.shape_y, dtype=self.d_type
             )
-            out = fluid.layers.matmul(x, y)
+            out = paddle.matmul(x, y)
             out = paddle.transpose(out, perm=[0, 1, 2, 3])
             out = paddle.reshape(out, [0, 0, 0, 0])
             out = fluid.layers.fc(out, size=1)
...@@ -102,7 +102,7 @@ class TestMKLDNNMatmulOpNotFusedBreakPattern(TestMKLDNNMatmulFuseOp):
             y = fluid.data(
                 name='y', shape=[-1] + self.shape_y, dtype=self.d_type
             )
-            out = fluid.layers.matmul(x, y)
+            out = paddle.matmul(x, y)
             out = paddle.transpose(out, perm=[0, 2, 1, 3])
             out = paddle.transpose(out, perm=[0, 1, 2, 3])  # breaks pattern
             out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]])
......
...@@ -30,13 +30,13 @@ class TensorRTInspectorTest(InferencePassTest):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[1, 16, 16], dtype="float32")
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=data,
                 y=data,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
         self.feeds = {
......
...@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from inference_pass_test import InferencePassTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import AnalysisConfig, PassVersionChecker
...@@ -27,13 +28,13 @@ class TensorRTMatMulDims2Test(InferencePassTest):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[24, 24], dtype="float32")
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=data,
                 y=data,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
         self.feeds = {
...@@ -66,13 +67,13 @@ class TensorRTMatMulTest(InferencePassTest):
             data = fluid.data(
                 name="data", shape=[-1, 6, 24, 24], dtype="float32"
             )
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=data,
                 y=data,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
         self.feeds = {
...@@ -128,13 +129,13 @@ class TensorRTMatMulBroadcastTest(InferencePassTest):
                 name="data_x", shape=[-1, 6, 24], dtype="float32"
             )
             data_y = fluid.data(name="data_y", shape=[24, 16], dtype="float32")
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=data_x,
                 y=data_y,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
         self.feeds = {
......
...@@ -32,13 +32,13 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
                 name='data', shape=[1, 28, 28], dtype='float32'
             )
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=self.data,
                 y=self.data,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             fc_out = fluid.layers.fc(
                 input=matmul_out,
                 size=10,
...@@ -128,13 +128,13 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
             )
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14])
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=reshape_out,
                 y=reshape_out,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
             fc_out = fluid.layers.fc(
                 input=matmul_out,
...@@ -224,13 +224,13 @@ class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
                 name='data', shape=[-1, 28, 28], dtype='float32'
             )
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
-            matmul_out = fluid.layers.matmul(
+            matmul_out = paddle.matmul(
                 x=self.data,
                 y=self.data,
                 transpose_x=self.transpose_x,
                 transpose_y=self.transpose_y,
-                alpha=self.alpha,
             )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
             fc_out = fluid.layers.fc(
                 input=matmul_out,
......
...@@ -48,7 +48,7 @@ def check():
         a = fluid.dygraph.to_variable(a_np)
         b = fluid.dygraph.to_variable(b_np)
         y = paddle.add(x=a, y=b)
-        y = fluid.layers.matmul(x=y, y=b, transpose_y=True)
+        y = paddle.matmul(x=y, y=b, transpose_y=True)
         res1 = func(y)
         np_res = np.add(a_np, b_np)
......
...@@ -87,14 +87,14 @@ class TestFlagsUseMkldnn(unittest.TestCase):
         assert self.not_found(self.matmul_regex, out, err)
     def test_flags_use_mkl_dnn_off(self):
-        env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul")}
+        env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2")}
         out, err = self.flags_use_mkl_dnn_common(env)
         assert self.found(self.relu_regex, out, err)
         assert self.found(self.ew_add_regex, out, err)
         assert self.not_found(self.matmul_regex, out, err)
     def test_flags_use_mkl_dnn_off_multiple(self):
-        env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul,relu")}
+        env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2,relu")}
         out, err = self.flags_use_mkl_dnn_common(env)
         assert self.not_found(self.relu_regex, out, err)
         assert self.found(self.ew_add_regex, out, err)
...@@ -103,7 +103,7 @@ class TestFlagsUseMkldnn(unittest.TestCase):
     def test_flags_use_mkl_dnn_on_off(self):
         env = {
             str("FLAGS_tracer_mkldnn_ops_on"): str("elementwise_add"),
-            str("FLAGS_tracer_mkldnn_ops_off"): str("matmul"),
+            str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2"),
         }
         out, err = self.flags_use_mkl_dnn_common(env)
         assert self.not_found(self.relu_regex, out, err)
......
...@@ -65,7 +65,7 @@ class SimpleNet(fluid.Layer):
     def forward(self, input, label):
         x_emb = self.embedding(input)
-        fc = fluid.layers.matmul(x_emb, self.softmax_weight)
+        fc = paddle.matmul(x_emb, self.softmax_weight)
         fc = paddle.add(fc, self.softmax_bias)
         projection = paddle.reshape(fc, shape=[-1, self.vocab_size])
         loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -24,7 +24,6 @@ import paddle.utils as utils
 from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed.fleet import auto
-from paddle.fluid import layers
 paddle.enable_static()
 _global_parallel_strategy = None
...@@ -301,9 +300,8 @@ class AttentionLayer(nn.Layer):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
-        )
+        product = tensor.matmul(x=q, y=k, transpose_y=True)
+        product = tensor.scale(product, scale=self.head_dim**-0.5)
         if self.attn_mask is not None:
             product = product + self.attn_mask
...@@ -568,9 +566,8 @@ class DecoderLayer(nn.Layer):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
-        )
+        product = tensor.matmul(x=q, y=k, transpose_y=True)
+        product = tensor.scale(product, scale=self.head_dim**-0.5)
         if self.attn_mask is not None:
             product = product + self.attn_mask
......
...@@ -210,9 +210,8 @@ class MultiHeadAttention(nn.Layer):
                 query, key, value, use_cache, cache
             )
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
-        )
+        product = tensor.matmul(x=q, y=k, transpose_y=True)
+        product = tensor.scale(product, scale=self.head_dim**-0.5)
         if attn_mask is not None:
             product = product + attn_mask
......
...@@ -28,7 +28,6 @@ from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.process_group import new_process_group
 from paddle.distributed.auto_parallel.utils import _get_comm_group
 from paddle.distributed.fleet import auto
-from paddle.fluid import layers
 paddle.enable_static()
 _global_parallel_strategy = None
...@@ -695,9 +694,8 @@ class AttentionLayer(nn.Layer):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
-        )
+        product = tensor.matmul(x=q, y=k, transpose_y=True)
+        product = tensor.scale(product, scale=self.head_dim**-0.5)
         if self.attn_mask is not None:
             product = product + self.attn_mask
...@@ -868,7 +866,8 @@ class TestAttentionAutoPartitioner(unittest.TestCase):
             'transpose2',
             'reshape2',
             'transpose2',
-            'matmul',
+            'matmul_v2',
+            "scale",
             'softmax',
             'dropout',
             'matmul_v2',
...@@ -976,7 +975,8 @@ class TestAttentionAutoPartitioner(unittest.TestCase):
             'transpose2',
             'reshape2',
             'transpose2',
-            'matmul',
+            'matmul_v2',
+            "scale",
             'softmax',
             'dropout',
             'matmul_v2',
...@@ -1166,9 +1166,8 @@ class DecoderLayer(nn.Layer):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
-        )
+        product = tensor.matmul(x=q, y=k, transpose_y=True)
+        product = tensor.scale(product, scale=self.head_dim**-0.5)
         if self.attn_mask is not None:
             product = product + self.attn_mask
...@@ -1347,7 +1346,8 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
             'transpose2',
             'reshape2',
             'transpose2',
-            'matmul',
+            'matmul_v2',
+            "scale",
             'softmax',
             'dropout',
             'matmul_v2',
...@@ -1399,15 +1399,15 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
         # check distribured attr
-        serial_op_idx = [0, 5, 9, 11, 23, 28, 31]
+        serial_op_idx = [0, 5, 9, 11, 24, 29, 32]
         dist_op_idx = [
             [0, 1],
             [6, 7],
             [11, 12],
             [14, 15],
-            [27, 28],
-            [33, 34],
-            [37, 38],
+            [28, 29],
+            [34, 35],
+            [38, 39],
         ]
         self.assertTrue(
             distributed_attr_check_for_dist_op(
...@@ -1500,7 +1500,8 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
             'transpose2',
             'reshape2',
             'transpose2',
-            'matmul',
+            'matmul_v2',
+            "scale",
             'softmax',
             'dropout',
             'matmul_v2',
......
...@@ -256,9 +256,8 @@ class MultiHeadAttention(nn.Layer): ...@@ -256,9 +256,8 @@ class MultiHeadAttention(nn.Layer):
query, key, value, use_cache, cache query, key, value, use_cache, cache
) )
# scale dot product attention # scale dot product attention
product = layers.matmul( product = tensor.matmul(x=q, y=k, transpose_y=True)
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 product = tensor.scale(product, scale=self.head_dim**-0.5)
)
if attn_mask is not None: if attn_mask is not None:
product = product + attn_mask product = product + attn_mask
......
...@@ -103,6 +103,7 @@ def mlp_forward(train_program, start_program): ...@@ -103,6 +103,7 @@ def mlp_forward(train_program, start_program):
return loss, train_program, start_program return loss, train_program, start_program
@unittest.skipIf(True, "to delete later")
class TestCompatible(unittest.TestCase): class TestCompatible(unittest.TestCase):
def test_matmulv2_matmul_2_compatible(self): def test_matmulv2_matmul_2_compatible(self):
valid_op_dist_attr_list = [] valid_op_dist_attr_list = []
......
...@@ -26,7 +26,6 @@ from paddle.distributed.auto_parallel.dist_op import DistributedOperator ...@@ -26,7 +26,6 @@ from paddle.distributed.auto_parallel.dist_op import DistributedOperator
from paddle.distributed.auto_parallel.operators.common import ( from paddle.distributed.auto_parallel.operators.common import (
get_distributed_operator_impl_container, get_distributed_operator_impl_container,
) )
from paddle.fluid import layers
paddle.enable_static() paddle.enable_static()
device = "gpu" if core.is_compiled_with_cuda() else "cpu" device = "gpu" if core.is_compiled_with_cuda() else "cpu"
...@@ -85,7 +84,7 @@ def mlp_forward(train_program, start_program): ...@@ -85,7 +84,7 @@ def mlp_forward(train_program, start_program):
shape=[hidden_size, hidden_size], shape=[hidden_size, hidden_size],
dtype='float32', dtype='float32',
) )
input = layers.matmul(x=input, y=matmulinput) input = paddle.matmul(x=input, y=matmulinput)
label = static.data( label = static.data(
name="label", shape=[batch_size, 1], dtype='float32' name="label", shape=[batch_size, 1], dtype='float32'
) )
......
...@@ -22,7 +22,6 @@ from op_test import OpTest, skip_check_grad_ci ...@@ -22,7 +22,6 @@ from op_test import OpTest, skip_check_grad_ci
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.layers as layers
@skip_check_grad_ci( @skip_check_grad_ci(
...@@ -77,7 +76,7 @@ class TestCholeskyOp(OpTest): ...@@ -77,7 +76,7 @@ class TestCholeskyOp(OpTest):
dtype=root_data.dtype, shape=root_data.shape dtype=root_data.dtype, shape=root_data.shape
) )
root_t = paddle.transpose(root, self.trans_dims) root_t = paddle.transpose(root, self.trans_dims)
x = layers.matmul(x=root, y=root_t) + 1e-05 x = paddle.matmul(x=root, y=root_t) + 1e-05
out = paddle.cholesky(x, upper=self.attrs["upper"]) out = paddle.cholesky(x, upper=self.attrs["upper"])
grad_check(root, out, x_init=root_data, place=place) grad_check(root, out, x_init=root_data, place=place)
......
...@@ -414,9 +414,7 @@ class TestFakeInit(TranspilerTest): ...@@ -414,9 +414,7 @@ class TestFakeInit(TranspilerTest):
input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size]) input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul( neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
input_emb_re, neg_emb_w_re, transpose_y=True
)
neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num]) neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec) neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec)
# nce loss # nce loss
......
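Call sites that never used `alpha`, like the one above, migrate by a direct rename from `fluid.layers.matmul` to `paddle.matmul` with the same transpose flags; broadcasting over the leading batch dimension behaves the same. A small sketch with made-up shapes (names are only loosely modeled on the test):

import paddle

# made-up sizes for illustration
batch, neg_num, emb_size = 4, 5, 8
input_emb_re = paddle.rand([batch, 1, emb_size])
neg_emb_w_re = paddle.rand([batch, neg_num, emb_size])

# batched matmul with transpose_y: [batch, 1, emb] x [batch, emb, neg_num] -> [batch, 1, neg_num]
neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
print(neg_matmul_re.shape)  # [4, 5]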
...@@ -167,7 +167,7 @@ def lm_model( ...@@ -167,7 +167,7 @@ def lm_model(
bias = bias_arr[k] bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1) nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i = paddle.slice( i = paddle.slice(
...@@ -291,7 +291,7 @@ def lm_model( ...@@ -291,7 +291,7 @@ def lm_model(
bias = bias_arr[k] bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1) nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
...@@ -459,7 +459,7 @@ def lm_model( ...@@ -459,7 +459,7 @@ def lm_model(
), ),
) )
projection = layers.matmul(rnn_out, softmax_weight) projection = paddle.matmul(rnn_out, softmax_weight)
projection = paddle.add(projection, softmax_bias) projection = paddle.add(projection, softmax_bias)
projection = paddle.reshape(projection, shape=[-1, vocab_size]) projection = paddle.reshape(projection, shape=[-1, vocab_size])
......
...@@ -21,7 +21,6 @@ import paddle ...@@ -21,7 +21,6 @@ import paddle
import paddle.incubate.nn.functional as incubate_f import paddle.incubate.nn.functional as incubate_f
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import tensor from paddle import tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout, Linear from paddle.nn.layer.common import Dropout, Linear
from paddle.nn.layer.norm import LayerNorm from paddle.nn.layer.norm import LayerNorm
...@@ -192,9 +191,8 @@ class TestFusedAttentionOp(OpTest): ...@@ -192,9 +191,8 @@ class TestFusedAttentionOp(OpTest):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len] # --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul( qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
)
if attn_mask is not None: if attn_mask is not None:
attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype) attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
......
...@@ -19,7 +19,6 @@ import numpy as np ...@@ -19,7 +19,6 @@ import numpy as np
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import _legacy_C_ops, tensor from paddle import _legacy_C_ops, tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout from paddle.nn.layer.common import Dropout
from paddle.nn.layer.norm import LayerNorm from paddle.nn.layer.norm import LayerNorm
...@@ -388,9 +387,8 @@ class TestFusedMultiTransformerInt8Op(unittest.TestCase): ...@@ -388,9 +387,8 @@ class TestFusedMultiTransformerInt8Op(unittest.TestCase):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len] # --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul( qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
)
if self.debug: if self.debug:
print('qk out is') print('qk out is')
......
...@@ -281,9 +281,8 @@ class TestFusedMultiTransformerOp(OpTest): ...@@ -281,9 +281,8 @@ class TestFusedMultiTransformerOp(OpTest):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len] # --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul( qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
)
if self.debug: if self.debug:
print('qk out is') print('qk out is')
......
...@@ -1001,7 +1001,7 @@ class TestDygraphGuardWithError(unittest.TestCase): ...@@ -1001,7 +1001,7 @@ class TestDygraphGuardWithError(unittest.TestCase):
with self.assertRaisesRegexp( with self.assertRaisesRegexp(
TypeError, "Please use `with fluid.dygraph.guard()" TypeError, "Please use `with fluid.dygraph.guard()"
): ):
y = fluid.layers.matmul(x, x) y = paddle.matmul(x, x)
def test_without_guard(self): def test_without_guard(self):
with _test_eager_guard(): with _test_eager_guard():
......
...@@ -46,9 +46,9 @@ class GraphConv(fluid.Layer): ...@@ -46,9 +46,9 @@ class GraphConv(fluid.Layer):
) )
def forward(self, features, adj): def forward(self, features, adj):
support = fluid.layers.matmul(features, self.weight) support = paddle.matmul(features, self.weight)
# TODO(panyx0718): sparse matmul? # TODO(panyx0718): sparse matmul?
return fluid.layers.matmul(adj, support) + self.bias return paddle.matmul(adj, support) + self.bias
class GCN(fluid.Layer): class GCN(fluid.Layer):
......
...@@ -64,7 +64,7 @@ class SimpleNet(fluid.Layer): ...@@ -64,7 +64,7 @@ class SimpleNet(fluid.Layer):
def forward(self, input, label): def forward(self, input, label):
x_emb = self.embedding(input) x_emb = self.embedding(input)
projection = fluid.layers.matmul( projection = paddle.matmul(
x_emb, paddle.transpose(self.embedding.weight, perm=[1, 0]) x_emb, paddle.transpose(self.embedding.weight, perm=[1, 0])
) )
projection = paddle.add(projection, self.softmax_bias) projection = paddle.add(projection, self.softmax_bias)
......
...@@ -109,7 +109,7 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -109,7 +109,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k] bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1) nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split( i, j, f, o = fluid.layers.split(
...@@ -225,7 +225,7 @@ class PtbModel(fluid.Layer): ...@@ -225,7 +225,7 @@ class PtbModel(fluid.Layer):
rnn_out = paddle.reshape( rnn_out = paddle.reshape(
rnn_out, shape=[-1, self.num_steps, self.hidden_size] rnn_out, shape=[-1, self.num_steps, self.hidden_size]
) )
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias) projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy( loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -104,7 +104,7 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -104,7 +104,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k] bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1) nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split( i, j, f, o = fluid.layers.split(
...@@ -221,7 +221,7 @@ class PtbModel(fluid.Layer): ...@@ -221,7 +221,7 @@ class PtbModel(fluid.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size] rnn_out, shape=[-1, self.num_steps, self.hidden_size]
) )
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias) projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy( loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -105,7 +105,7 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -105,7 +105,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k] bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1) nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split( i, j, f, o = fluid.layers.split(
...@@ -222,7 +222,7 @@ class PtbModel(fluid.Layer): ...@@ -222,7 +222,7 @@ class PtbModel(fluid.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size] rnn_out, shape=[-1, self.num_steps, self.hidden_size]
) )
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias) projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy( loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -72,9 +72,9 @@ class SimpleNet(fluid.Layer): ...@@ -72,9 +72,9 @@ class SimpleNet(fluid.Layer):
def forward(self, input, label): def forward(self, input, label):
x_emb = self.embedding(input) x_emb = self.embedding(input)
fc = fluid.layers.matmul(x_emb, self.softmax_weight) fc = paddle.matmul(x_emb, self.softmax_weight)
fc = paddle.add(fc, self.softmax_bias) fc = paddle.add(fc, self.softmax_bias)
projection = fluid.layers.matmul( projection = paddle.matmul(
fc, paddle.transpose(self.embedding.weight, perm=[1, 0]) fc, paddle.transpose(self.embedding.weight, perm=[1, 0])
) )
projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
......
...@@ -495,12 +495,12 @@ class MultiHeadAttentionLayer(Layer): ...@@ -495,12 +495,12 @@ class MultiHeadAttentionLayer(Layer):
transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention # scale dot product attention
product = fluid.layers.matmul( product = paddle.matmul(
x=transpose_q, x=transpose_q,
y=transpose_k, y=transpose_k,
transpose_y=True, transpose_y=True,
alpha=self._d_model**-0.5,
) )
product = paddle.scale(product, scale=self._d_model**-0.5)
if attn_bias is not None: if attn_bias is not None:
product += attn_bias product += attn_bias
weights = paddle.nn.functional.softmax(product) weights = paddle.nn.functional.softmax(product)
...@@ -511,9 +511,9 @@ class MultiHeadAttentionLayer(Layer): ...@@ -511,9 +511,9 @@ class MultiHeadAttentionLayer(Layer):
seed=ModelHyperParams.dropout_seed, seed=ModelHyperParams.dropout_seed,
is_test=False, is_test=False,
) )
out = fluid.layers.matmul(weights_droped, transpose_v) out = paddle.matmul(weights_droped, transpose_v)
else: else:
out = fluid.layers.matmul(weights, transpose_v) out = paddle.matmul(weights, transpose_v)
# combine heads # combine heads
if len(out.shape) != 4: if len(out.shape) != 4:
...@@ -1003,7 +1003,7 @@ class WrapDecoderLayer(Layer): ...@@ -1003,7 +1003,7 @@ class WrapDecoderLayer(Layer):
) )
if self._weight_sharing: if self._weight_sharing:
predict = fluid.layers.matmul( predict = paddle.matmul(
x=dec_output_reshape, x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb.weight, y=self._prepare_decoder_layer._input_emb.weight,
transpose_y=True, transpose_y=True,
......
...@@ -290,7 +290,7 @@ class TestLayer(LayerTest): ...@@ -290,7 +290,7 @@ class TestLayer(LayerTest):
with self.static_graph(): with self.static_graph():
t = layers.data(name='t', shape=[3, 3], dtype='float32') t = layers.data(name='t', shape=[3, 3], dtype='float32')
t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
ret = layers.matmul(t, t2) ret = paddle.matmul(t, t2)
static_ret = self.get_static_graph_result( static_ret = self.get_static_graph_result(
feed={ feed={
't': np.ones([3, 3], dtype='float32'), 't': np.ones([3, 3], dtype='float32'),
...@@ -303,14 +303,14 @@ class TestLayer(LayerTest): ...@@ -303,14 +303,14 @@ class TestLayer(LayerTest):
with _test_eager_guard(): with _test_eager_guard():
t = np.ones([3, 3], dtype='float32') t = np.ones([3, 3], dtype='float32')
t2 = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32')
dy_eager_ret = layers.matmul( dy_eager_ret = paddle.matmul(
base.to_variable(t), base.to_variable(t2) base.to_variable(t), base.to_variable(t2)
) )
dy_eager_ret_value = dy_eager_ret.numpy() dy_eager_ret_value = dy_eager_ret.numpy()
t = np.ones([3, 3], dtype='float32') t = np.ones([3, 3], dtype='float32')
t2 = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32')
dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) dy_ret = paddle.matmul(base.to_variable(t), base.to_variable(t2))
dy_ret_value = dy_ret.numpy() dy_ret_value = dy_ret.numpy()
np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05) np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05)
......
...@@ -19,7 +19,6 @@ from op_test import OpTest ...@@ -19,7 +19,6 @@ from op_test import OpTest
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
...@@ -117,151 +116,6 @@ class Generator: ...@@ -117,151 +116,6 @@ class Generator:
) )
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The inputs type of matmul_op must be Variable.
input1 = 12
self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
# The inputs dtype of matmul_op must be float32, float64.
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32"
)
self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16"
)
fluid.layers.matmul(input3, input3)
# Negative dimension generation
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)]
)
return shapes
# Build program with inputs sizes that contain negative numbers
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype("float32")
Y = np.random.random(obj.shape_Y).astype("float32")
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype='float32')
y = fluid.data(name='y', shape=shape_y, dtype='float32')
output = fluid.layers.matmul(
x, y, obj.transpose_X, obj.transpose_Y
)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.CPUPlace())
(res,) = exe.run(
fluid.default_main_program(),
feed={'x': X, 'y': Y},
fetch_list=[output],
)
np.allclose(res, Ref, atol=1e-5)
# Generate program api cases for all negative possibilities
def api_test(dim_x, dim_y, trans_x, trans_y):
test_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y
)
shape_x, shape_y = generate_compatible_shapes(
dim_x, dim_y, trans_x, trans_y
)
globals()[test_name] = type(
test_name,
(unittest.TestCase,),
{
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
'test_propram': test_negative_dims_program,
},
)
# Generate operators cases for all possibilities
def inject_test(dim_x, dim_y, trans_x, trans_y):
test_name = 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y
)
shape_x, shape_y = generate_compatible_shapes(
dim_x, dim_y, trans_x, trans_y
)
globals()[test_name] = type(
test_name,
(Generator, OpTest),
{
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
},
)
for dim_X in (1, 2, 3):
for dim_Y in (1, 2, 3):
for transose_x in (False, True):
for transose_y in (False, True):
inject_test(dim_X, dim_Y, transose_x, transose_y)
api_test(dim_X, dim_Y, transose_x, transose_y)
# Test case more batch_size and N, M, K
def generate_compatible_shapes_batch(
dim_X, dim_Y, transpose_X, transpose_Y, batch_size
):
BATCH_SIZE = 2
M = 3
N = 4
K = 5
if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
K = 1
if dim_X == 1:
if transpose_X:
shape_X = [M]
else:
shape_X = [K]
if dim_Y == 1:
if transpose_Y:
shape_Y = [N]
else:
shape_Y = [K]
if dim_X >= 2:
if transpose_X:
shape_X = [K, M]
else:
shape_X = [M, K]
if dim_X == 3:
shape_X = [BATCH_SIZE] + shape_X
if dim_Y >= 2:
if transpose_Y:
shape_Y = [N, K]
else:
shape_Y = [K, N]
if dim_Y == 3:
shape_Y = [BATCH_SIZE] + shape_Y
return shape_X, shape_Y
# Test case n-dim # Test case n-dim
def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y): def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y):
M = 2 M = 2
......
...@@ -94,7 +94,7 @@ class TestMatmulDoubleGradCheck(unittest.TestCase): ...@@ -94,7 +94,7 @@ class TestMatmulDoubleGradCheck(unittest.TestCase):
y = paddle.create_parameter( y = paddle.create_parameter(
dtype=typename, shape=self.y_shape, name='y' dtype=typename, shape=self.y_shape, name='y'
) )
out = layers.matmul( out = paddle.matmul(
x, y, self.transpose_x, self.transpose_y, name='out' x, y, self.transpose_x, self.transpose_y, name='out'
) )
......
...@@ -616,13 +616,13 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1): ...@@ -616,13 +616,13 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
rnn = layers.StaticRNN() rnn = layers.StaticRNN()
def dot_attention(query, memory): def dot_attention(query, memory):
attn = layers.matmul(query, memory, transpose_y=True) attn = paddle.matmul(query, memory, transpose_y=True)
weight = paddle.nn.functional.softmax(attn) weight = paddle.nn.functional.softmax(attn)
weight_memory = layers.matmul(weight, memory) weight_memory = paddle.matmul(weight, memory)
return weight_memory, weight return weight_memory, weight
y = layers.matmul(emb, w1) y = paddle.matmul(emb, w1)
with rnn.step(): with rnn.step():
pre_h = rnn.memory( pre_h = rnn.memory(
shape=(self.sent_len, self.input_dim), shape=(self.sent_len, self.input_dim),
...@@ -631,7 +631,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1): ...@@ -631,7 +631,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
) )
step_in = rnn.step_input(x) step_in = rnn.step_input(x)
concat_in = layers.concat([step_in, pre_h], 1) concat_in = layers.concat([step_in, pre_h], 1)
new_h = layers.matmul(concat_in, w2) new_h = paddle.matmul(concat_in, w2)
new_h = layers.unsqueeze(new_h, [1]) new_h = layers.unsqueeze(new_h, [1])
new_h, _ = dot_attention(new_h, y) new_h, _ = dot_attention(new_h, y)
new_h = paddle.squeeze(new_h, [1]) new_h = paddle.squeeze(new_h, [1])
......
...@@ -71,14 +71,14 @@ class DecoderCell(layers.RNNCell): ...@@ -71,14 +71,14 @@ class DecoderCell(layers.RNNCell):
query = layers.fc( query = layers.fc(
hidden, size=encoder_output.shape[-1], bias_attr=False hidden, size=encoder_output.shape[-1], bias_attr=False
) )
attn_scores = layers.matmul( attn_scores = paddle.matmul(
layers.unsqueeze(query, [1]), encoder_output, transpose_y=True layers.unsqueeze(query, [1]), encoder_output, transpose_y=True
) )
if encoder_padding_mask is not None: if encoder_padding_mask is not None:
attn_scores = paddle.add(attn_scores, encoder_padding_mask) attn_scores = paddle.add(attn_scores, encoder_padding_mask)
attn_scores = paddle.nn.functional.softmax(attn_scores) attn_scores = paddle.nn.functional.softmax(attn_scores)
attn_out = paddle.squeeze( attn_out = paddle.squeeze(
layers.matmul(attn_scores, encoder_output), [1] paddle.matmul(attn_scores, encoder_output), [1]
) )
attn_out = layers.concat([attn_out, hidden], 1) attn_out = layers.concat([attn_out, hidden], 1)
attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False) attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False)
......
...@@ -115,7 +115,7 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -115,7 +115,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k] bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1) nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1) gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias) gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split( i, j, f, o = fluid.layers.split(
...@@ -234,7 +234,7 @@ class PtbModel(fluid.Layer): ...@@ -234,7 +234,7 @@ class PtbModel(fluid.Layer):
rnn_out = paddle.reshape( rnn_out = paddle.reshape(
rnn_out, shape=[-1, self.num_steps, self.hidden_size] rnn_out, shape=[-1, self.num_steps, self.hidden_size]
) )
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias) projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy( loss = paddle.nn.functional.softmax_with_cross_entropy(
......
...@@ -163,13 +163,13 @@ def multi_head_attention( ...@@ -163,13 +163,13 @@ def multi_head_attention(
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
scaled_q = paddle.scale(x=q, scale=d_model**-0.5) scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True) product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax(paddle.add(x=product, y=attn_bias)) weights = __softmax(paddle.add(x=product, y=attn_bias))
if dropout_rate: if dropout_rate:
weights = layers.dropout( weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False weights, dropout_prob=dropout_rate, is_test=False
) )
out = layers.matmul(weights, v) out = paddle.matmul(weights, v)
return out return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
......
...@@ -31,7 +31,6 @@ import paddle ...@@ -31,7 +31,6 @@ import paddle
import paddle.incubate.nn.functional as incubate_f import paddle.incubate.nn.functional as incubate_f
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import tensor from paddle import tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout, Linear from paddle.nn.layer.common import Dropout, Linear
from paddle.nn.layer.norm import LayerNorm from paddle.nn.layer.norm import LayerNorm
...@@ -164,7 +163,7 @@ class XPUTestFusedAttentionOp(XPUOpTestWrapper): ...@@ -164,7 +163,7 @@ class XPUTestFusedAttentionOp(XPUOpTestWrapper):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len] # --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul( qk_out = tensor.matmul(
x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True
) )
......
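Note that this XPU test folds the scale into q before the matmul, while the other tests in this commit scale the product afterwards; both forms compute the same attention logits, since (a*q)k^T = a*(q k^T). A short sketch of that assumption (shapes are illustrative only; results match up to float rounding):

import paddle

# illustrative shapes; [B, n_head, seq_len, head_dim]
q = paddle.rand([2, 4, 8, 16])
k = paddle.rand([2, 4, 8, 16])
scale = 16 ** -0.5

pre = paddle.matmul(q * scale, k, transpose_y=True)                      # scale folded into q (this test)
post = paddle.scale(paddle.matmul(q, k, transpose_y=True), scale=scale)  # scale applied to the product
assert paddle.allclose(pre, post).item()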
...@@ -27,7 +27,6 @@ from xpu.get_test_cover_info import ( ...@@ -27,7 +27,6 @@ from xpu.get_test_cover_info import (
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
...@@ -135,71 +134,11 @@ def generate_compatible_shapes_2(dim, transpose_X, transpose_Y): ...@@ -135,71 +134,11 @@ def generate_compatible_shapes_2(dim, transpose_X, transpose_Y):
return shape_X, shape_Y return shape_X, shape_Y
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)]
)
return shapes
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype(obj.in_type)
Y = np.random.random(obj.shape_Y).astype(obj.in_type)
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype=obj.in_type_str)
y = fluid.data(name='y', shape=shape_y, dtype=obj.in_type_str)
output = fluid.layers.matmul(
x, y, obj.transpose_X, obj.transpose_Y
)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.XPUPlace(0))
(res,) = exe.run(
fluid.default_main_program(),
feed={'x': X, 'y': Y},
fetch_list=[output],
)
np.allclose(res, Ref, atol=1e-3)
class XPUTestMatmulOpErr(XPUOpTestWrapper): class XPUTestMatmulOpErr(XPUOpTestWrapper):
def __init__(self): def __init__(self):
self.op_name = "matmul" self.op_name = "matmul"
self.use_dynamic_create_class = False self.use_dynamic_create_class = False
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The inputs type of matmul_op must be Variable.
input1 = 12
self.assertRaises(
TypeError, fluid.layers.matmul, input1, input1
)
# The inputs dtype of matmul_op must be float32, float16
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32"
)
self.assertRaises(
TypeError, fluid.layers.matmul, input2, input2
)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16"
)
fluid.layers.matmul(input3, input3)
class API_TestMm(unittest.TestCase): class API_TestMm(unittest.TestCase):
def test_out(self): def test_out(self):
with fluid.program_guard(fluid.Program()): with fluid.program_guard(fluid.Program()):
...@@ -399,39 +338,6 @@ class XPUTestMatmulOp1(XPUOpTestWrapper): ...@@ -399,39 +338,6 @@ class XPUTestMatmulOp1(XPUOpTestWrapper):
return base_class, classes return base_class, classes
class XPUTestMatmulOp2(XPUOpTestWrapper):
def __init__(self):
self.op_name = "matmul"
self.use_dynamic_create_class = True
def dynamic_create_class(self):
base_class = unittest.TestCase
classes = []
xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]]
batch_size = [2, 4, 5, 10, 50, 100, 300]
for dims in xpu_support_dims_list:
dim_X = dims[0]
dim_Y = dims[1]
for transose_x in [True, False]:
for transose_y in [True, False]:
for batch in batch_size:
class_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.format(
dim_X, dim_Y, transose_x, transose_y, batch
)
shape_x, shape_y = generate_compatible_shapes(
dim_X, dim_Y, transose_x, transose_y, batch
)
attr_dict = {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': transose_x,
'transpose_Y': transose_y,
'test_propram': test_negative_dims_program,
}
classes.append([class_name, attr_dict])
return base_class, classes
class XPUTestMatmulOp3(XPUOpTestWrapper): class XPUTestMatmulOp3(XPUOpTestWrapper):
def __init__(self): def __init__(self):
self.op_name = "matmul" self.op_name = "matmul"
...@@ -464,7 +370,6 @@ support_types = get_xpu_op_support_types('matmul') ...@@ -464,7 +370,6 @@ support_types = get_xpu_op_support_types('matmul')
for stype in support_types: for stype in support_types:
create_test_class(globals(), XPUTestMatmulOpErr, stype) create_test_class(globals(), XPUTestMatmulOpErr, stype)
create_test_class(globals(), XPUTestMatmulOp1, stype) create_test_class(globals(), XPUTestMatmulOp1, stype)
create_test_class(globals(), XPUTestMatmulOp2, stype)
create_test_class(globals(), XPUTestMatmulOp3, stype) create_test_class(globals(), XPUTestMatmulOp3, stype)
if __name__ == "__main__": if __name__ == "__main__":
......