Unverified commit 8fb829ba authored by kangguangli, committed by GitHub

Remove fluid matmul (#47988)

* remove layers.matmul in nets.py

* remove layers.matmul in rnn_impl/test_quantization_pass/auto_parallel_gpt_model/test_auto_parallel_completion_gpt

* remove layers.matmul in other files

* fix

* fix

* remove layers.matmul itself

* remove ref in CMakeLists.txt and tools directory

* remove matmul in fluid.layers.nn.py

* remove matmul in fluid.dygraph.rnn.py && restore test_matmul_op.py

* replace matmul in fluid.dygraph.rnn.py && clean api_test in test_matmul_op.py

* fix error && restore empty test_auto_search_dist_matmul_op.py

* fix check in test_auto_parallel_partitioner.py

* fix test_dist_matmul && test_flags_mkldnn_ops_on_off

* fix test_fused_attention_op_xpu.py && test_matmul_op_xpu.py

* remove test_auto_search_dist_matmul_op.py

* remove layers.matmul in auto_parallel_gpt_model.py && fix doc in fluid/io.py

* fix for matmul_grad

* fix codestyle

* fix codestyle

* resolve conflict errors

* restore unit test file but do not compile it, for later removal

* fix codestyle

* fix wrong unittest skip

* fix unittest deletion

* fix scale cost

* fix scale cost

* resolve conflict errors

* resolve conflict errors
Co-authored-by: jakpiase <jakpia21@gmail.com>
Parent 1976cc4b
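
The change repeated throughout this diff replaces the deprecated fluid.layers.matmul, which accepted an alpha scaling factor, with paddle.matmul followed by an explicit scaling step, since paddle.matmul has no alpha argument. A minimal sketch of the pattern, assuming Paddle 2.x dynamic graph mode and illustrative tensor shapes:

import paddle

x = paddle.rand([2, 8, 16])
y = paddle.rand([2, 16, 8])
head_dim = 16

# Before: out = fluid.layers.matmul(x, y, transpose_y=False, alpha=head_dim**-0.5)
# After: the matrix multiply and the scaling are two separate ops.
out = paddle.matmul(x, y, transpose_x=False, transpose_y=False)
out = paddle.scale(out, scale=head_dim**-0.5)

The same substitution appears below using paddle.scale, paddle.multiply, or tensor.scale depending on the file; all of them apply the former alpha factor as a separate operator after the matmul.
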
......@@ -101,9 +101,13 @@ void MatmulGradKernel(const Context &dev_ctx,
if (x_dims.size() != ndims) {
x_dims = ExtendDimsWithOnes(x_dims, ndims);
} else if (y_dims.size() != ndims) {
}
if (y_dims.size() != ndims) {
y_dims = ExtendDimsWithOnes(y_dims, ndims);
}
if (dout_dims.size() != ndims) {
dout_dims = ExtendDimsWithOnes(dout_dims, ndims);
}
// in broadcasting scenario new memory is required because
// reduce sum must be calculated upon broadcasted dims
......@@ -150,7 +154,9 @@ void MatmulGradKernel(const Context &dev_ctx,
}
dx->Resize(x.dims());
dx->set_mem_desc(x.mem_desc().reshape(vectorize(x.dims())));
dy->Resize(y.dims());
dy->set_mem_desc(y.mem_desc().reshape(vectorize(y.dims())));
}
template <typename T, typename Context>
......
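
The oneDNN matmul_grad change above turns the rank extension of x_dims and y_dims into independent checks instead of an if/else-if chain, so both operands can be padded to a common rank before the broadcast-aware reduce_sum. A rough Python illustration of the intent, where extend_dims_with_ones is a hypothetical stand-in for the C++ helper:

def extend_dims_with_ones(dims, ndims):
    # Left-pad a shape with 1s until it reaches the target rank.
    return [1] * (ndims - len(dims)) + list(dims)

x_dims, y_dims, ndims = [3, 4], [2, 4, 5], 3
if len(x_dims) != ndims:
    x_dims = extend_dims_with_ones(x_dims, ndims)
if len(y_dims) != ndims:  # no longer an "else if": both shapes may need padding
    y_dims = extend_dims_with_ones(y_dims, ndims)
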
......@@ -151,7 +151,7 @@ class BasicGRUUnit(Layer):
def forward(self, input, pre_hidden):
concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = paddle.matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = paddle.add(gate_input, self._gate_bias)
......@@ -160,7 +160,7 @@ class BasicGRUUnit(Layer):
r_hidden = r * pre_hidden
candidate = layers.matmul(
candidate = paddle.matmul(
layers.concat([input, r_hidden], 1), self._candidate_weight
)
candidate = paddle.add(candidate, self._candidate_bias)
......@@ -874,7 +874,7 @@ class BasicLSTMUnit(Layer):
def forward(self, input, pre_hidden, pre_cell):
concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
......
......@@ -76,7 +76,7 @@ def residual_block(num, quant_skip_pattern=None):
matmul_weight = paddle.create_parameter(
shape=[1, 16, 32, 32], dtype='float32'
)
hidden = fluid.layers.matmul(hidden, matmul_weight, True, True)
hidden = paddle.matmul(hidden, matmul_weight, True, True)
if quant_skip_pattern:
with fluid.name_scope(quant_skip_pattern):
pool = fluid.layers.pool2d(
......@@ -724,7 +724,7 @@ def quant_dequant_residual_block(num, quant_skip_pattern=None):
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short))
hidden = fluid.layers.matmul(hidden, data2, True, True)
hidden = paddle.matmul(hidden, data2, True, True)
if isinstance(quant_skip_pattern, str):
with fluid.name_scope(quant_skip_pattern):
pool1 = fluid.layers.pool2d(
......
......@@ -17,11 +17,11 @@ from . import Layer
from ..layers import (
concat,
fill_constant,
matmul,
elementwise_mul,
split,
)
import copy
import paddle
__all__ = ['LSTMCell', 'GRUCell']
......@@ -215,11 +215,12 @@ class LSTMCell(Layer):
def forward(self, input, pre_hidden, pre_cell):
if self._use_cudnn_impl:
igates = matmul(input, y=self._weight_ih, transpose_y=True)
igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
igates = paddle.add(igates, self._bias_ih)
hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
hgates = paddle.matmul(
pre_hidden, self._weight_hh, transpose_y=True
)
hgates = paddle.add(hgates, self._bias_hh)
chunked_igates = split(igates, num_or_sections=4, dim=1)
chunked_hgates = split(hgates, num_or_sections=4, dim=1)
......@@ -241,7 +242,7 @@ class LSTMCell(Layer):
else:
concat_input_hidden = concat([input, pre_hidden], 1)
gate_input = matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.add(gate_input, self._bias)
i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
......@@ -461,10 +462,11 @@ class GRUCell(Layer):
def forward(self, input, pre_hidden):
if self._use_cudnn_impl:
igates = matmul(input, y=self._weight_ih, transpose_y=True)
igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True)
igates = paddle.add(igates, self._bias_ih)
hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
hgates = paddle.matmul(
pre_hidden, self._weight_hh, transpose_y=True
)
hgates = paddle.add(hgates, self._bias_hh)
chunked_igates = split(igates, num_or_sections=3, dim=1)
......@@ -486,7 +488,9 @@ class GRUCell(Layer):
concat_input_hidden = concat([input, pre_hidden], 1)
gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = paddle.matmul(
x=concat_input_hidden, y=self._gate_weight
)
gate_input = paddle.add(gate_input, self._gate_bias)
gate_input = self._gate_activation(gate_input)
......@@ -494,7 +498,7 @@ class GRUCell(Layer):
r_hidden = r * pre_hidden
candidate = matmul(
candidate = paddle.matmul(
concat([input, r_hidden], 1), self._candidate_weight
)
candidate = paddle.add(candidate, self._candidate_bias)
......
......@@ -73,7 +73,6 @@ __all__ = [
'dropout',
'split',
'l2_normalize',
'matmul',
'row_conv',
'layer_norm',
'spectral_norm',
......@@ -2589,154 +2588,6 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
return out
@deprecated(since="2.0.0", update_to="paddle.matmul")
def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
"""
Applies matrix multiplication to two tensors.
Currently, the input tensors' rank can be any, but when the rank of any
inputs is bigger than 3, this two inputs' rank should be equal.
The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
- If a transpose flag is specified, the last two dimensions of the tensor
are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
:math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
:math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
:math:`[1, D]` in transposed form.
- After transpose, the two tensors are 2-D or n-D and matrix multiplication
performs in the following way.
- If both are 2-D, they are multiplied like conventional matrices.
- If either is n-D, it is treated as a stack of matrices residing in the
last two dimensions and a batched matrix multiply supporting broadcast
applies on the two tensors.
Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
nontransposed, the prepended or appended dimension :math:`1` will be
removed after matrix multiplication.
Args:
x (Variable): The input variable which is a Tensor or LoDTensor.
y (Variable): The input variable which is a Tensor or LoDTensor.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
alpha (float): The scale of output. Default 1.0.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The product Tensor (or LoDTensor) variable.
Examples:
.. code-block:: python
# Examples to clarify shapes of the inputs and output
# x: [B, ..., M, K], y: [B, ..., K, N]
# fluid.layers.matmul(x, y) # out: [B, ..., M, N]
# x: [B, M, K], y: [B, K, N]
# fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K, N]
# fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [M, K], y: [K, N]
# fluid.layers.matmul(x, y) # out: [M, N]
# x: [B, M, K], y: [K]
# fluid.layers.matmul(x, y) # out: [B, M]
# x: [K], y: [K]
# fluid.layers.matmul(x, y) # out: [1]
# x: [M], y: [N]
# fluid.layers.matmul(x, y, True, True) # out: [M, N]
import paddle
import paddle.fluid as fluid
paddle.enable_static()
x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32')
y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32')
out = fluid.layers.matmul(x, y, True, True)
"""
if _non_static_mode():
out = _varbase_creator(dtype=x.dtype)
_legacy_C_ops.matmul(
x,
y,
out,
'transpose_X',
transpose_x,
'transpose_Y',
transpose_y,
'alpha',
float(alpha),
)
return out
def __check_input(x, y):
var_names = {'x': x, 'y': y}
for name, val in var_names.items():
check_variable_and_dtype(
val, name, ['float16', 'float32', 'float64'], 'matmul'
)
x_shape = list(x.shape)
y_shape = list(y.shape)
if len(x_shape) == 1:
x_shape = [1] + x_shape
if len(y_shape) == 1:
y_shape = y_shape + [1]
# check the inner 2 dimensions
if transpose_x:
x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
if transpose_y:
y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
if x_shape[-1] != y_shape[-2]:
assert (x_shape[-1] == -1) or (y_shape[-2] == -1), (
"After performing an optional transpose, Input X's width should be "
"equal to Y's width for multiplication "
"prerequisites. But received X's shape: %s, Y's shape: %s\n"
% (x_shape, y_shape)
)
if len(y_shape) > 2 and len(x_shape) > 2:
for i, dim_x in enumerate(x_shape[:-2]):
# don't check neg shape
if dim_x < 0 or y_shape[i] < 0:
continue
if dim_x != y_shape[i]:
raise ValueError(
"When the matrix is larger than 2 dimensions, the higher "
"dimensional values of the two matrices need to be equal. "
"But received x_shape[%d] != y_shape[%d]. X's shape: %s, "
"Y's shape: %s.\n" % (i, i, x_shape, y_shape)
)
attrs = {
'transpose_X': transpose_x,
'transpose_Y': transpose_y,
'alpha': float(alpha),
}
__check_input(x, y)
helper = LayerHelper('matmul', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='matmul',
inputs={'X': x, 'Y': y},
outputs={'Out': out},
attrs=attrs,
)
return out
@templatedoc()
def row_conv(input, future_context_size, param_attr=None, act=None):
"""
......
......@@ -621,7 +621,7 @@ def scaled_dot_product_attention(
key_dim_per_head = keys.shape[-1] // num_heads
scaled_q = paddle.scale(x=q, scale=key_dim_per_head**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
x = paddle.reshape(x=product, shape=[-1, product.shape[-1]])
x = paddle.nn.functional.softmax(x)
......@@ -631,5 +631,5 @@ def scaled_dot_product_attention(
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False
)
ctx_multiheads = layers.matmul(weights, v)
ctx_multiheads = paddle.matmul(weights, v)
return __combine_heads(ctx_multiheads)
......@@ -84,9 +84,7 @@ def matmul_dp2mp2(init_x, init_y, trans_x, trans_y):
y = init_y(trans_y)
x.stop_gradient = False
y.stop_gradient = False
out = paddle.fluid.layers.matmul(
x, y, transpose_x=trans_x, transpose_y=trans_y
)
out = paddle.matmul(x, y, transpose_x=trans_x, transpose_y=trans_y)
loss = paddle.mean(out)
return main_program, start_program, loss
......@@ -134,22 +132,22 @@ class TestDistMatmul(unittest.TestCase):
# [0, -1] * [-1, 1] --> [0, 1]
ref_ops = [
"c_identity",
"matmul",
"matmul_v2",
"reduce_mean",
"fill_constant",
"reduce_mean_grad",
"matmul_grad",
"matmul_v2_grad",
]
ops = []
block = main_program.global_block()
for op in block.ops:
ops.append(op.type)
if op.type == "matmul":
if op.type == "matmul_v2":
out_name = op.output('Out')[0]
out_var = block.vars[out_name]
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
assert op_dist_attr.impl_idx == 0
assert op_dist_attr.impl_type == "matmul"
assert op_dist_attr.impl_type == "matmul_v2"
out_dims_mapping = op_dist_attr.get_output_dims_mapping(
out_name
)
......@@ -158,33 +156,33 @@ class TestDistMatmul(unittest.TestCase):
out_var
)
assert tensor_dist_attr.dims_mapping == [0, 1]
if op.type == "matmul_grad":
if op.type == "matmul_v2_grad":
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
assert op_dist_attr.impl_idx == 0
assert op_dist_attr.impl_type == "matmul"
assert op_dist_attr.impl_type == "matmul_v2"
assert ops == ref_ops
def check_row_program(self, main_program, dist_ctx):
# [0, -1, 1] * [1, -1] --> [0, -1, -1]
ref_ops = [
"matmul",
"matmul_v2",
"c_allreduce_sum",
"reduce_mean",
"fill_constant",
"reduce_mean_grad",
"matmul_grad",
"matmul_v2_grad",
]
ops = []
block = main_program.global_block()
for op in block.ops:
ops.append(op.type)
if op.type == "matmul":
if op.type == "matmul_v2":
out_name = op.output('Out')[0]
out_var = block.vars[out_name]
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
assert op_dist_attr.impl_idx == 1
assert op_dist_attr.impl_type == "matmul"
assert op_dist_attr.impl_type == "matmul_v2"
out_dims_mapping = op_dist_attr.get_output_dims_mapping(
out_name
)
......@@ -193,10 +191,10 @@ class TestDistMatmul(unittest.TestCase):
out_var
)
assert tensor_dist_attr.dims_mapping == [0, -1, -1]
if op.type == "matmul_grad":
if op.type == "matmul_v2_grad":
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
assert op_dist_attr.impl_idx == 1
assert op_dist_attr.impl_type == "matmul"
assert op_dist_attr.impl_type == "matmul_v2"
assert ops == ref_ops
......
......@@ -168,9 +168,7 @@ class TestDistOpCost(unittest.TestCase):
auto.ProcessMesh([0, 1], dim_names=["x"]),
[None, "x"],
)
out1 = paddle.fluid.layers.matmul(
out, param1
) # [8, 8] [-1, -1]
out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1]
tmp_param = paddle.create_parameter(
[8, 8], paddle.float32
) # [8, 8] [-1, -1]
......@@ -179,10 +177,8 @@ class TestDistOpCost(unittest.TestCase):
auto.ProcessMesh([0, 1], dim_names=["x"]),
[None, None],
)
tmp_out = paddle.fluid.layers.matmul(out1, tmp_param)
out2 = paddle.fluid.layers.matmul(
tmp_out, param2
) # [8, 4] [-1, 0]
tmp_out = paddle.matmul(out1, tmp_param)
out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0]
out8 = paddle.transpose(out2, [1, 0]) # [4, 8] [0, -1]
......
......@@ -231,8 +231,10 @@ class MultiHeadAttention(nn.Layer):
return self.Cache(key, value)
def core_attn(self, q, k, v, attn_mask):
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.multiply(
product,
paddle.to_tensor(self.head_dim**-0.5, dtype=product.dtype),
)
if attn_mask is not None:
product = product + attn_mask
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
import paddle.nn as nn
from paddle.distributed.fleet.meta_parallel import PipelineLayer
from paddle.fluid.dygraph.layers import Layer
......@@ -54,7 +53,7 @@ class SimpleNet(Layer):
def forward(self, x1, x2, y1):
x_emb = self.word_embeddings(x1)
fc = fluid.layers.matmul(x_emb, self.softmax_weight)
fc = paddle.matmul(x_emb, self.softmax_weight)
fc = paddle.add(fc, self.softmax_bias)
projection = paddle.reshape(fc, shape=[-1, vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......@@ -83,7 +82,7 @@ class MatmulNet(Layer):
def forward(self, args):
x1, x2 = args
fc = fluid.layers.matmul(x1, self.softmax_weight)
fc = paddle.matmul(x1, self.softmax_weight)
return fc, x2
......
......@@ -24,7 +24,6 @@ import paddle.nn as nn
import paddle.nn.functional as F
from paddle import framework
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid import layers
from paddle.fluid.dygraph.layers import Layer
......@@ -73,13 +72,12 @@ class TransformerNet(Layer):
q = self.q_proj(x)
k = self.k_proj(x)
v = self.v_proj(x)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=d_model**-0.5
)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=d_model**-0.5)
weights = F.softmax(product)
weights = F.dropout(weights, 0.2)
tgt = layers.matmul(weights, v)
tgt = paddle.matmul(weights, v)
residual = tgt
tgt = self.norm1(tgt)
tgt = residual + tgt
......
......@@ -23,7 +23,6 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid import layers
from paddle.fluid.dygraph.layers import Layer
......@@ -82,14 +81,13 @@ class TransformerNet(Layer):
q = self.q_proj(x)
k = self.k_proj(x)
v = self.v_proj(x)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=d_model**-0.5
)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=d_model**-0.5)
weights = F.softmax(product + mask)
# TODO(shenliang03) For save/load in PipeLineParallel, can’t support dropout temporarily.
# weights = F.dropout(weights, 0.2)
tgt = layers.matmul(weights, v)
tgt = paddle.matmul(weights, v)
residual = tgt
tgt = self.norm1(tgt)
tgt = residual + tgt
......
......@@ -23,7 +23,6 @@ import paddle.distributed.fleet as fleet
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
from paddle.fluid import layers
from paddle.fluid.dygraph.layers import Layer
......@@ -83,12 +82,11 @@ class TransformerNet(Layer):
q = self.q_proj(x)
k = self.k_proj(x)
v = self.v_proj(x)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=d_model**-0.5
)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=d_model**-0.5)
weights = F.softmax(product + mask)
tgt = layers.matmul(weights, v)
tgt = paddle.matmul(weights, v)
residual = tgt
tgt = self.norm1(tgt)
tgt = residual + tgt
......
......@@ -20,7 +20,6 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
import paddle.nn as nn
from paddle.distributed.fleet.meta_parallel import (
LayerDesc,
......@@ -61,7 +60,7 @@ class SimpleNet(Layer):
def forward(self, x1, x2, y1):
x_emb = self.word_embeddings(x1)
fc = fluid.layers.matmul(x_emb, self.softmax_weight)
fc = paddle.matmul(x_emb, self.softmax_weight)
fc = paddle.add(fc, self.softmax_bias)
projection = paddle.reshape(fc, shape=[-1, vocab_size])
......@@ -97,7 +96,7 @@ class MatmulNet(Layer):
def forward(self, args):
x1, x2 = args
fc = fluid.layers.matmul(x1, self.softmax_weight)
fc = paddle.matmul(x1, self.softmax_weight)
return fc, x2
......
......@@ -334,12 +334,12 @@ class MultiHeadAttentionLayer(Layer):
transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention
product = fluid.layers.matmul(
product = paddle.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5,
)
product = paddle.scale(product, scale=self._d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = paddle.nn.functional.softmax(product)
......@@ -350,9 +350,9 @@ class MultiHeadAttentionLayer(Layer):
seed=ModelHyperParams.dropout_seed,
is_test=False,
)
out = fluid.layers.matmul(weights_droped, transpose_v)
out = paddle.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
out = paddle.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
......@@ -839,7 +839,7 @@ class WrapDecoderLayer(Layer):
)
if self._weight_sharing:
predict = fluid.layers.matmul(
predict = paddle.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb.weight,
transpose_y=True,
......
......@@ -1174,7 +1174,7 @@ def multi_head_attention(
Scaled Dot-Product Attention
"""
scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = paddle.nn.functional.softmax(product)
......@@ -1185,7 +1185,7 @@ def multi_head_attention(
seed=ModelHyperParams.dropout_seed,
is_test=False,
)
out = layers.matmul(weights, v)
out = paddle.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
......@@ -1701,7 +1701,7 @@ def wrap_decoder(
)
# Return logits for training and probs for inference.
if weight_sharing:
predict = layers.matmul(
predict = paddle.matmul(
x=dec_output,
y=fluid.framework._get_var(word_emb_param_names[0]),
transpose_y=True,
......
......@@ -272,7 +272,7 @@ class BertModelLayer(Layer):
emb_out = self.pre_process_layer(emb_out)
self_attn_mask = fluid.layers.matmul(
self_attn_mask = paddle.matmul(
x=input_mask, y=input_mask, transpose_y=True
)
self_attn_mask = paddle.scale(
......@@ -401,7 +401,7 @@ class PretrainModelLayer(Layer):
mask_trans_feat = self.pre_process_layer(mask_trans_feat)
if self._weight_sharing:
fc_out = fluid.layers.matmul(
fc_out = paddle.matmul(
x=mask_trans_feat,
y=self.bert_layer._src_emb._w,
transpose_y=True,
......
......@@ -70,7 +70,7 @@ class BasicLSTMUnit(Layer):
def forward(self, input, pre_hidden, pre_cell):
concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight)
gate_input = paddle.add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
......@@ -697,14 +697,14 @@ class AttentionModel(fluid.dygraph.Layer):
def attention(self, query, enc_output, mask=None):
query = fluid.layers.unsqueeze(query, [1])
memory = self.attn_fc(enc_output)
attn = fluid.layers.matmul(query, memory, transpose_y=True)
attn = paddle.matmul(query, memory, transpose_y=True)
if mask is not None:
attn = paddle.transpose(attn, [1, 0, 2])
attn = paddle.add(attn, mask * 1000000000)
attn = paddle.transpose(attn, [1, 0, 2])
weight = paddle.nn.functional.softmax(attn)
weight_memory = fluid.layers.matmul(weight, memory)
weight_memory = paddle.matmul(weight, memory)
return weight_memory
......
......@@ -282,7 +282,7 @@ class BMN(fluid.dygraph.Layer):
# PEM
xp = paddle.nn.functional.relu(self.p_conv1(x))
# BM layer
xp = fluid.layers.matmul(xp, self.sample_mask)
xp = paddle.matmul(xp, self.sample_mask)
xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale])
xp = self.p_conv3d1(xp)
......
......@@ -66,9 +66,9 @@ class SubNetWithDict(fluid.dygraph.Layer):
v = 0.2 * cache_v + v
cache["k"], cache["v"] = k, v
weight = fluid.layers.matmul(x=q, y=k, transpose_y=True)
weight = paddle.matmul(x=q, y=k, transpose_y=True)
weight = paddle.nn.functional.softmax(weight)
out = fluid.layers.matmul(weight, v)
out = paddle.matmul(weight, v)
return out
......
......@@ -42,7 +42,7 @@ np.random.seed(0)
def simple_func(x, weight_numpy):
x = fluid.dygraph.to_variable(x)
w = fluid.dygraph.to_variable(weight_numpy)
y = fluid.layers.matmul(x, w)
y = paddle.matmul(x, w)
z = paddle.mean(y)
return z
......@@ -51,7 +51,7 @@ def simple_func(x, weight_numpy):
def decorated_simple_func(x, weight_numpy):
x = fluid.dygraph.to_variable(x)
w = fluid.dygraph.to_variable(weight_numpy)
y = fluid.layers.matmul(x, w)
y = paddle.matmul(x, w)
z = paddle.mean(y)
return z
......
......@@ -94,7 +94,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k]
nn = fluid.layers.concat([step_input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split(
......@@ -213,7 +213,7 @@ class PtbModel(fluid.Layer):
x_emb, init_h, init_c
)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias)
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -148,16 +148,14 @@ class MultiHeadAttention(Layer):
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5
)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=self.d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = paddle.nn.functional.softmax(product)
if self.dropout_rate:
weights = layers.dropout(weights, dropout_prob=self.dropout_rate)
out = layers.matmul(weights, v)
out = paddle.matmul(weights, v)
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
......@@ -524,7 +522,7 @@ class WrapDecoder(Layer):
postprocess_cmd,
)
if share_input_output_embed:
self.linear = lambda x: layers.matmul(
self.linear = lambda x: paddle.matmul(
x=x, y=self.word_embedder.word_embedder.weight, transpose_y=True
)
else:
......
......@@ -44,7 +44,6 @@ class TestBase(IPUOpTest):
self.attrs = {
"transpose_x": False,
"transpose_y": False,
"alpha": 1.0,
}
@IPUOpTest.static_graph
......@@ -56,7 +55,7 @@ class TestBase(IPUOpTest):
name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32'
)
out = paddle.fluid.layers.matmul(x, y, **self.attrs)
out = paddle.matmul(x, y, **self.attrs)
self.fetch_list = [out.name]
def run_model(self, exec_mode):
......@@ -75,7 +74,6 @@ class TestCase1(TestBase):
self.attrs = {
"transpose_x": True,
"transpose_y": True,
"alpha": 1.0,
}
......@@ -84,7 +82,6 @@ class TestCase2(TestBase):
self.attrs = {
"transpose_x": True,
"transpose_y": True,
"alpha": 3.14,
}
def set_atol(self):
......@@ -141,7 +138,6 @@ class TestCase6_2(TestCase6):
self.attrs = {
"transpose_x": True,
"transpose_y": True,
"alpha": 1.0,
}
......@@ -154,7 +150,10 @@ class TestCase7(TestBase):
self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
def set_op_attrs(self):
self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125}
self.attrs = {
"transpose_x": False,
"transpose_y": True,
}
class TestCase8(TestBase):
......@@ -179,7 +178,6 @@ class TestCase8_2(TestBase):
self.attrs = {
"transpose_x": True,
"transpose_y": True,
"alpha": 1.0,
}
......
......@@ -67,7 +67,7 @@ class TestWeightSharing(IPUOpTest):
input=y, size=768, param_attr=paddle.fluid.ParamAttr(name="fc")
)
with paddle.static.ipu_shard_guard(index=0, stage=2):
out = paddle.fluid.layers.matmul(
out = paddle.matmul(
x=z,
y=self.main_prog.global_block().var('word_embedding'),
transpose_y=True,
......
......@@ -37,7 +37,7 @@ class TestMKLDNNMatmulFuseOp(InferencePassTest):
y = fluid.data(
name='y', shape=[-1] + self.shape_y, dtype=self.d_type
)
out = fluid.layers.matmul(x, y)
out = paddle.matmul(x, y)
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]])
......@@ -79,7 +79,7 @@ class TestMKLDNNMatmulOpNotFusedWrongTransposeAxis(TestMKLDNNMatmulFuseOp):
y = fluid.data(
name='y', shape=[-1] + self.shape_y, dtype=self.d_type
)
out = fluid.layers.matmul(x, y)
out = paddle.matmul(x, y)
out = paddle.transpose(out, perm=[0, 1, 2, 3])
out = paddle.reshape(out, [0, 0, 0, 0])
out = fluid.layers.fc(out, size=1)
......@@ -102,7 +102,7 @@ class TestMKLDNNMatmulOpNotFusedBreakPattern(TestMKLDNNMatmulFuseOp):
y = fluid.data(
name='y', shape=[-1] + self.shape_y, dtype=self.d_type
)
out = fluid.layers.matmul(x, y)
out = paddle.matmul(x, y)
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.transpose(out, perm=[0, 1, 2, 3]) # breaks pattern
out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]])
......
......@@ -30,13 +30,13 @@ class TensorRTInspectorTest(InferencePassTest):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(name="data", shape=[1, 16, 16], dtype="float32")
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=data,
y=data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
self.feeds = {
......
......@@ -17,6 +17,7 @@ import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig, PassVersionChecker
......@@ -27,13 +28,13 @@ class TensorRTMatMulDims2Test(InferencePassTest):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(name="data", shape=[24, 24], dtype="float32")
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=data,
y=data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
self.feeds = {
......@@ -66,13 +67,13 @@ class TensorRTMatMulTest(InferencePassTest):
data = fluid.data(
name="data", shape=[-1, 6, 24, 24], dtype="float32"
)
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=data,
y=data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
self.feeds = {
......@@ -128,13 +129,13 @@ class TensorRTMatMulBroadcastTest(InferencePassTest):
name="data_x", shape=[-1, 6, 24], dtype="float32"
)
data_y = fluid.data(name="data_y", shape=[24, 16], dtype="float32")
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=data_x,
y=data_y,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
self.feeds = {
......
......@@ -32,13 +32,13 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
name='data', shape=[1, 28, 28], dtype='float32'
)
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=self.data,
y=self.data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
fc_out = fluid.layers.fc(
input=matmul_out,
size=10,
......@@ -128,13 +128,13 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
)
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14])
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=reshape_out,
y=reshape_out,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
fc_out = fluid.layers.fc(
input=matmul_out,
......@@ -224,13 +224,13 @@ class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
name='data', shape=[-1, 28, 28], dtype='float32'
)
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
matmul_out = fluid.layers.matmul(
matmul_out = paddle.matmul(
x=self.data,
y=self.data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
fc_out = fluid.layers.fc(
input=matmul_out,
......
......@@ -48,7 +48,7 @@ def check():
a = fluid.dygraph.to_variable(a_np)
b = fluid.dygraph.to_variable(b_np)
y = paddle.add(x=a, y=b)
y = fluid.layers.matmul(x=y, y=b, transpose_y=True)
y = paddle.matmul(x=y, y=b, transpose_y=True)
res1 = func(y)
np_res = np.add(a_np, b_np)
......
......@@ -87,14 +87,14 @@ class TestFlagsUseMkldnn(unittest.TestCase):
assert self.not_found(self.matmul_regex, out, err)
def test_flags_use_mkl_dnn_off(self):
env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul")}
env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2")}
out, err = self.flags_use_mkl_dnn_common(env)
assert self.found(self.relu_regex, out, err)
assert self.found(self.ew_add_regex, out, err)
assert self.not_found(self.matmul_regex, out, err)
def test_flags_use_mkl_dnn_off_multiple(self):
env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul,relu")}
env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2,relu")}
out, err = self.flags_use_mkl_dnn_common(env)
assert self.not_found(self.relu_regex, out, err)
assert self.found(self.ew_add_regex, out, err)
......@@ -103,7 +103,7 @@ class TestFlagsUseMkldnn(unittest.TestCase):
def test_flags_use_mkl_dnn_on_off(self):
env = {
str("FLAGS_tracer_mkldnn_ops_on"): str("elementwise_add"),
str("FLAGS_tracer_mkldnn_ops_off"): str("matmul"),
str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2"),
}
out, err = self.flags_use_mkl_dnn_common(env)
assert self.not_found(self.relu_regex, out, err)
......
......@@ -65,7 +65,7 @@ class SimpleNet(fluid.Layer):
def forward(self, input, label):
x_emb = self.embedding(input)
fc = fluid.layers.matmul(x_emb, self.softmax_weight)
fc = paddle.matmul(x_emb, self.softmax_weight)
fc = paddle.add(fc, self.softmax_bias)
projection = paddle.reshape(fc, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -24,7 +24,6 @@ import paddle.utils as utils
from paddle.distributed.auto_parallel.completion import Completer
from paddle.distributed.auto_parallel.dist_context import DistributedContext
from paddle.distributed.fleet import auto
from paddle.fluid import layers
paddle.enable_static()
_global_parallel_strategy = None
......@@ -301,9 +300,8 @@ class AttentionLayer(nn.Layer):
v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if self.attn_mask is not None:
product = product + self.attn_mask
......@@ -568,9 +566,8 @@ class DecoderLayer(nn.Layer):
v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if self.attn_mask is not None:
product = product + self.attn_mask
......
......@@ -210,9 +210,8 @@ class MultiHeadAttention(nn.Layer):
query, key, value, use_cache, cache
)
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
product = product + attn_mask
......
......@@ -28,7 +28,6 @@ from paddle.distributed.auto_parallel.partitioner import Partitioner
from paddle.distributed.auto_parallel.process_group import new_process_group
from paddle.distributed.auto_parallel.utils import _get_comm_group
from paddle.distributed.fleet import auto
from paddle.fluid import layers
paddle.enable_static()
_global_parallel_strategy = None
......@@ -695,9 +694,8 @@ class AttentionLayer(nn.Layer):
v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if self.attn_mask is not None:
product = product + self.attn_mask
......@@ -868,7 +866,8 @@ class TestAttentionAutoPartitioner(unittest.TestCase):
'transpose2',
'reshape2',
'transpose2',
'matmul',
'matmul_v2',
"scale",
'softmax',
'dropout',
'matmul_v2',
......@@ -976,7 +975,8 @@ class TestAttentionAutoPartitioner(unittest.TestCase):
'transpose2',
'reshape2',
'transpose2',
'matmul',
'matmul_v2',
"scale",
'softmax',
'dropout',
'matmul_v2',
......@@ -1166,9 +1166,8 @@ class DecoderLayer(nn.Layer):
v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if self.attn_mask is not None:
product = product + self.attn_mask
......@@ -1347,7 +1346,8 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
'transpose2',
'reshape2',
'transpose2',
'matmul',
'matmul_v2',
"scale",
'softmax',
'dropout',
'matmul_v2',
......@@ -1399,15 +1399,15 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
distributed_attr_check_for_program(dist_main_prog, dist_context)
)
# check distribured attr
serial_op_idx = [0, 5, 9, 11, 23, 28, 31]
serial_op_idx = [0, 5, 9, 11, 24, 29, 32]
dist_op_idx = [
[0, 1],
[6, 7],
[11, 12],
[14, 15],
[27, 28],
[33, 34],
[37, 38],
[28, 29],
[34, 35],
[38, 39],
]
self.assertTrue(
distributed_attr_check_for_dist_op(
......@@ -1500,7 +1500,8 @@ class TestDecoderLayerPartitioner(unittest.TestCase):
'transpose2',
'reshape2',
'transpose2',
'matmul',
'matmul_v2',
"scale",
'softmax',
'dropout',
'matmul_v2',
......
......@@ -256,9 +256,8 @@ class MultiHeadAttention(nn.Layer):
query, key, value, use_cache, cache
)
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5
)
product = tensor.matmul(x=q, y=k, transpose_y=True)
product = tensor.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
product = product + attn_mask
......
......@@ -103,6 +103,7 @@ def mlp_forward(train_program, start_program):
return loss, train_program, start_program
@unittest.skipIf(True, "to delete later")
class TestCompatible(unittest.TestCase):
def test_matmulv2_matmul_2_compatible(self):
valid_op_dist_attr_list = []
......
......@@ -26,7 +26,6 @@ from paddle.distributed.auto_parallel.dist_op import DistributedOperator
from paddle.distributed.auto_parallel.operators.common import (
get_distributed_operator_impl_container,
)
from paddle.fluid import layers
paddle.enable_static()
device = "gpu" if core.is_compiled_with_cuda() else "cpu"
......@@ -85,7 +84,7 @@ def mlp_forward(train_program, start_program):
shape=[hidden_size, hidden_size],
dtype='float32',
)
input = layers.matmul(x=input, y=matmulinput)
input = paddle.matmul(x=input, y=matmulinput)
label = static.data(
name="label", shape=[batch_size, 1], dtype='float32'
)
......
......@@ -22,7 +22,6 @@ from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
@skip_check_grad_ci(
......@@ -77,7 +76,7 @@ class TestCholeskyOp(OpTest):
dtype=root_data.dtype, shape=root_data.shape
)
root_t = paddle.transpose(root, self.trans_dims)
x = layers.matmul(x=root, y=root_t) + 1e-05
x = paddle.matmul(x=root, y=root_t) + 1e-05
out = paddle.cholesky(x, upper=self.attrs["upper"])
grad_check(root, out, x_init=root_data, place=place)
......
......@@ -414,9 +414,7 @@ class TestFakeInit(TranspilerTest):
input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w_re, transpose_y=True
)
neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec)
# nce loss
......
......@@ -167,7 +167,7 @@ def lm_model(
bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i = paddle.slice(
......@@ -291,7 +291,7 @@ def lm_model(
bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
......@@ -459,7 +459,7 @@ def lm_model(
),
)
projection = layers.matmul(rnn_out, softmax_weight)
projection = paddle.matmul(rnn_out, softmax_weight)
projection = paddle.add(projection, softmax_bias)
projection = paddle.reshape(projection, shape=[-1, vocab_size])
......
......@@ -21,7 +21,6 @@ import paddle
import paddle.incubate.nn.functional as incubate_f
import paddle.nn.functional as F
from paddle import tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout, Linear
from paddle.nn.layer.norm import LayerNorm
......@@ -192,9 +191,8 @@ class TestFusedAttentionOp(OpTest):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul(
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5
)
qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
if attn_mask is not None:
attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
......
......@@ -19,7 +19,6 @@ import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import _legacy_C_ops, tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout
from paddle.nn.layer.norm import LayerNorm
......@@ -388,9 +387,8 @@ class TestFusedMultiTransformerInt8Op(unittest.TestCase):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul(
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5
)
qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
if self.debug:
print('qk out is')
......
......@@ -281,9 +281,8 @@ class TestFusedMultiTransformerOp(OpTest):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul(
x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5
)
qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True)
qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5)
if self.debug:
print('qk out is')
......
......@@ -1001,7 +1001,7 @@ class TestDygraphGuardWithError(unittest.TestCase):
with self.assertRaisesRegexp(
TypeError, "Please use `with fluid.dygraph.guard()"
):
y = fluid.layers.matmul(x, x)
y = paddle.matmul(x, x)
def test_without_guard(self):
with _test_eager_guard():
......
......@@ -46,9 +46,9 @@ class GraphConv(fluid.Layer):
)
def forward(self, features, adj):
support = fluid.layers.matmul(features, self.weight)
support = paddle.matmul(features, self.weight)
# TODO(panyx0718): sparse matmul?
return fluid.layers.matmul(adj, support) + self.bias
return paddle.matmul(adj, support) + self.bias
class GCN(fluid.Layer):
......
......@@ -64,7 +64,7 @@ class SimpleNet(fluid.Layer):
def forward(self, input, label):
x_emb = self.embedding(input)
projection = fluid.layers.matmul(
projection = paddle.matmul(
x_emb, paddle.transpose(self.embedding.weight, perm=[1, 0])
)
projection = paddle.add(projection, self.softmax_bias)
......
......@@ -109,7 +109,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split(
......@@ -225,7 +225,7 @@ class PtbModel(fluid.Layer):
rnn_out = paddle.reshape(
rnn_out, shape=[-1, self.num_steps, self.hidden_size]
)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -104,7 +104,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split(
......@@ -221,7 +221,7 @@ class PtbModel(fluid.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size]
)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -105,7 +105,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split(
......@@ -222,7 +222,7 @@ class PtbModel(fluid.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size]
)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -72,9 +72,9 @@ class SimpleNet(fluid.Layer):
def forward(self, input, label):
x_emb = self.embedding(input)
fc = fluid.layers.matmul(x_emb, self.softmax_weight)
fc = paddle.matmul(x_emb, self.softmax_weight)
fc = paddle.add(fc, self.softmax_bias)
projection = fluid.layers.matmul(
projection = paddle.matmul(
fc, paddle.transpose(self.embedding.weight, perm=[1, 0])
)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
......
......@@ -495,12 +495,12 @@ class MultiHeadAttentionLayer(Layer):
transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention
product = fluid.layers.matmul(
product = paddle.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5,
)
product = paddle.scale(product, scale=self._d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = paddle.nn.functional.softmax(product)
......@@ -511,9 +511,9 @@ class MultiHeadAttentionLayer(Layer):
seed=ModelHyperParams.dropout_seed,
is_test=False,
)
out = fluid.layers.matmul(weights_droped, transpose_v)
out = paddle.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
out = paddle.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
......@@ -1003,7 +1003,7 @@ class WrapDecoderLayer(Layer):
)
if self._weight_sharing:
predict = fluid.layers.matmul(
predict = paddle.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb.weight,
transpose_y=True,
......
......@@ -290,7 +290,7 @@ class TestLayer(LayerTest):
with self.static_graph():
t = layers.data(name='t', shape=[3, 3], dtype='float32')
t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
ret = layers.matmul(t, t2)
ret = paddle.matmul(t, t2)
static_ret = self.get_static_graph_result(
feed={
't': np.ones([3, 3], dtype='float32'),
......@@ -303,14 +303,14 @@ class TestLayer(LayerTest):
with _test_eager_guard():
t = np.ones([3, 3], dtype='float32')
t2 = np.ones([3, 3], dtype='float32')
dy_eager_ret = layers.matmul(
dy_eager_ret = paddle.matmul(
base.to_variable(t), base.to_variable(t2)
)
dy_eager_ret_value = dy_eager_ret.numpy()
t = np.ones([3, 3], dtype='float32')
t2 = np.ones([3, 3], dtype='float32')
dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
dy_ret = paddle.matmul(base.to_variable(t), base.to_variable(t2))
dy_ret_value = dy_ret.numpy()
np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05)
......
......@@ -19,7 +19,6 @@ from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
......@@ -117,151 +116,6 @@ class Generator:
)
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The inputs type of matmul_op must be Variable.
input1 = 12
self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
# The inputs dtype of matmul_op must be float32, float64.
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32"
)
self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16"
)
fluid.layers.matmul(input3, input3)
# Negative dimension generation
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)]
)
return shapes
# Build program with inputs sizes that contain negative numbers
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype("float32")
Y = np.random.random(obj.shape_Y).astype("float32")
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype='float32')
y = fluid.data(name='y', shape=shape_y, dtype='float32')
output = fluid.layers.matmul(
x, y, obj.transpose_X, obj.transpose_Y
)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.CPUPlace())
(res,) = exe.run(
fluid.default_main_program(),
feed={'x': X, 'y': Y},
fetch_list=[output],
)
np.allclose(res, Ref, atol=1e-5)
# Generate program api cases for all negative possibilities
def api_test(dim_x, dim_y, trans_x, trans_y):
test_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y
)
shape_x, shape_y = generate_compatible_shapes(
dim_x, dim_y, trans_x, trans_y
)
globals()[test_name] = type(
test_name,
(unittest.TestCase,),
{
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
'test_propram': test_negative_dims_program,
},
)
# Generate operators cases for all possibilities
def inject_test(dim_x, dim_y, trans_x, trans_y):
test_name = 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y
)
shape_x, shape_y = generate_compatible_shapes(
dim_x, dim_y, trans_x, trans_y
)
globals()[test_name] = type(
test_name,
(Generator, OpTest),
{
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
},
)
for dim_X in (1, 2, 3):
for dim_Y in (1, 2, 3):
for transose_x in (False, True):
for transose_y in (False, True):
inject_test(dim_X, dim_Y, transose_x, transose_y)
api_test(dim_X, dim_Y, transose_x, transose_y)
# Test case more batch_size and N, M, K
def generate_compatible_shapes_batch(
dim_X, dim_Y, transpose_X, transpose_Y, batch_size
):
BATCH_SIZE = 2
M = 3
N = 4
K = 5
if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
K = 1
if dim_X == 1:
if transpose_X:
shape_X = [M]
else:
shape_X = [K]
if dim_Y == 1:
if transpose_Y:
shape_Y = [N]
else:
shape_Y = [K]
if dim_X >= 2:
if transpose_X:
shape_X = [K, M]
else:
shape_X = [M, K]
if dim_X == 3:
shape_X = [BATCH_SIZE] + shape_X
if dim_Y >= 2:
if transpose_Y:
shape_Y = [N, K]
else:
shape_Y = [K, N]
if dim_Y == 3:
shape_Y = [BATCH_SIZE] + shape_Y
return shape_X, shape_Y
# Test case n-dim
def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y):
M = 2
......
......@@ -94,7 +94,7 @@ class TestMatmulDoubleGradCheck(unittest.TestCase):
y = paddle.create_parameter(
dtype=typename, shape=self.y_shape, name='y'
)
out = layers.matmul(
out = paddle.matmul(
x, y, self.transpose_x, self.transpose_y, name='out'
)
......
......@@ -616,13 +616,13 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
rnn = layers.StaticRNN()
def dot_attention(query, memory):
attn = layers.matmul(query, memory, transpose_y=True)
attn = paddle.matmul(query, memory, transpose_y=True)
weight = paddle.nn.functional.softmax(attn)
weight_memory = layers.matmul(weight, memory)
weight_memory = paddle.matmul(weight, memory)
return weight_memory, weight
y = layers.matmul(emb, w1)
y = paddle.matmul(emb, w1)
with rnn.step():
pre_h = rnn.memory(
shape=(self.sent_len, self.input_dim),
......@@ -631,7 +631,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
)
step_in = rnn.step_input(x)
concat_in = layers.concat([step_in, pre_h], 1)
new_h = layers.matmul(concat_in, w2)
new_h = paddle.matmul(concat_in, w2)
new_h = layers.unsqueeze(new_h, [1])
new_h, _ = dot_attention(new_h, y)
new_h = paddle.squeeze(new_h, [1])
......
......@@ -71,14 +71,14 @@ class DecoderCell(layers.RNNCell):
query = layers.fc(
hidden, size=encoder_output.shape[-1], bias_attr=False
)
attn_scores = layers.matmul(
attn_scores = paddle.matmul(
layers.unsqueeze(query, [1]), encoder_output, transpose_y=True
)
if encoder_padding_mask is not None:
attn_scores = paddle.add(attn_scores, encoder_padding_mask)
attn_scores = paddle.nn.functional.softmax(attn_scores)
attn_out = paddle.squeeze(
layers.matmul(attn_scores, encoder_output), [1]
paddle.matmul(attn_scores, encoder_output), [1]
)
attn_out = layers.concat([attn_out, hidden], 1)
attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False)
......
......@@ -115,7 +115,7 @@ class SimpleLSTMRNN(fluid.Layer):
bias = self.bias_arr[k]
nn = fluid.layers.concat([self._input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = paddle.matmul(x=nn, y=weight_1)
gate_input = paddle.add(gate_input, bias)
i, j, f, o = fluid.layers.split(
......@@ -234,7 +234,7 @@ class PtbModel(fluid.Layer):
rnn_out = paddle.reshape(
rnn_out, shape=[-1, self.num_steps, self.hidden_size]
)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = paddle.matmul(rnn_out, self.softmax_weight)
projection = paddle.add(projection, self.softmax_bias)
projection = paddle.reshape(projection, shape=[-1, self.vocab_size])
loss = paddle.nn.functional.softmax_with_cross_entropy(
......
......@@ -163,13 +163,13 @@ def multi_head_attention(
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax(paddle.add(x=product, y=attn_bias))
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False
)
out = layers.matmul(weights, v)
out = paddle.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
......
......@@ -31,7 +31,6 @@ import paddle
import paddle.incubate.nn.functional as incubate_f
import paddle.nn.functional as F
from paddle import tensor
from paddle.fluid import layers
from paddle.fluid.framework import default_main_program
from paddle.nn.layer.common import Dropout, Linear
from paddle.nn.layer.norm import LayerNorm
......@@ -164,7 +163,7 @@ class XPUTestFusedAttentionOp(XPUOpTestWrapper):
# [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
# --> [B, n_head, seq_len, out_seq_len]
qk_out = layers.matmul(
qk_out = tensor.matmul(
x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True
)
......
......@@ -27,7 +27,6 @@ from xpu.get_test_cover_info import (
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
......@@ -135,71 +134,11 @@ def generate_compatible_shapes_2(dim, transpose_X, transpose_Y):
return shape_X, shape_Y
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)]
)
return shapes
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype(obj.in_type)
Y = np.random.random(obj.shape_Y).astype(obj.in_type)
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype=obj.in_type_str)
y = fluid.data(name='y', shape=shape_y, dtype=obj.in_type_str)
output = fluid.layers.matmul(
x, y, obj.transpose_X, obj.transpose_Y
)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.XPUPlace(0))
(res,) = exe.run(
fluid.default_main_program(),
feed={'x': X, 'y': Y},
fetch_list=[output],
)
np.allclose(res, Ref, atol=1e-3)
class XPUTestMatmulOpErr(XPUOpTestWrapper):
def __init__(self):
self.op_name = "matmul"
self.use_dynamic_create_class = False
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The inputs type of matmul_op must be Variable.
input1 = 12
self.assertRaises(
TypeError, fluid.layers.matmul, input1, input1
)
# The inputs dtype of matmul_op must be float32, float16
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32"
)
self.assertRaises(
TypeError, fluid.layers.matmul, input2, input2
)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16"
)
fluid.layers.matmul(input3, input3)
class API_TestMm(unittest.TestCase):
def test_out(self):
with fluid.program_guard(fluid.Program()):
......@@ -399,39 +338,6 @@ class XPUTestMatmulOp1(XPUOpTestWrapper):
return base_class, classes
class XPUTestMatmulOp2(XPUOpTestWrapper):
def __init__(self):
self.op_name = "matmul"
self.use_dynamic_create_class = True
def dynamic_create_class(self):
base_class = unittest.TestCase
classes = []
xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]]
batch_size = [2, 4, 5, 10, 50, 100, 300]
for dims in xpu_support_dims_list:
dim_X = dims[0]
dim_Y = dims[1]
for transose_x in [True, False]:
for transose_y in [True, False]:
for batch in batch_size:
class_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.format(
dim_X, dim_Y, transose_x, transose_y, batch
)
shape_x, shape_y = generate_compatible_shapes(
dim_X, dim_Y, transose_x, transose_y, batch
)
attr_dict = {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': transose_x,
'transpose_Y': transose_y,
'test_propram': test_negative_dims_program,
}
classes.append([class_name, attr_dict])
return base_class, classes
class XPUTestMatmulOp3(XPUOpTestWrapper):
def __init__(self):
self.op_name = "matmul"
......@@ -464,7 +370,6 @@ support_types = get_xpu_op_support_types('matmul')
for stype in support_types:
create_test_class(globals(), XPUTestMatmulOpErr, stype)
create_test_class(globals(), XPUTestMatmulOp1, stype)
create_test_class(globals(), XPUTestMatmulOp2, stype)
create_test_class(globals(), XPUTestMatmulOp3, stype)
if __name__ == "__main__":
......