Unverified · Commit 970db874 authored by 201716010711, committed by GitHub

transfer scale api (#48356)

Parent 128ef1ae
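In short: call sites across Paddle move from the deprecated `fluid.layers.scale` (and its `nn.scale` / `layers.scale` aliases) to the public `paddle.scale` API, and the `fluid` implementation is removed. A minimal sketch of the call-site change (tensor values are arbitrary):

```python
import paddle

x = paddle.randn([2, 3], dtype='float32')

# before this commit (removed in the diff below):
#   out = paddle.fluid.layers.scale(x, scale=2.0, bias=1.0)
# after:
out = paddle.scale(x, scale=2.0, bias=1.0)  # Out = scale * x + bias
```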
@@ -36,6 +36,6 @@ from paddle.fluid.data_feeder import (  # noqa: F401
     check_variable_and_dtype,
     convert_dtype,
 )
-from paddle.fluid.layers import fill_constant, utils, scale  # noqa: F401
+from paddle.fluid.layers import fill_constant, utils  # noqa: F401
 from paddle.tensor.layer_function_generator import templatedoc  # noqa: F401
 import paddle.fluid as fluid  # noqa: F401
@@ -522,7 +522,7 @@ class _ProgramHolder:
         with framework.program_guard(program):
             for i, out in enumerate(self._output_descs):
                 var = program.global_block().var(out.name())
-                var = nn.scale(
+                var = paddle.scale(
                     var, 1.0, name="translated_layer/scale_{}".format(i)
                 )
                 scale_output_vars.append(var)
...
@@ -189,6 +189,7 @@ class ListenAndServ:
         .. code-block:: python

             import paddle.fluid as fluid
+            import paddle
             with fluid.program_guard(main):
                 serv = layers.ListenAndServ(
                     "127.0.0.1:6170", ["X"], optimizer_mode=False)
@@ -199,7 +200,7 @@ class ListenAndServ:
                         name="X",
                         append_batch_size=False)
                     fluid.initializer.Constant(value=1.0)(x, main.global_block())
-                    layers.scale(x=x, scale=10.0, out=out_var)
+                    paddle.scale(x=x, scale=10.0, out=out_var)
             exe = fluid.Executor(place)
             exe.run(main)
...
@@ -113,7 +113,6 @@ __all__ = [
     'flatten',
     'unique',
     'unique_with_counts',
-    'scale',
     'elementwise_add',
     'elementwise_div',
     'elementwise_sub',
@@ -7924,103 +7923,6 @@ def _elementwise_op(helper):
     return helper.append_activation(out)
-def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
-    """
-    Putting scale and bias to the input Tensor as following:
-
-    ``bias_after_scale`` is True:
-
-    .. math::
-        Out=scale*X+bias
-
-    ``bias_after_scale`` is False:
-
-    .. math::
-        Out=scale*(X+bias)
-
-    Args:
-        x(Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8.
-        scale(float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32.
-        bias(float): The bias to be put on the input.
-        bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
-        act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
-        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
-
-    Returns:
-        Tensor: Output tensor of scale operator, with shape and data type same as input.
-
-    Examples:
-        .. code-block:: python
-
-            # scale as a float32 number
-            import paddle
-
-            data = paddle.randn(shape=[2,3], dtype='float32')
-            res = paddle.scale(data, scale=2.0, bias=1.0)
-
-        .. code-block:: python
-
-            # scale with parameter scale as a Tensor
-            import paddle
-
-            data = paddle.randn(shape=[2, 3], dtype='float32')
-            factor = paddle.to_tensor([2], dtype='float32')
-            res = paddle.scale(data, scale=factor, bias=1.0)
-    """
-
-    if in_dygraph_mode():
-        out = _C_ops.scale(x, scale, float(bias), bias_after_scale)
-        return dygraph_utils._append_activation_in_dygraph(out)
-    if _non_static_mode():
-        _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale
-        out = _legacy_C_ops.scale(
-            x,
-            'scale',
-            float(_scale),
-            'bias',
-            float(bias),
-            'bias_after_scale',
-            bias_after_scale,
-        )
-        return dygraph_utils._append_activation_in_dygraph(out)
-
-    check_variable_and_dtype(
-        x,
-        "x",
-        [
-            'float16',
-            'uint16',
-            'float32',
-            'float64',
-            'int8',
-            'int16',
-            'int32',
-            'int64',
-            'uint8',
-        ],
-        "scale",
-    )
-    inputs = {'X': [x]}
-    attrs = {
-        'bias': float(bias),
-        'bias_after_scale': bias_after_scale,
-    }
-    if isinstance(scale, Variable):
-        inputs['ScaleTensor'] = [scale]
-    else:
-        attrs['scale'] = float(scale)
-    helper = LayerHelper('scale', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs
-    )
-    return helper.append_activation(out)
 def elementwise_add(x, y, axis=-1, act=None, name=None):
     """
...
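For reference, the removed helper's semantics survive unchanged in `paddle.scale`. A minimal sketch based on the docstring above (input values are arbitrary):

```python
import paddle

data = paddle.randn([2, 3], dtype='float32')
out1 = paddle.scale(data, scale=2.0, bias=1.0)                          # 2 * x + 1
out2 = paddle.scale(data, scale=2.0, bias=1.0, bias_after_scale=False)  # 2 * (x + 1)
```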
@@ -620,7 +620,7 @@ def scaled_dot_product_attention(
     v = __split_heads(v, num_heads)

     key_dim_per_head = keys.shape[-1] // num_heads
-    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
+    scaled_q = paddle.scale(x=q, scale=key_dim_per_head**-0.5)
     product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
     x = paddle.reshape(x=product, shape=[-1, product.shape[-1]])
...
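The call above is the query scaling of scaled dot-product attention. A hedged sketch of the same pattern with assumed shapes (batch 2, 4 heads, sequence length 8):

```python
import paddle
import paddle.nn.functional as F

num_heads, key_dim_per_head = 4, 16
q = paddle.randn([2, num_heads, 8, key_dim_per_head])
k = paddle.randn([2, num_heads, 8, key_dim_per_head])

scaled_q = paddle.scale(q, scale=key_dim_per_head**-0.5)  # q / sqrt(d_k)
logits = paddle.matmul(scaled_q, k, transpose_y=True)     # [2, 4, 8, 8]
weights = F.softmax(logits)                               # attention weights
```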
@@ -162,7 +162,7 @@ def model():
     # need cos sim
     inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
-    scale_infer = layers.scale(x=inference, scale=5.0)
+    scale_infer = paddle.scale(x=inference, scale=5.0)

     label = layers.data(name='score', shape=[1], dtype='float32')
     square_cost = layers.square_error_cost(input=scale_infer, label=label)
...
@@ -537,7 +537,7 @@ class PrepareEncoderDecoderLayer(Layer):
     def forward(self, src_word, src_pos):
         src_word_emb = self._input_emb(src_word)
-        src_word_emb = fluid.layers.scale(
+        src_word_emb = paddle.scale(
             x=src_word_emb, scale=self._src_emb_dim**0.5
         )
         # # TODO change this to fit dynamic length input
...
@@ -1173,7 +1173,7 @@ def multi_head_attention(
         """
        Scaled Dot-Product Attention
         """
-        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
+        scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
         product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
         if attn_bias:
             product += attn_bias
@@ -1305,7 +1305,7 @@ def prepare_encoder(
         ),
     )
-    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
+    src_word_emb = paddle.scale(x=src_word_emb, scale=src_emb_dim**0.5)
     src_pos_enc = layers.embedding(
         src_pos,
         size=[src_max_len, src_emb_dim],
...
@@ -276,7 +276,7 @@ class BertModelLayer(Layer):
         self_attn_mask = fluid.layers.matmul(
             x=input_mask, y=input_mask, transpose_y=True
         )
-        self_attn_mask = fluid.layers.scale(
+        self_attn_mask = paddle.scale(
             x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False
         )
         n_head_self_attn_mask = paddle.stack(
...
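Because `bias_after_scale=False` computes `scale * (x + bias)`, the call above turns a {0, 1} pairwise mask into a {-10000, 0} additive attention bias. A small sketch (the 3-token mask is an assumption for illustration):

```python
import paddle

input_mask = paddle.to_tensor([[1.0], [1.0], [0.0]])  # 1 = real token, 0 = padding
pair_mask = paddle.matmul(input_mask, input_mask, transpose_y=True)
attn_bias = paddle.scale(
    pair_mask, scale=10000.0, bias=-1.0, bias_after_scale=False
)
# attn_bias is 0 where both tokens are real, -10000 where padding is involved
```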
@@ -342,7 +342,7 @@ class WrapEncoder(Layer):
     def forward(self, src_word, src_pos, src_slf_attn_bias):
         word_emb = self.word_embedder(src_word)
-        word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5)
+        word_emb = paddle.scale(x=word_emb, scale=self.emb_dim**0.5)
         pos_enc = self.pos_encoder(src_pos)
         pos_enc.stop_gradient = True
         emb = word_emb + pos_enc
@@ -546,7 +546,7 @@ class WrapDecoder(Layer):
         caches=None,
     ):
         word_emb = self.word_embedder(trg_word)
-        word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5)
+        word_emb = paddle.scale(x=word_emb, scale=self.emb_dim**0.5)
         pos_enc = self.pos_encoder(trg_pos)
         pos_enc.stop_gradient = True
         emb = word_emb + pos_enc
...
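The `emb_dim**0.5` factor above is the usual Transformer trick of scaling token embeddings by the square root of the embedding size. A hedged sketch of that pattern (vocabulary size and dimensions are assumptions):

```python
import paddle

vocab_size, emb_dim = 10000, 512
embedder = paddle.nn.Embedding(vocab_size, emb_dim)

word_ids = paddle.to_tensor([[1, 2, 3]])
word_emb = paddle.scale(embedder(word_ids), scale=emb_dim**0.5)
```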
@@ -55,7 +55,7 @@ class TestBase(IPUOpTest):
         x = paddle.static.data(
             name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32'
         )
-        out = paddle.fluid.layers.scale(x, **self.attrs)
+        out = paddle.scale(x, **self.attrs)
         self.fetch_list = [out.name]

     def run_model(self, exec_mode):
@@ -126,7 +126,7 @@ class TestCase5(TestBase):
         y = paddle.static.data(
             name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32'
         )
-        out = paddle.fluid.layers.scale(x, scale=y, **self.attrs)
+        out = paddle.scale(x, scale=y, **self.attrs)
         self.fetch_list = [out.name]
...
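`TestCase5` above exercises a tensor-valued scale factor: `scale` may be a 1-element float32 Tensor rather than a Python float. A minimal sketch outside the IPU harness:

```python
import paddle

x = paddle.randn([2, 3], dtype='float32')
factor = paddle.to_tensor([2.0], dtype='float32')  # scale supplied as a Tensor
out = paddle.scale(x, scale=factor, bias=0.5)
```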
@@ -62,9 +62,9 @@ class TestBase(IPUOpTest):
         add1 = paddle.fluid.layers.elementwise_add(x, x)
         reshape = paddle.reshape(add1, **self.attrs)
         add2 = paddle.fluid.layers.elementwise_add(reshape, reshape)
-        scale1 = paddle.fluid.layers.scale(add2)
-        scale2 = paddle.fluid.layers.scale(scale1, scale=1.3, bias=0.5)
-        scale3 = paddle.fluid.layers.scale(scale2, scale=2, bias=0.7)
+        scale1 = paddle.scale(add2)
+        scale2 = paddle.scale(scale1, scale=1.3, bias=0.5)
+        scale3 = paddle.scale(scale2, scale=2, bias=0.7)
         fetch_list = [scale3.name]
...
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from inference_pass_test import InferencePassTest

+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import AnalysisConfig, PassVersionChecker
@@ -39,7 +40,7 @@ class TRTScaleTest(InferencePassTest):
         self.fetch_list = [out]

     def append_scale(self, data):
-        return fluid.layers.scale(
+        return paddle.scale(
             x=data, scale=2.0, bias=-1.0, bias_after_scale=False
         )
@@ -71,7 +72,7 @@ class TRTScaleShape2Test(InferencePassTest):
         self.fetch_list = [out]

     def append_scale(self, data):
-        return fluid.layers.scale(
+        return paddle.scale(
             x=data, scale=2.0, bias=-1.0, bias_after_scale=False
         )
...
@@ -207,7 +207,7 @@ class FusionGroupPassFillConstantTest(FusionGroupPassTest):
         tmp_0 = layers.elementwise_add(self.feed_vars[0], self.feed_vars[1])
         tmp_1 = layers.fill_constant(shape=[2, 2], dtype=dtype, value=2.0)
-        tmp_2 = layers.scale(
+        tmp_2 = paddle.scale(
             tmp_1, scale=3.0, bias=1.0, bias_after_scale=True
         )
         tmp_3 = layers.elementwise_mul(tmp_2, tmp_0)
...
@@ -131,7 +131,7 @@ class TestScaleOpSelectedRows(unittest.TestCase):
 class TestScaleRaiseError(unittest.TestCase):
     def test_errors(self):
         def test_type():
-            fluid.layers.scale([10])
+            paddle.scale([10])

         self.assertRaises(TypeError, test_type)
...
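A sketch of the behavior this test pins down, assuming static graph mode where the dtype check runs (`paddle.scale` rejects a plain Python list):

```python
import paddle

paddle.enable_static()
try:
    paddle.scale([10])  # not a Tensor/Variable
except TypeError as err:
    print("rejected as expected:", err)
```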
@@ -80,7 +80,7 @@ class TestArrayReadWrite(unittest.TestCase):
             self.assertEqual(outs[0], outs[1])

             total_sum = layers.sums(input=[a_sum, x_sum])
-            total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
+            total_sum_scaled = paddle.scale(x=total_sum, scale=1 / 6.0)

             append_backward(total_sum_scaled)
@@ -117,7 +117,7 @@ class TestArrayReadWrite(unittest.TestCase):
             total_sum_dygraph = layers.sums(
                 input=[a_sum_dygraph, x_sum_dygraph]
             )
-            total_sum_scaled_dygraph = layers.scale(
+            total_sum_scaled_dygraph = paddle.scale(
                 x=total_sum_dygraph, scale=1 / 6.0
             )
             total_sum_scaled_dygraph.backward()
...
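The dygraph branch above relies on `paddle.scale` being differentiable like any other linear op; a minimal sketch of that property (values are arbitrary):

```python
import paddle

x = paddle.ones([6], dtype='float32')
x.stop_gradient = False
loss = paddle.scale(x, scale=1 / 6.0).sum()
loss.backward()
print(x.grad)  # every element's gradient is 1/6, the scale factor
```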
@@ -29,6 +29,7 @@ import paddle.fluid.layers.ops as ops
 from dist_test_utils import remove_ps_flag
 from paddle.fluid import core

+import paddle
 RPC_OP_ROLE_ATTR_NAME = (
     op_role_attr_name
@@ -150,7 +151,7 @@ class TestSendOp(unittest.TestCase):
                 append_batch_size=False,
             )
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
-            o = layers.scale(x=x, scale=10.0)
+            o = paddle.scale(x=x, scale=10.0)
         exe = fluid.Executor(place)
         self.local_out = exe.run(main, fetch_list=[o])
...
@@ -155,7 +155,7 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase):
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)

-            h = layers.scale(
+            h = paddle.scale(
                 x=layers.elementwise_add(x=h_pre, y=x_t),
                 scale=self.py_rnn.scale,
             )
@@ -431,8 +431,8 @@ class EagerDeletionRecurrentOpMultipleMemoryTest(EagerDeletionRecurrentOpTest1):
             h_pre2 = rnn.memory(init=h_boot2)
             x_t = rnn.step_input(x)

-            mem1 = layers.scale(x=h_pre1, scale=1.0)
-            mem2 = layers.scale(x=h_pre2, scale=1.0)
+            mem1 = paddle.scale(x=h_pre1, scale=1.0)
+            mem2 = paddle.scale(x=h_pre2, scale=1.0)
             out = layers.sums(input=[mem1, x_t, mem2])
             rnn.update_memory(h_pre1, mem1)
@@ -691,7 +691,7 @@ class EagerDeletionFarwardOnlyRnnAndBackwardRnnTest(
             h_pre = forward_only_rnn.memory(init=h_boot)
             x_t = forward_only_rnn.step_input(x)

-            h = layers.scale(
+            h = paddle.scale(
                 x=layers.elementwise_add(x=h_pre, y=x_t),
                 scale=self.py_rnn.scale,
             )
@@ -707,7 +707,7 @@ class EagerDeletionFarwardOnlyRnnAndBackwardRnnTest(
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)

-            h = layers.scale(
+            h = paddle.scale(
                 x=layers.elementwise_add(x=h_pre, y=x_t),
                 scale=self.py_rnn.scale,
             )
...
@@ -692,7 +692,7 @@ class PrepareEncoderDecoderLayer(Layer):
     def forward(self, src_word, src_pos):
         src_word_emb = self._input_emb(src_word)
-        src_word_emb = fluid.layers.scale(
+        src_word_emb = paddle.scale(
             x=src_word_emb, scale=self._src_emb_dim**0.5
         )
         # # TODO change this to fit dynamic length input
...
@@ -3632,7 +3632,7 @@ class TestBook(LayerTest):
                 dtype='float32',
                 append_batch_size=False,
             )
-            out = layers.scale(input, scale=scale_var)
+            out = paddle.scale(input, scale=scale_var)
             return out

     def make_iou_similarity(self):
...
@@ -151,7 +151,7 @@ class RecurrentOpTest1(unittest.TestCase):
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)

-            h = layers.scale(
+            h = paddle.scale(
                 x=layers.elementwise_add(x=h_pre, y=x_t),
                 scale=self.py_rnn.scale,
             )
@@ -419,8 +419,8 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
             h_pre2 = rnn.memory(init=h_boot2)
             x_t = rnn.step_input(x)

-            mem1 = layers.scale(x=h_pre1, scale=1.0)
-            mem2 = layers.scale(x=h_pre2, scale=1.0)
+            mem1 = paddle.scale(x=h_pre1, scale=1.0)
+            mem2 = paddle.scale(x=h_pre2, scale=1.0)
             out = layers.sums(input=[mem1, x_t, mem2])
             rnn.update_memory(h_pre1, mem1)
...
@@ -133,7 +133,7 @@ class TestScaleOpSelectedRows(unittest.TestCase):
 class TestScaleRaiseError(unittest.TestCase):
     def test_errors(self):
         def test_type():
-            fluid.layers.scale([10])
+            paddle.scale([10])

         self.assertRaises(TypeError, test_type)
...
@@ -161,7 +161,7 @@ def multi_head_attention(
         sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
         return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

-    scaled_q = layers.scale(x=q, scale=d_model**-0.5)
+    scaled_q = paddle.scale(x=q, scale=d_model**-0.5)
     product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
     weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
     if dropout_rate:
...
@@ -26,7 +26,6 @@ from paddle.fluid import (
     CompiledProgram,
     default_main_program,
     Program,
-    layers,
     unique_name,
     program_guard,
 )
@@ -201,7 +200,7 @@ def normalize_program(program, feed_vars, fetch_vars):
     uniq_fetch_vars = []
     for i, var in enumerate(fetch_vars):
         if var.dtype != paddle.bool:
-            var = layers.scale(
+            var = paddle.scale(
                 var, 1.0, name="save_infer_model/scale_{}".format(i)
             )
             uniq_fetch_vars.append(var)
...
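`normalize_program` scales by 1.0 only to clone each fetch variable under a predictable name before saving the inference program. A hedged static-graph sketch of that identity-copy trick (the variable names are illustrative):

```python
import paddle

paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
    # identity copy; only the output name changes
    y = paddle.scale(x, 1.0, name='save_infer_model/scale_0')
```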
@@ -52,7 +52,7 @@ __inplace_unary_func__ = [
 __all__ = []

 # It is a hot fix in some unittest using:
-#   fluid.layers.scale(x=x, scale=10.0, out=out_var)
+#   paddle.scale(x=x, scale=10.0, out=out_var)
 # e.g.: test_program_code.py, test_dist_train.py
 globals()['_scale'] = generate_layer_fn('scale')
...