diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 211186a65fcf301f4bfcf58f41b56ef7f223a5f8..ed2739232e3c7ffd8f58f3395c294176942652c3 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -130,7 +130,7 @@ paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
-paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'scale', 'bias', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, 0.0, None))
+paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
 paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
index 2148ed25916c82c0de10b0bbd5004bf0aa37bdc0..b870d08a1a28fd3e678aeb7211f7e3ec8b2c4c65 100644
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -93,7 +93,6 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
 
     auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Input<Tensor>("Y");
-    // auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* out = dout;  // out is not necessary
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 75645598f82117f40c91e00608f460b7ef6debec..242a1b9ae92ade0caf1b0f1fcb5458b8b7070d84 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -59,9 +59,8 @@ class MatMulKernel : public framework::OpKernel<T> {
         RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
     auto mat_dim_b = math::CreateMatrixDescriptor(
         ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
-    auto scale = static_cast<T>(context.Attr<float>("scale"));
-    auto bias = static_cast<T>(context.Attr<float>("bias"));
-    blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, bias);
+    auto scale = static_cast<T>(context.Attr<float>("alpha"));
+    blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));
   }
 };
 
@@ -188,7 +187,7 @@ class MatMulGradKernel : public framework::OpKernel<T> {
     auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
     auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
     blas.MatMul(a, mat_dim_a, b, mat_dim_b,
-                static_cast<T>(context.Attr<float>("scale")), out, T(0));
+                static_cast<T>(context.Attr<float>("alpha")), out, T(0));
   }
 
   void CalcInputGrad(const framework::ExecutionContext &context,
@@ -337,8 +336,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
               R"DOC(If true, use the transpose of `Y`.
         )DOC")
         .SetDefault(false);
-    AddAttr<float>("scale", "Scale").SetDefault(1.0f);
-    AddAttr<float>("bias", "Bias").SetDefault(0.0f);
+    AddAttr<float>("alpha", "The scale of Out").SetDefault(1.0f);
     AddComment(R"DOC(
 MatMul Operator.
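Usage note: with this patch, matmul no longer fuses an additive bias; only the multiplicative factor remains, renamed from `scale` to `alpha`. Callers that relied on the fused bias now chain an explicit follow-up op. A minimal before/after sketch in Python (hypothetical variable names; assumes the autogenerated fluid.layers.scale wrapper at this revision forwards its attributes):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32',
                      append_batch_size=False)
y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32',
                      append_batch_size=False)

# Before this change: out = scale * (x @ y) + bias, fused into one op.
# out = fluid.layers.matmul(x, y, scale=2.0, bias=1.0)

# After this change: out = alpha * (x @ y); an additive bias, if needed,
# is applied by a separate op such as scale.
out = fluid.layers.matmul(x, y, alpha=2.0)
out = fluid.layers.scale(out, scale=1.0, bias=1.0)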
)DOC") .SetDefault(false); - AddAttr("scale", "Scale").SetDefault(1.0f); - AddAttr("bias", "Bias").SetDefault(0.0f); + AddAttr("alpha", "The scale of Out").SetDefault(1.0f); AddComment(R"DOC( MatMul Operator. diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 87642b9481e591b6f9097705a323790a19bf13c6..13be6c65be58314a75124106eb09b1300305baf0 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -53,6 +53,11 @@ $$Out = scale*X$$ AddAttr("scale", "The scaling factor of the scale operator.") .SetDefault(1.0); AddAttr("bias", "The bias of the scale operator.").SetDefault(0.0); + AddAttr( + "bias_after_scale", + "Apply bias addition after or before scaling. It is useful for " + "numeric stability in some circumstances.") + .SetDefault(true); } }; @@ -82,6 +87,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); grad_op->SetAttr("bias", 0.0f); + grad_op->SetAttr("bias_after_scale", true); return std::unique_ptr(grad_op); } }; diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 7b659153aeab280dca5c42411038c239003ef21b..d8a199bc2b860515645b4954b49d8eb59fbd02dc 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -35,6 +35,7 @@ class ScaleKernel : public framework::OpKernel { auto scale = static_cast(ctx.Attr("scale")); auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); if (in_var->IsType() && in_var != out_var) { auto& in_slr = in_var->Get(); @@ -46,8 +47,11 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); auto& dev = *ctx.template device_context().eigen_device(); - eigen_out.device(dev) = - static_cast(scale) * eigen_in + static_cast(bias); + if (bias_after_scale) { + eigen_out.device(dev) = scale * eigen_in + bias; + } else { + eigen_out.device(dev) = scale * (eigen_in + bias); + } } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8c0899a815d208f31447c35cc31f4e5a47cb713c..35f2876f32c0e80f5cf1573d2aa5325c93444f39 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3388,13 +3388,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): return out -def matmul(x, - y, - transpose_x=False, - transpose_y=False, - scale=1.0, - bias=0.0, - name=None): +def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ Applies matrix multiplication to two tensors. @@ -3428,8 +3422,7 @@ def matmul(x, y (Variable): The input variable which is a Tensor or LoDTensor. transpose_x (bool): Whether to transpose :math:`x` before multiplication. transpose_y (bool): Whether to transpose :math:`y` before multiplication. - scale (float): The scale of output. Default 1.0. - bias (float): The bias added to output. Default 0.0. + alpha (float): The scale of output. Default 1.0. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3500,8 +3493,7 @@ def matmul(x, attrs={ 'transpose_X': transpose_x, 'transpose_Y': transpose_y, - 'scale': scale, - 'bias': bias + 'alpha': alpha, }) return out