Unverified commit 968f7f24 authored by Sławomir Siwek, committed by GitHub

Fused elementwise_(mul/div) (#50428)

* extract Op and OPMaker to .h

* extend pattern for fused_op

* set "with_residual" default to false

* adjust fuse passes

* remove fc+eltwise flag

* fused_output_scale

* activation attrs

* remove extra attrs

* fix int8/bf16 unit tests

* simplify RecomputeOutputDims

* remove unused method

* Add description for attributes

* add extra check

* adjust op compats

* update quantize test

* fix protobuf parsing error

* fix int8 performance

* fused elementwises

* merge develop

* remove activation

* restore activation for existing add/sub ops
Parent 14abafa1
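
In short, each fused op evaluates the binary elementwise op together with the post-ops that the oneDNN fuse passes attach to it. A sketch of the computed function, inferred from the kernel added below (act denotes the optional fused activation; the optional fused_unsqueeze2_axes reshape of the output is omitted):

$$Out = \texttt{fused\_output\_scale} \cdot \mathrm{act}(X \odot Y), \qquad \odot \in \{\times, \div\}$$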
......
@@ -87,7 +87,12 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
       return;
     }
-    ConvertToFusedOp(op->Op());
+    // Remove this condition when all fused_elementwise ops are merged
+    if (!(op->Op()->Type() == "elementwise_add" ||
+          op->Op()->Type() == "elementwise_sub" ||
+          op->Op()->Type() == "elementwise_mul")) {
+      ConvertToFusedOp(op->Op());
+    }
     op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
   };
   gpd(graph, handler);
......
......
@@ -148,6 +148,8 @@ inline void ConvertToFusedOp(OpDesc* op) {
   const std::map<std::string, std::string> fused_ops = {
       {"conv2d", "fused_conv2d"},
       {"depthwise_conv2d", "fused_conv2d"},
+      {"elementwise_mul", "fused_elementwise_mul"},
+      {"elementwise_div", "fused_elementwise_div"},
       {"matmul", "fused_matmul"},
       {"matmul_v2", "fused_matmul"},
       {"softplus", "fused_softplus"},
......
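
For illustration, a minimal standalone sketch of the table-driven rename that `ConvertToFusedOp` performs; `ToFusedType` and `main` are hypothetical stand-ins, since the real helper rewrites the `OpDesc` type in place:

```cpp
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for ConvertToFusedOp: map an op type to its fused
// oneDNN counterpart, leaving unknown types untouched.
std::string ToFusedType(const std::string& type) {
  static const std::map<std::string, std::string> fused_ops = {
      {"elementwise_mul", "fused_elementwise_mul"},
      {"elementwise_div", "fused_elementwise_div"},
      {"matmul", "fused_matmul"}};
  const auto it = fused_ops.find(type);
  return it == fused_ops.end() ? type : it->second;
}

int main() {
  std::cout << ToFusedType("elementwise_mul") << '\n';  // fused_elementwise_mul
  std::cout << ToFusedType("relu") << '\n';             // relu (pass-through)
}
```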
......
@@ -31,6 +31,8 @@ void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const {
       "fused_matmul",
       "matmul",
       "matmul_v2",
+      "fused_elementwise_mul",
+      "fused_elementwise_div",
       "elementwise_add",
       "elementwise_sub",
       "elementwise_mul",
......
@@ -87,10 +89,7 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
     scale = *(scale_tensor->data<float>());
   }
-  if (op_type == "matmul" || op_type == "matmul_v2") {
-    ConvertToFusedOp(operator_op->Op());
-  }
+  ConvertToFusedOp(operator_op->Op());
   operator_op->Op()->SetAttr("fused_output_scale", scale);
   operator_op->Op()->SetOutput("Out", {scale_out->Name()});
......
@@ -120,6 +119,8 @@ REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass)
             .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
+            .EQ("fused_elementwise_mul", 0)
+            .EQ("fused_elementwise_div", 0)
             .LE("elementwise_add", 1)
             .LE("elementwise_sub", 1)
             .LE("elementwise_mul", 1)
......
......
@@ -27,7 +27,10 @@ using string::PrettyLogDetail;
 void FuseOperatorUnsqueeze2OneDNNPass::ApplyImpl(Graph *graph) const {
   std::vector<std::pair<std::string, int>> ops_and_outputs = {
-      {"fused_transpose", 2}, {"transpose2", 2}, {"elementwise_mul", 1}};
+      {"fused_transpose", 2},
+      {"transpose2", 2},
+      {"fused_elementwise_mul", 1},
+      {"elementwise_mul", 1}};
   for (const auto &op_and_outputs : ops_and_outputs)
     FuseUnsqueeze2(graph, op_and_outputs.first, op_and_outputs.second);
......
type: "fused_elementwise_div"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "fused_output_scale"
type: FLOAT
}
attrs {
name: "fused_unsqueeze2_axes"
type: INTS
}
attrs {
name: "scale_x"
type: FLOAT
}
attrs {
name: "scale_y"
type: FLOAT
}
attrs {
name: "scale_out"
type: FLOAT
}
}
type: "fused_elementwise_mul"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "fused_output_scale"
type: FLOAT
}
attrs {
name: "fused_unsqueeze2_axes"
type: INTS
}
attrs {
name: "scale_x"
type: FLOAT
}
attrs {
name: "scale_y"
type: FLOAT
}
attrs {
name: "scale_out"
type: FLOAT
}
}
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X", "The first input tensor of elementwise op.");
AddInput("Y", "The second input tensor of elementwise op.");
AddOutput("Out", "A location into which the result is stored.");
AddAttr<int>(
"axis",
"If X.dimension != Y.dimension, Y.dimension must be a "
"subsequence of X.dimension. And axis is the start dimension index "
"for broadcasting Y onto X.")
.SetDefault(-1);
AddAttr<std::string>(
"fuse_activation",
"Activation type from elementwise_act_onednn_fuse_pass")
.SetDefault("");
AddAttr<float>("fuse_alpha",
"Activation alpha from elementwise_act_onednn_fuse_pass")
.SetDefault(0.0f);
AddAttr<float>("fuse_beta",
"Activation beta from elementwise_act_onednn_fuse_pass")
.SetDefault(0.0f);
AddAttr<float>("fused_output_scale",
"Obtained from operator_scale_onednn_fuse_pass")
.SetDefault(1.0f);
AddAttr<std::vector<int>>(
"fused_unsqueeze2_axes",
"Obtained from operator_unsqueeze2_onednn_fuse_pass")
.SetDefault({});
AddAttr<float>("scale_x", "Elementwise X input quantization scale")
.SetDefault(1.0f);
AddAttr<float>("scale_y", "Elementwise Y input quantization scale")
.SetDefault(1.0f);
AddAttr<float>("scale_out", "Elementwise Out output quantization scale")
.SetDefault(1.0f);
AddComment(
R"DOC(Elementwise operator extended with oneDNN-specific fusion logic.)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fused_elementwise_mul,
ops::ElementwiseOp,
ops::FusedElementwiseOpMaker,
ops::ElementwiseOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
fused_elementwise_div,
ops::ElementwiseOp,
ops::FusedElementwiseOpMaker,
ops::ElementwiseOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
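
As a usage note, a hedged sketch of how a fuse pass might populate the attributes declared above on an existing node (illustrative values only; `op` is assumed to be a `framework::OpDesc*` already present in the graph):

```cpp
// Illustrative only: turn an elementwise_mul node into its fused variant
// and record the results of the activation and scale fuse passes.
op->SetType("fused_elementwise_mul");
op->SetAttr("fuse_activation", std::string("relu"));   // elementwise_act fuse
op->SetAttr("fused_output_scale", 0.5f);               // operator_scale fuse
op->SetAttr("fused_unsqueeze2_axes", std::vector<int>{2, 3});
```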
......
@@ -109,7 +109,6 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
       {"Scale_weights", ExtraAttrProperty::ONEDNN},
       {"x_data_format", ExtraAttrProperty::ONEDNN},
       {"y_data_format", ExtraAttrProperty::ONEDNN},
-      {"fused_unsqueeze2_axes", ExtraAttrProperty::ONEDNN},
       {"fused_reshape2_shape", ExtraAttrProperty::ONEDNN},
       // ONEDNN pass dedicated attributes
       {"Activation_scale", ExtraAttrProperty::ONEDNN},
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, dnnl::algorithm BINARY_OP>
void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const int axis,
const std::string& fuse_activation,
const float fuse_alpha,
const float fuse_beta,
const float fused_output_scale,
const std::vector<int>& fused_unsqueeze2_axes,
const float scale_x,
const float scale_y,
const float scale_out,
DenseTensor* out) {
const auto& onednn_engine = dev_ctx.GetEngine();
dnnl::post_ops post_operations;
funcs::AppendActivation(
dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta);
if (fused_output_scale != 1.0) {
post_operations.append_eltwise(
1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
}
auto* non_const_x = &x;
auto* non_const_y = &y;
funcs::BinaryOneDNNHandler<T> handler(BINARY_OP,
axis,
onednn_engine,
dev_ctx.GetPlace(),
non_const_x,
non_const_y,
out,
scale_x,
scale_y,
scale_out,
true,
post_operations);
  // oneDNN's binary primitive is optimized for broadcasting y into x, so in
  // the opposite case we swap the tensors to achieve optimal performance
if (x.numel() < y.numel()) {
std::swap(non_const_x, non_const_y);
}
const auto src_x_memory = handler.AcquireSrcMemory(non_const_x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y);
  // For in-place execution, src and dst must be the same memory object, so x
  // would have to share its buffer with out. The UT mechanics test in-place
  // execution for this op without checking whether x must be broadcast to
  // match the shape of y. That is wrong: when x is broadcast, out takes the
  // shape of y, which is bigger than x, so if x is smaller than out and they
  // share a buffer (of x's size), that buffer is not big enough to hold the
  // result of the elementwise operation. Hence x's memory is reused only when
  // the element counts match.
const bool reuse_x_memory = non_const_x->numel() == out->numel() &&
non_const_x->IsSharedBufferWith(*out);
std::shared_ptr<dnnl::memory> dst_memory;
if (reuse_x_memory) {
dst_memory = src_x_memory;
    // NOTE(chenfeiyu): when the output reuses memory from another tensor
    // rather than allocating its own, its data type still has to be set.
    // Unfortunately, Paddle's operators infer only the output's shape, not
    // its data type. Alloc<T> normally takes care of both allocation and the
    // data type, but if the memory is already allocated and there is no need
    // to re-allocate, it just sets the data type. It is called here to get
    // the right data type.
dev_ctx.template Alloc<T>(out);
} else {
dst_memory = handler.AcquireDstMemory(out);
}
const auto binary_prim = handler.AcquireForwardPrimitive();
auto& astream = OneDNNContext::tls().get_stream();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
binary_prim->execute(astream, args);
astream.wait();
auto out_md = dst_memory->get_desc();
if (handler.use_broadcasting_hack) {
auto dims = out_md.dims();
dims.insert(dims.begin(), non_const_x->dims()[0]);
dims[1] /= dims[0];
out_md = out_md.reshape(dims);
}
if (fused_unsqueeze2_axes.empty()) {
out->set_mem_desc(out_md);
} else {
funcs::SetOutMemDescWithUnsqueeze2FuseSupport(
fused_unsqueeze2_axes, out, out_md);
}
}
#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
const int axis, \
const std::string& fuse_activation, \
const float fuse_alpha, \
const float fuse_beta, \
const float fused_output_scale, \
const std::vector<int>& fused_unsqueeze2_axes, \
const float scale_x, \
const float scale_y, \
const float scale_out, \
DenseTensor* out) { \
FusedElementwiseKernel<T, algorithm>(dev_ctx, \
x, \
y, \
axis, \
fuse_activation, \
fuse_alpha, \
fuse_beta, \
fused_output_scale, \
fused_unsqueeze2_axes, \
scale_x, \
scale_y, \
scale_out, \
out); \
}
DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedMultiply, dnnl::algorithm::binary_mul)
DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div)
} // namespace phi
PD_REGISTER_KERNEL(fused_elementwise_mul,
OneDNN,
ONEDNN,
phi::FusedMultiplyKernel,
float,
phi::dtype::bfloat16,
int8_t,
uint8_t) {}
PD_REGISTER_KERNEL(fused_elementwise_div,
OneDNN,
ONEDNN,
phi::FusedDivideKernel,
float,
phi::dtype::bfloat16,
int8_t,
uint8_t) {}
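
For reference, a minimal standalone sketch of the post-op chain this kernel assembles, using the same oneDNN 2.x `append_eltwise` signature as the code above (ReLU stands in for whatever `fuse_activation` resolves to):

```cpp
#include "oneapi/dnnl/dnnl.hpp"

// Optional fused activation followed by a linear eltwise that multiplies
// the primitive's result by fused_output_scale (out = scale * out + 0).
dnnl::post_ops MakeBinaryPostOps(float fused_output_scale) {
  dnnl::post_ops ops;
  ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
  if (fused_output_scale != 1.0f) {
    ops.append_eltwise(
        1.0f, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
  }
  return ops;
}
```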
......
@@ -122,18 +122,7 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
     out_md = out_md.reshape(dims);
   }
-  const auto fused_unsqueeze2_axes =
-      dev_ctx.HasDnnAttr("fused_unsqueeze2_axes")
-          ? PADDLE_GET_CONST(std::vector<int>,
-                             dev_ctx.GetDnnAttr("fused_unsqueeze2_axes"))
-          : std::vector<int>();
-  if (!fused_unsqueeze2_axes.empty()) {
-    funcs::SetOutMemDescWithUnsqueeze2FuseSupport(
-        fused_unsqueeze2_axes, out, out_md);
-  } else {
-    out->set_mem_desc(out_md);
-  }
+  out->set_mem_desc(out_md);
 }
#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature FusedElementwiseMulOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_elementwise_mul",
{"X", "Y"},
{"axis",
"fuse_activation",
"fuse_alpha",
"fuse_beta",
"fused_output_scale",
"fused_unsqueeze2_axes",
"scale_x",
"scale_y",
"scale_out"},
{"Out"});
}
KernelSignature FusedElementwiseDivOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_elementwise_div",
{"X", "Y"},
{"axis",
"fuse_activation",
"fuse_alpha",
"fuse_beta",
"fused_output_scale",
"fused_unsqueeze2_axes",
"scale_x",
"scale_y",
"scale_out"},
{"Out"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_mul,
phi::FusedElementwiseMulOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_div,
phi::FusedElementwiseDivOpArgumentMapping);
......
@@ -143,7 +143,7 @@ class TestElementwiseMulUnsqueeze2OneDNNFusePass(PassAutoScanTest):
                 "operator_unsqueeze2_onednn_fuse_pass",
             ],
         )
-        yield config, ["elementwise_mul"], (1e-5, 1e-5)
+        yield config, ["fused_elementwise_mul"], (1e-5, 1e-5)
def test(self):
self.run_and_statis(
......