From 945f918c81deac7810ced70d9a9859b15028774a Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Fri, 10 Feb 2023 14:29:53 +0800
Subject: [PATCH] [XPU] add fc_xpu op&pass to optimize ernie model (#50277)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   8 +
 .../ir/adaptive_pool2d_convert_global_pass.cc |   1 -
 .../framework/ir/xpu/fc_xpu_fuse_pass.cc      | 346 ++++++++++++++++++
 paddle/fluid/framework/ir/xpu/pass_utils.h    |  47 +++
 paddle/fluid/framework/ir/xpu/quant_utils.cc  | 181 +++++++++
 paddle/fluid/framework/ir/xpu/quant_utils.h   |  33 ++
 .../inference/api/paddle_pass_builder.cc      |  13 +
 .../fluid/inference/api/paddle_pass_builder.h |   2 +-
 .../operators/generator/templates/op.c.j2     |   1 +
 paddle/phi/api/yaml/static_ops.yaml           |   9 +
 paddle/phi/backends/xpu/xpu1_op_list.cc       |   1 +
 paddle/phi/backends/xpu/xpu2_op_list.cc       |   1 +
 paddle/phi/infermeta/CMakeLists.txt           |   2 +-
 paddle/phi/infermeta/fusion.cc                |  45 +++
 paddle/phi/infermeta/fusion.h                 |  37 ++
 .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc   |  73 ++++
 .../unittests/ir/inference/CMakeLists.txt     |  16 +
 .../unittests/ir/inference/auto_scan_test.py  |   5 +
 .../ir/inference/test_xpu_fc_xpu_fuse_pass.py | 100 +++++
 19 files changed, 918 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/xpu/pass_utils.h
 create mode 100644 paddle/fluid/framework/ir/xpu/quant_utils.cc
 create mode 100644 paddle/fluid/framework/ir/xpu/quant_utils.h
 create mode 100644 paddle/phi/infermeta/fusion.cc
 create mode 100644 paddle/phi/infermeta/fusion.h
 create mode 100644 paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_xpu_fc_xpu_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index c387c1869b3..7a703722218 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -210,6 +210,14 @@ if(WITH_IPU)
   pass_library(inference_dtype_transfer_pass base DIR ipu)
 endif()
 
+if(WITH_XPU)
+  cc_library(
+    quant_utils
+    SRCS xpu/quant_utils.cc
+    DEPS pass)
+  pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS quant_utils)
+endif()
+
 cc_library(
   fuse_bn_act_pass
   SRCS fuse_bn_act_pass.cc
diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
index aa11994fb8e..3bd1ad609f3 100644
--- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
+++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
@@ -96,7 +96,6 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const {
       }
     }
   }
-  // LOG(INFO) << "--- processed " << num << " nodes";
   AddStatis(num);
 }
 
diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
new file mode 100644
index 00000000000..dde8a55c7a0
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -0,0 +1,346 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+struct FcXPUPattern : public PatternBase {
+  FcXPUPattern(PDPattern* pattern,
+               const std::string& name_scope,
+               const std::string& mul_type,
+               bool with_bias,
+               const std::string& act_type);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(mul);
+  PATTERN_DECL_NODE(add);
+  PATTERN_DECL_NODE(act);
+  // declare variable node's name
+  PATTERN_DECL_NODE(mul_x);
+  PATTERN_DECL_NODE(mul_w);
+  PATTERN_DECL_NODE(mul_out);
+  PATTERN_DECL_NODE(bias);
+  PATTERN_DECL_NODE(add_out);
+  PATTERN_DECL_NODE(act_out);
+
+ private:
+  std::string mul_type_;
+  bool with_bias_{false};
+  std::string act_type_;
+};
+
+FcXPUPattern::FcXPUPattern(PDPattern* pattern,
+                           const std::string& name_scope,
+                           const std::string& mul_type,
+                           bool with_bias,
+                           const std::string& act_type)
+    : PatternBase(pattern, name_scope, name_scope),
+      mul_type_(mul_type),
+      with_bias_(with_bias),
+      act_type_(act_type) {
+  auto* mul_x = pattern->NewNode(mul_x_repr())
+                    ->assert_is_op_input(mul_type_, "X")
+                    ->assert_var_not_persistable();
+  auto* mul_w = pattern->NewNode(mul_w_repr())
+                    ->assert_is_op_input(mul_type_, "Y")
+                    ->assert_is_persistable_var()
+                    ->assert_more([](Node* node) {
+                      return node->Var()->GetShape().size() == 2;
+                    });
+  auto* mul =
+      pattern->NewNode(mul_repr())
+          ->assert_is_op(mul_type_)
+          ->assert_more([](Node* node) {
+            auto op_type = node->Op()->Type();
+            if (op_type == "matmul") {
+              return !PADDLE_GET_CONST(bool,
+                                       node->Op()->GetAttr("transpose_X"));
+            } else if (op_type == "matmul_v2") {
+              return !PADDLE_GET_CONST(bool, node->Op()->GetAttr("trans_x"));
+            } else {
+              return true;
+            }
+          });
+  auto* mul_out = pattern->NewNode(mul_out_repr())
+                      ->assert_is_op_output(mul_type_, "Out")
+                      ->assert_var_not_persistable();
+  mul->LinksFrom({mul_x, mul_w}).LinksTo({mul_out});
+  PDNode* bias = nullptr;
+  PDNode* add = nullptr;
+  PDNode* add_out = nullptr;
+  PDNode* act = nullptr;
+  PDNode* act_out = nullptr;
+  if (with_bias_) {
+    mul_out->assert_is_op_input("elementwise_add", "X");
+    bias = pattern->NewNode(bias_repr())
+               ->assert_is_op_input("elementwise_add", "Y")
+               ->assert_is_persistable_var();
+    add = pattern->NewNode(add_repr())->assert_is_op("elementwise_add");
+    add_out = pattern->NewNode(add_out_repr())
+                  ->assert_is_op_output("elementwise_add", "Out")
+                  ->assert_var_not_persistable();
+    add->LinksFrom({mul_out, bias}).LinksTo({add_out});
+  } else {
+    add_out = mul_out;
+  }
+  if (!act_type_.empty()) {
+    add_out->assert_is_op_input(act_type_, "X");
+    act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
+    act_out = pattern->NewNode(act_out_repr())
+                  ->assert_is_op_output(act_type_, "Out")
+                  ->assert_var_not_persistable();
+    act->LinksFrom({add_out}).LinksTo({act_out});
+  }
+}
+
+}  // namespace patterns
+
+/*
+1. fuse mul/matmul/matmul_v2 + add + act into fc_xpu
+2. add is optional
+3. act is optional
+
+Origin subgraph:
+          mul_x  mul_w
+             \     /
+              \   /
+               mul
+                |
+                |
+             mul_out  bias
+                \      /
+                 \    /
+            elementwise_add
+                   |
+                   |
+          elementwise_add_out
+                   |
+                   |
+                  act
+                   |
+                   |
+                act_out
+
+Fused subgraph:
+        mul_x mul_w bias mul_w_max
+          \     |    /      |
+           \    |   /       |
+            \   |  /        |
+             fc_xpu---------
+               |
+               |
+            act_out
+*/
+class FcXPUFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  void ApplyImpl(ir::Graph* graph,
+                 const std::string& mul_type,
+                 bool with_bias,
+                 const std::string& act_type) const;
+
+  const std::string name_scope_{"fc_xpu_fuse_pass"};
+  const std::map<std::string, int> act_map_{{"", 0},
+                                            {"relu", 1},
+                                            {"sigmoid", 2},
+                                            {"tanh", 3},
+                                            {"gelu", 4},
+                                            {"leaky_relu", 5},
+                                            {"hard_swish", 14},
+                                            {"hard_sigmoid", 15},
+                                            {"relu6", 17}};
+};
+
+void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  for (auto mul_type : {"mul", "matmul", "matmul_v2"}) {
+    for (auto with_bias : {true, false}) {
+      for (auto act_type : {
+               "relu",
+               "gelu",
+               "",
+           }) {
+        ApplyImpl(graph, mul_type, with_bias, act_type);
+      }
+    }
+  }
+}
+
+void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
+                              const std::string& mul_type,
+                              bool with_bias,
+                              const std::string& act_type) const {
+  GraphPatternDetector gpd;
+  patterns::FcXPUPattern pattern(
+      gpd.mutable_pattern(), name_scope_, mul_type, with_bias, act_type);
+
+  int found_subgraph_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle FcXPUFusePass fuse";
+    GET_IR_NODE(mul_x);
+    GET_IR_NODE(mul_w);
+    GET_IR_NODE(mul);
+    GET_IR_NODE(mul_out);
+    GET_IR_NODE(bias);
+    GET_IR_NODE(add);
+    GET_IR_NODE(add_out);
+    GET_IR_NODE(act);
+    GET_IR_NODE(act_out);
+    auto* block = mul->Op()->Block();
+    auto* scope = param_scope();
+
+    auto mul_w_name = mul_w->Name();
+    auto mul_w_tensor =
+        scope->FindVar(mul_w_name)->GetMutable<phi::DenseTensor>();
+    // 1. Transform weight to int16/int31
+    // 2. Avoid repeated transforms, because the weight may be shared with
+    //    other ops.
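+    // Note: XPU's quantized GEMM reads the weight's absolute-max from a
+    // separate buffer whose length is the device-specific max_ptr_size
+    // (queried below via x_context()->max_ptr_size()); the "_max" suffix on
+    // the new variable is simply this pass's naming convention for it.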
+    // TODO(zhupengyang): support int31
+    std::string mul_w_max_name = mul_w_name + "_max";
+    Node* mul_w_max = nullptr;
+    if (mul_w_tensor->dtype() != phi::DataType::INT16) {
+      // Create weight_max node
+      VarDesc mul_w_max_desc(mul_w_max_name);
+      mul_w_max_desc.SetPersistable(true);
+      mul_w_max = graph->CreateVarNode(&mul_w_max_desc);
+      // Create weight_max var/tensor
+      auto mul_w_max_var = block->Var(mul_w_max_name);
+      mul_w_max_var->SetPersistable(true);
+      auto mul_w_max_tensor =
+          scope->Var(mul_w_max_name)->GetMutable<phi::DenseTensor>();
+      auto* xpu_ctx = static_cast<phi::XPUContext*>(
+          platform::DeviceContextPool::Instance().Get(phi::XPUPlace()));
+      int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
+      bool transpose_w = false;
+      if (mul_type == "matmul") {
+        transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y"));
+      } else if (mul_type == "matmul_v2") {
+        transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y"));
+      }
+      QuantWeight<int16_t>(
+          mul_w_tensor, mul_w_max_tensor, !transpose_w, max_ptr_size);
+    }
+
+    // Generate fc_xpu op
+    framework::OpDesc fc_xpu_op_desc(block);
+    fc_xpu_op_desc.SetType("fc_xpu");
+    fc_xpu_op_desc.SetInput("x", {mul_x->Name()});
+    fc_xpu_op_desc.SetInput("w", {mul_w->Name()});
+    fc_xpu_op_desc.SetInput("w_max", {mul_w_max_name});
+    if (bias) {
+      fc_xpu_op_desc.SetInput("bias", {bias->Name()});
+    }
+    fc_xpu_op_desc.SetAttr(
+        "in_num_col_dims",
+        static_cast<int>(mul_x->Var()->GetShape().size() - 1));
+    if (mul_type == "mul") {
+      fc_xpu_op_desc.SetAttr(
+          "in_num_col_dims",
+          PADDLE_GET_CONST(int, mul->Op()->GetAttr("in_num_col_dims")));
+    }
+    fc_xpu_op_desc.SetAttr("transpose_x", false);
+    fc_xpu_op_desc.SetAttr("alpha", 1.f);
+    fc_xpu_op_desc.SetAttr("beta", 0.f);
+    if (mul_type == "matmul") {
+      fc_xpu_op_desc.SetAttr(
+          "alpha", PADDLE_GET_CONST(float, mul->Op()->GetAttr("alpha")));
+      fc_xpu_op_desc.SetAttr(
+          "beta", PADDLE_GET_CONST(float, mul->Op()->GetAttr("beta")));
+    }
+    fc_xpu_op_desc.SetAttr("act_type", 0);
+    fc_xpu_op_desc.SetAttr("act_alpha", 0.f);
+    if (act) {
+      fc_xpu_op_desc.SetAttr("act_type", act_map_.at(act_type));
+      if (act_type == "leaky_relu") {
+        fc_xpu_op_desc.SetAttr(
+            "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("alpha")));
+      } else if (act_type == "hard_sigmoid") {
+        fc_xpu_op_desc.SetAttr(
+            "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope")));
+      }
+    }
+    if (act_out) {
+      fc_xpu_op_desc.SetOutput("out", {act_out->Name()});
+    } else if (add_out) {
+      fc_xpu_op_desc.SetOutput("out", {add_out->Name()});
+    } else {
+      fc_xpu_op_desc.SetOutput("out", {mul_out->Name()});
+    }
+    auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc);
+    SAFE_IR_NODE_LINK_TO(mul_x, fc_xpu);
+    SAFE_IR_NODE_LINK_TO(mul_w, fc_xpu);
+    SAFE_IR_NODE_LINK_TO(mul_w_max, fc_xpu);
+    SAFE_IR_NODE_LINK_TO(bias, fc_xpu);
+    if (act_out) {
+      SAFE_IR_NODE_LINK_TO(fc_xpu, act_out);
+    } else if (add_out) {
+      SAFE_IR_NODE_LINK_TO(fc_xpu, add_out);
+    } else {
+      SAFE_IR_NODE_LINK_TO(fc_xpu, mul_out);
+    }
+
+    // delete useless node
+    std::unordered_set<const Node*> delete_nodes;
+    if (act != nullptr && add != nullptr) {
+      delete_nodes = {mul, mul_out, add, add_out, act};
+    } else if (act) {
+      delete_nodes = {mul, mul_out, act};
+    } else if (add) {
+      delete_nodes = {mul, mul_out, add};
+    }
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fc_xpu_fuse_pass, paddle::framework::ir::FcXPUFusePass);
+
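+// The capability declaration below pins this pass to version 0 of the fc_xpu
+// op, so the framework can skip the pass when a model's recorded op versions
+// are incompatible.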
+REGISTER_PASS_CAPABILITY(fc_xpu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "fc_xpu", 0));
diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h
new file mode 100644
index 00000000000..38735c73158
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/pass_utils.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node_) \
+  SAFE_GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern)
+
+// Get an ir::Node* from the matched subgraph.
+// var: variable.
+// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
+// pat: the pattern object.
+#define SAFE_GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat)                         \
+  Node* var = nullptr;                                                        \
+  if (pat.arg##_n()) {                                                        \
+    PADDLE_ENFORCE_NE(subgraph.count(pat.arg##_n()),                          \
+                      0UL,                                                    \
+                      platform::errors::NotFound(                             \
+                          "Node not found for PDNode %s", pat.arg##_repr())); \
+    var = subgraph.at(pat.arg##_n());                                         \
+    PADDLE_ENFORCE_NOT_NULL(                                                  \
+        var,                                                                  \
+        platform::errors::NotFound(                                           \
+            "node %s not exists in the sub-graph", #arg));                    \
+  }
+
+#define SAFE_IR_NODE_LINK_TO(a, b)    \
+  if (a != nullptr && b != nullptr) { \
+    IR_NODE_LINK_TO(a, b)             \
+  }
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc
new file mode 100644
index 00000000000..aa1463cb3a5
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
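+
+// Scheme implemented below: weights are quantized symmetrically,
+// q = RoundHalfToEven(x * RMAX / max|W|) (RMAX = 32767 for int16), after an
+// optional 2-D transpose; the fp32 scale max|W| is written to a separate
+// "weight max" tensor rather than folded into the quantized data.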
+
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include <vector>
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+template <typename T>
+static void Transpose(const T* in, T* out, int h, int w) {
+  for (int h1 = 0; h1 < w; ++h1) {
+    for (int w1 = 0; w1 < h; ++w1) {
+      out[h1 * h + w1] = in[w1 * w + h1];
+    }
+  }
+}
+
+static float FindMaxAbs(const float* data, int len) {
+  float max_f = 0.0f;
+  for (int i = 0; i < len; ++i) {
+    float max = std::abs(data[i]);
+    if (max > max_f) {
+      max_f = max;
+    }
+  }
+  return max_f;
+}
+
+static float IEEECompliance0(float f) {
+  uint32_t* ptr = reinterpret_cast<uint32_t*>(&f);
+  uint32_t sign = (*ptr) & 0x80000000;
+  uint32_t uf = 0;
+  // nan -> inf
+  if (std::isnan(f)) {
+    uf = (sign | 0x7F800000);
+    float* ptr = reinterpret_cast<float*>(&uf);
+    return *ptr;
+  } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
+    return f;
+  } else {
+    // denormal -> +-0
+    uf = 0x0;
+    float* ptr = reinterpret_cast<float*>(&uf);
+    return *ptr;
+  }
+}
+
+static inline long RoundHalfToEven(const float src) {  // NOLINT
+  long ret = llround(src);                             // NOLINT
+  if (fabs(fabs(round(src) - src) - 0.5) > 0) {
+    return ret;
+  } else {
+    if (abs(ret) % 2 == 0) {
+      return ret;
+    } else {
+      return ret + (ret > 0 ? -1 : 1);
+    }
+  }
+}
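+
+// RoundHalfToEven above implements banker's rounding: exact halves go to the
+// nearest even integer (0.5 -> 0, 1.5 -> 2, 2.5 -> 2, -2.5 -> -2), while
+// non-halves round normally (2.3 -> 2, 2.7 -> 3).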
+
+template <typename T, int RMAX>
+static T Fp32ToIntx(const float f, float max) {
+  max = IEEECompliance0(max);
+  float input = IEEECompliance0(f);
+  // +0 and -0 -> +0
+  if (input == 0) {
+    input = 0.0f;
+  }
+
+  float tmp = RMAX / max;
+  if (std::isinf(tmp)) {
+    uint32_t* ptr = reinterpret_cast<uint32_t*>(&input);
+    if ((*ptr) >> 31 & 1) {
+      return T(-RMAX);
+    } else {
+      return T(RMAX);
+    }
+  }
+
+  tmp = input * tmp;
+  if (std::isnan(tmp)) {
+    return T(RMAX);
+  }
+
+  tmp = IEEECompliance0(tmp);
+  // Early check to avoid INF or big values getting into the converter.
+  if (tmp > RMAX) {
+    return T(RMAX);
+  }
+  if (tmp < -RMAX) {
+    return T(-RMAX);
+  }
+  T ret = (T)RoundHalfToEven(tmp);
+  if (ret > RMAX) {
+    ret = T(RMAX);
+  }
+  if (ret < -RMAX) {
+    ret = T(-RMAX);
+  }
+  return ret;
+}
+
+template <typename T>
+static void QuantFP32ToIntX(const float* src_ptr,
+                            T* dst_ptr,
+                            float max_val,
+                            int numel) {
+  LOG(FATAL) << "Not support.";
+}
+
+template <>
+void QuantFP32ToIntX<int16_t>(const float* src_ptr,
+                              int16_t* dst_ptr,
+                              float max_val,
+                              int numel) {
+  for (int i = 0; i < numel; i++) {
+    dst_ptr[i] = Fp32ToIntx<int16_t, 32767>(src_ptr[i], max_val);
+  }
+}
+
+template <typename T>
+void QuantWeight(phi::DenseTensor* weight,
+                 phi::DenseTensor* weight_max,
+                 bool transpose,
+                 int max_ptr_size) {
+  // Transpose
+  auto* weight_data = weight->data<float>();
+  auto dims = weight->dims();
+  auto size = weight->numel();
+  std::vector<float> transpose_data(weight_data, weight_data + size);
+  if (transpose) {
+    PADDLE_ENFORCE_EQ(
+        dims.size(),
+        2,
+        platform::errors::InvalidArgument(
+            "Only support 2D weight, but received weight rank is [%d].",
+            dims.size()));
+    Transpose(weight_data, transpose_data.data(), dims[0], dims[1]);
+    weight->Resize({dims[1], dims[0]});
+  }
+  weight_data = transpose_data.data();
+  // Find max
+  float max_val = FindMaxAbs(weight_data, size);
+  std::vector<float> max_vec(max_ptr_size, max_val);
+  weight_max->set_type(paddle::experimental::CppTypeToDataType<float>::Type());
+  weight_max->Resize({max_ptr_size});
+  auto* dev_ctx = static_cast<phi::CPUContext*>(
+      platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+  memcpy(dev_ctx->Alloc<float>(weight_max),
+         max_vec.data(),
+         max_ptr_size * sizeof(float));
+  // Quant
+  std::vector<T> quant_data(size);
+  QuantFP32ToIntX(weight_data, quant_data.data(), max_val, size);
+  weight->set_type(paddle::experimental::CppTypeToDataType<T>::Type());
+  memcpy(dev_ctx->Alloc<T>(weight), quant_data.data(), size * sizeof(T));
+}
+
+template void QuantWeight<int16_t>(phi::DenseTensor* weight,
+                                   phi::DenseTensor* weight_max,
+                                   bool transpose,
+                                   int max_ptr_size);
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h
new file mode 100644
index 00000000000..a5ae003d910
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/quant_utils.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// 1. Quant weight from fp32 to int16/int31
+// 2. Weight data is updated in place.
+// 3. Generate the weight max tensor.
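+//
+// Usage sketch (illustrative values; max_ptr_size is queried from the target
+// XPU context at runtime):
+//   phi::DenseTensor w;      // fp32 weight of shape [k, n]
+//   phi::DenseTensor w_max;  // receives max_ptr_size copies of max|w|
+//   QuantWeight<int16_t>(&w, &w_max, /*transpose=*/true, /*max_ptr_size=*/6);
+//   // w is now int16 with shape [n, k]; w_max is fp32 of shape [6].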
+template <typename T>
+void QuantWeight(phi::DenseTensor* weight,
+                 phi::DenseTensor* weight_max,
+                 bool transpose,
+                 int max_ptr_size);
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index af7d1148a39..46ddc2feebe 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -513,6 +513,19 @@ void CpuPassStrategy::EraseFcMkldnnPasses() {
   }
 }
 
+XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
+  passes_.assign({
+      "delete_dropout_op_pass",
+      // "multi_encoder_xpu_fuse_pass",
+      // "embedding_with_eltwise_add_xpu_fuse_pass",
+      "fc_xpu_fuse_pass",
+      // "multi_encoder_slice_link_xpu_fuse_pass",
+      // "generate_sequence_xpu_fuse_pass",
+      // "link_previous_out_max_xpu_pass",
+  });
+  use_xpu_ = true;
+}
+
 IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) {
   passes_.assign({"inference_process_pass"});
 }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 8dea84400e8..021b758239e 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -290,7 +290,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
 /// mode.
 class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
  public:
-  XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
+  XpuPassStrategy();
 };
 
 /// \class NpuPassStrategy
diff --git a/paddle/fluid/operators/generator/templates/op.c.j2 b/paddle/fluid/operators/generator/templates/op.c.j2
index f54f91073da..b39bdc663fd 100644
--- a/paddle/fluid/operators/generator/templates/op.c.j2
+++ b/paddle/fluid/operators/generator/templates/op.c.j2
@@ -11,6 +11,7 @@
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/backward.h"
 #include "paddle/phi/infermeta/binary.h"
+#include "paddle/phi/infermeta/fusion.h"
 #include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/ternary.h"
diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml
index 1849b9f6c1e..eb918ce5b10 100644
--- a/paddle/phi/api/yaml/static_ops.yaml
+++ b/paddle/phi/api/yaml/static_ops.yaml
@@ -1,3 +1,12 @@
+- op : fc_xpu
+  args : (Tensor x, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
+  output : Tensor
+  infer_meta :
+    func : FcXPUInferMeta
+  kernel :
+    func : fc_xpu
+  optional : bias
+
 - op : share_buffer
   args : (Tensor[] x, bool[] share_dims_and_dtype={})
   output : Tensor[](out){x.size()}, Tensor[](xout){x.size()}
diff --git a/paddle/phi/backends/xpu/xpu1_op_list.cc b/paddle/phi/backends/xpu/xpu1_op_list.cc
index 0a51baad7cf..6b8f9b47011 100644
--- a/paddle/phi/backends/xpu/xpu1_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu1_op_list.cc
@@ -93,6 +93,7 @@ XPUOpMap& get_kl1_ops() {
                      phi::DataType::BOOL,
                      phi::DataType::FLOAT16,
                      phi::DataType::FLOAT32})},
+      {"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
       {"fill_any_like", XPUKernelSet({phi::DataType::INT64})},
       {"fill_constant",
        XPUKernelSet({phi::DataType::INT32,
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 91c42bb7005..93f431fa056 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -224,6 +224,7 @@ XPUOpMap& get_kl2_ops() {
                      phi::DataType::BOOL,
                      phi::DataType::FLOAT16,
                      phi::DataType::FLOAT32})},
+      {"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
       {"fill",
        XPUKernelSet({phi::DataType::INT64,
                      phi::DataType::INT32,
diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt
index b896bb818fa..658621d441e 100644
--- a/paddle/phi/infermeta/CMakeLists.txt
+++ b/paddle/phi/infermeta/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(
   infermeta
-  SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc
+  SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc fusion.cc
   DEPS convert_utils meta_tensor infermeta_utils)
 cc_library(
   backward_infermeta
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
new file mode 100644
index 00000000000..f7188cdf77e
--- /dev/null
+++ b/paddle/phi/infermeta/fusion.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/infermeta/fusion.h"
+#include <vector>
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/core/meta_tensor.h"
+
+namespace phi {
+
+void FcXPUInferMeta(const MetaTensor& x,
+                    const MetaTensor& w,
+                    const MetaTensor& w_max,
+                    const MetaTensor& bias,
+                    int in_num_col_dims,
+                    bool transpose_x,
+                    float alpha,
+                    float beta,
+                    int act_type,
+                    float act_alpha,
+                    MetaTensor* out) {
+  std::vector<int> out_shape(in_num_col_dims + 1);
+  for (int i = 0; i < in_num_col_dims; i++) {
+    out_shape[i] = x.dims()[i];
+  }
+  out_shape[in_num_col_dims] = w.dims()[0];
+  out->set_dims(DDim(out_shape.data(), out_shape.size()));
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+}
+
+}  // namespace phi
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
new file mode 100644
index 00000000000..ba60dacb340
--- /dev/null
+++ b/paddle/phi/infermeta/fusion.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/phi/common/int_array.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/meta_tensor.h"
+
+namespace phi {
+
+// Common InferMeta Functions for fusion operators.
+// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
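+//
+// Shape rule used by FcXPUInferMeta (see fusion.cc): keep the first
+// in_num_col_dims dims of x and append w's first dim, e.g. x = [2, 3, 4],
+// in_num_col_dims = 2, w = [5, 4] (already transposed to [n, k]) gives
+// out = [2, 3, 5].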
+
+void FcXPUInferMeta(const MetaTensor& x,
+                    const MetaTensor& w,
+                    const MetaTensor& w_max,
+                    const MetaTensor& bias,
+                    int in_num_col_dims,
+                    bool transpose_x,
+                    float alpha,
+                    float beta,
+                    int act_type,
+                    float act_alpha,
+                    MetaTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
new file mode 100644
index 00000000000..fabf0bec6d9
--- /dev/null
+++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+template <typename T, typename Context>
+void FcXPUKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 const DenseTensor& w,
+                 const DenseTensor& w_max,
+                 const paddle::optional<DenseTensor>& bias,
+                 int in_num_col_dims,
+                 bool transpose_x,
+                 float alpha,
+                 float beta,
+                 int act_type,
+                 float act_alpha,
+                 DenseTensor* out) {
+  auto in_mat_dims = flatten_to_2d(x.dims(), in_num_col_dims);
+  int m = in_mat_dims[0];
+  int k = in_mat_dims[1];
+  int n = w.dims()[0];
+  const float* bias_data =
+      bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
+  xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
+  if (act_type == 5) {
+    act.leaky_alpha = act_alpha;
+  } else if (act_type == 15) {
+    act.hard_sigmoid_slope = act_alpha;
+  }
+  ctx.template Alloc<float>(out);
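+  // The fuse pass stores w as an int16 [n, k] matrix (hence w_trans = true
+  // and the int16 TW/TGEMM template arguments below), while x and out stay
+  // fp32; the leading dimensions ldx/ldw/ldy assume row-major storage.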
+  int r = xpu::fc_fusion<float, int16_t, float, int16_t>(  // TX, TW, TY, TGEMM
+      ctx.x_context(),      // ctx
+      x.data<float>(),      // x
+      w.data<int16_t>(),    // w
+      out->data<float>(),   // y
+      m,                    // m
+      n,                    // n
+      k,                    // k
+      transpose_x,          // x_trans
+      true,                 // w_trans
+      nullptr,              // x_maxptr
+      w_max.data<float>(),  // w_maxptr
+      nullptr,              // y_maxptr
+      transpose_x ? m : k,  // ldx
+      k,                    // ldw
+      n,                    // ldy
+      alpha,                // alpha
+      beta,                 // beta
+      bias_data,            // bias
+      act);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu");
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(fc_xpu, XPU, ALL_LAYOUT, phi::fusion::FcXPUKernel, float) {}
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 96a0cfb3fb0..312b55bf5ee 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -93,6 +93,22 @@ if(WITH_MKLDNN)
   endforeach()
 endif()
 
+file(
+  GLOB TEST_XPU_IR_PASSES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_xpu_*.py")
+string(REPLACE ".py" "" TEST_XPU_IR_PASSES "${TEST_XPU_IR_PASSES}")
+foreach(TEST_XPU_IR_PASS ${TEST_XPU_IR_PASSES})
+  list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_XPU_IR_PASS})
+endforeach()
+
+if(WITH_XPU)
+  foreach(target ${TEST_XPU_IR_PASSES})
+    py_test_modules(${target} MODULES ${target})
+    set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER")
+  endforeach()
+endif()
+
 # below are cutlass unitests
 file(
   GLOB TEST_CUTLASS
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index 99450cae46f..3f3af3be444 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -223,6 +223,7 @@ class AutoScanTest(unittest.TestCase):
         passes: Optional[List[str]] = None,
         use_gpu: bool = False,
         use_mkldnn: bool = False,
+        use_xpu: bool = False,
         ir_optim: Optional[bool] = None,
     ):
         config = paddle_infer.Config()
@@ -235,6 +236,8 @@ class AutoScanTest(unittest.TestCase):
             config.enable_use_gpu(100, 0)
         if use_mkldnn:
             config.enable_mkldnn()
+        if use_xpu:
+            config.enable_xpu()
         if passes is not None:
             config.pass_builder().set_passes(passes)
             self.passes = passes
@@ -571,6 +574,8 @@ class PassAutoScanTest(AutoScanTest):
         dic['use_mkldnn'] = enable_mkldnn
         enable_gpu = config.use_gpu()
         dic['use_gpu'] = enable_gpu
+        enable_xpu = config.use_xpu()
+        dic['use_xpu'] = enable_xpu
         if not self.passes:
             dic['passes'] = self.passes
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_fc_xpu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_fc_xpu_fuse_pass.py
new file mode 100644
index 00000000000..2cd0a3f4ad8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_fc_xpu_fuse_pass.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
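+
+# This test builds matmul_v2 (+ elementwise_add, optionally + relu) programs
+# from hypothesis-drawn shapes and checks that fc_xpu_fuse_pass rewrites each
+# into a single fc_xpu op whose outputs match the original program.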
+
+import unittest
+
+import hypothesis.strategies as st
+from auto_scan_test import PassAutoScanTest
+from program_config import OpConfig, ProgramConfig, TensorConfig
+
+
+class TestFcXPUFusePass(PassAutoScanTest):
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_xpu=True)
+        yield config, ["fc_xpu"], (1e-3, 1e-3)
+
+    def sample_program_config(self, draw):
+        # 1. matmul_v2
+        # Generate shape of input:X of matmul_v2
+        x_shape = draw(
+            st.lists(
+                st.integers(min_value=1, max_value=4), min_size=2, max_size=4
+            )
+        )
+        # Generate attr trans_x, trans_y
+        trans_x = False
+        trans_y = draw(st.booleans())
+        # Generate legal shape of input:Y of matmul_v2
+        y_shape = draw(
+            st.lists(
+                st.integers(min_value=1, max_value=8), min_size=2, max_size=2
+            )
+        )
+        if trans_y:
+            y_shape[1] = x_shape[-1]
+        else:
+            y_shape[0] = x_shape[-1]
+        # 2. elementwise_add
+        # Generate legal attr:axis of elementwise_add
+        axis = -1
+        # Generate legal shape of input:Y of elementwise_add
+        bias_shape = [y_shape[0]] if trans_y else [y_shape[1]]
+        # 3. activation
+        # Randomly choose whether to append a relu operator
+        has_relu = draw(st.booleans())
+
+        # Here we compose a program. There is still some risk that the program
+        # is invalid or triggers a bug while running.
+        # Use `is_program_valid` to filter invalid programs before running,
+        # and `add_skip_pass_case` to skip programs known to trigger bugs.
+        matmul_v2_op = OpConfig(
+            "matmul_v2",
+            inputs={"X": ["matmul_v2_x"], "Y": ["matmul_v2_y"]},
+            outputs={"Out": ["matmul_v2_out"]},
+            trans_x=trans_x,
+            trans_y=trans_y,
+        )
+        add_op = OpConfig(
+            "elementwise_add",
+            inputs={"X": ["matmul_v2_out"], "Y": ["bias"]},
+            outputs={"Out": ["add_out"]},
+            axis=axis,
+        )
+        ops = [matmul_v2_op, add_op]
+        if has_relu:
+            relu_op = OpConfig(
+                "relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
+            )
+            ops.append(relu_op)
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={
+                "matmul_v2_y": TensorConfig(shape=y_shape),
+                "bias": TensorConfig(shape=bias_shape),
+            },
+            inputs={
+                "matmul_v2_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
+        return program_config
+
+    def test(self):
+        self.run_and_statis(
+            quant=False, max_examples=25, passes=["fc_xpu_fuse_pass"]
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab