Unverified commit 945f918c, authored by zhupengyang, committed by GitHub

[XPU] add fc_xpu op&pass to optimize ernie model (#50277)

Parent 62fe3cf5
...@@ -210,6 +210,14 @@ if(WITH_IPU)
pass_library(inference_dtype_transfer_pass base DIR ipu)
endif()
if(WITH_XPU)
cc_library(
quant_utils
SRCS xpu/quant_utils.cc
DEPS pass)
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS quant_utils)
endif()
cc_library(
fuse_bn_act_pass
SRCS fuse_bn_act_pass.cc
......
...@@ -96,7 +96,6 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const {
}
}
}
// LOG(INFO) << "--- processed " << num << " nodes";
AddStatis(num);
}
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct FcXPUPattern : public PatternBase {
FcXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& mul_type,
bool with_bias,
const std::string& act_type);
// declare operator node's name
PATTERN_DECL_NODE(mul);
PATTERN_DECL_NODE(add);
PATTERN_DECL_NODE(act);
// declare variable node's name
PATTERN_DECL_NODE(mul_x);
PATTERN_DECL_NODE(mul_w);
PATTERN_DECL_NODE(mul_out);
PATTERN_DECL_NODE(bias);
PATTERN_DECL_NODE(add_out);
PATTERN_DECL_NODE(act_out);
private:
std::string mul_type_;
bool with_bias_{false};
std::string act_type_;
};
FcXPUPattern::FcXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& mul_type,
bool with_bias,
const std::string& act_type)
: PatternBase(pattern, name_scope, name_scope),
mul_type_(mul_type),
with_bias_(with_bias),
act_type_(act_type) {
auto* mul_x = pattern->NewNode(mul_x_repr())
->assert_is_op_input(mul_type_, "X")
->assert_var_not_persistable();
auto* mul_w = pattern->NewNode(mul_w_repr())
->assert_is_op_input(mul_type_, "Y")
->assert_is_persistable_var()
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 2;
});
auto* mul =
pattern->NewNode(mul_repr())
->assert_is_op(mul_type_)
->assert_more([](Node* node) {
auto op_type = node->Op()->Type();
if (op_type == "matmul") {
return !PADDLE_GET_CONST(bool,
node->Op()->GetAttr("transpose_X"));
} else if (op_type == "matmul_v2") {
return !PADDLE_GET_CONST(bool, node->Op()->GetAttr("trans_x"));
} else {
return true;
}
});
auto* mul_out = pattern->NewNode(mul_out_repr())
->assert_is_op_output(mul_type_, "Out")
->assert_var_not_persistable();
mul->LinksFrom({mul_x, mul_w}).LinksTo({mul_out});
PDNode* bias = nullptr;
PDNode* add = nullptr;
PDNode* add_out = nullptr;
PDNode* act = nullptr;
PDNode* act_out = nullptr;
if (with_bias_) {
mul_out->assert_is_op_input("elementwise_add", "X");
bias = pattern->NewNode(bias_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_persistable_var();
add = pattern->NewNode(add_repr())->assert_is_op("elementwise_add");
add_out = pattern->NewNode(add_out_repr())
->assert_is_op_output("elementwise_add", "Out")
->assert_var_not_persistable();
add->LinksFrom({mul_out, bias}).LinksTo({add_out});
} else {
add_out = mul_out;
}
if (!act_type_.empty()) {
add_out->assert_is_op_input(act_type_, "X");
act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
act_out = pattern->NewNode(act_out_repr())
->assert_is_op_output(act_type_, "Out")
->assert_var_not_persistable();
act->LinksFrom({add_out}).LinksTo({act_out});
}
}
} // namespace patterns
/*
1. fuse mul/matmul/matmul_v2 + add + act into fc_xpu
2. add is optional
3. act is optional
Origin subgraph:
mul_x mul_w
\ /
\ /
mul
|
|
mul_out bias
\ /
\ /
elementwise_add
|
|
elementwise_add_out
|
|
act
|
|
act_out
Fused subgraph:
mul_x mul_w bias mul_w_max
\ | / |
\ | / |
\ | / |
fc_xpu-----------
|
|
act_out
*/
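// Example (hedged illustration; the shapes below are hypothetical): with
// mul_x [batch, seq, 768], mul_w [768, 768] and bias [768], a matched
// matmul_v2 + elementwise_add + relu subgraph is replaced by a single fc_xpu
// op taking x, the int16-quantized w, w_max and bias, and writing to act_out.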
class FcXPUFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
void ApplyImpl(ir::Graph* graph,
const std::string& mul_type,
bool with_bias,
const std::string& act_type) const;
const std::string name_scope_{"fc_xpu_fuse_pass"};
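// Integer ids for fc_xpu's "act_type" attribute; the XPU kernel casts them
// directly into xpu::Activation_t, so they must match that enum
// (e.g. 5 = leaky_relu, 15 = hard_sigmoid).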
const std::map<std::string, int> act_map_{{"", 0},
{"relu", 1},
{"sigmoid", 2},
{"tanh", 3},
{"gelu", 4},
{"leaky_relu", 5},
{"hard_swish", 14},
{"hard_sigmoid", 15},
{"relu6", 17}};
};
void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
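// Enumerate every (mul_type, with_bias, act_type) combination; each one
// builds its own pattern and runs a separate detection pass over the graph.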
for (auto mul_type : {"mul", "matmul", "matmul_v2"}) {
for (auto with_bias : {true, false}) {
for (auto act_type : {
"relu",
"gelu",
"",
}) {
ApplyImpl(graph, mul_type, with_bias, act_type);
}
}
}
}
void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
const std::string& mul_type,
bool with_bias,
const std::string& act_type) const {
GraphPatternDetector gpd;
patterns::FcXPUPattern pattern(
gpd.mutable_pattern(), name_scope_, mul_type, with_bias, act_type);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle FcXPUFusePass fuse";
GET_IR_NODE(mul_x);
GET_IR_NODE(mul_w);
GET_IR_NODE(mul);
GET_IR_NODE(mul_out);
GET_IR_NODE(bias);
GET_IR_NODE(add);
GET_IR_NODE(add_out);
GET_IR_NODE(act);
GET_IR_NODE(act_out);
auto* block = mul->Op()->Block();
auto* scope = param_scope();
auto mul_w_name = mul_w->Name();
auto mul_w_tensor =
scope->FindVar(mul_w_name)->GetMutable<phi::DenseTensor>();
// 1. Transform weight to int16/int31
// 2. Avoid transforming repeatedly, because the weight may be shared with other ops.
// TODO(zhupengyang): support int31
std::string mul_w_max_name = mul_w_name + "_max";
Node* mul_w_max = nullptr;
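// If the weight is already INT16, it was quantized when an earlier match
// sharing this weight was fused; the existing "<w_name>_max" var is reused,
// mul_w_max stays nullptr, and SAFE_IR_NODE_LINK_TO skips the link below.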
if (mul_w_tensor->dtype() != phi::DataType::INT16) {
// Create weight_max node
VarDesc mul_w_max_desc(mul_w_max_name);
mul_w_max_desc.SetPersistable(true);
mul_w_max = graph->CreateVarNode(&mul_w_max_desc);
// Create weight_max var/tensor
auto mul_w_max_var = block->Var(mul_w_max_name);
mul_w_max_var->SetPersistable(true);
auto mul_w_max_tensor =
scope->Var(mul_w_max_name)->GetMutable<phi::DenseTensor>();
auto* xpu_ctx = static_cast<phi::XPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::XPUPlace()));
int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
bool transpose_w = false;
if (mul_type == "matmul") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y"));
} else if (mul_type == "matmul_v2") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y"));
}
QuantWeight<int16_t>(
mul_w_tensor, mul_w_max_tensor, !transpose_w, max_ptr_size);
}
// Generate fc_xpu op
framework::OpDesc fc_xpu_op_desc(block);
fc_xpu_op_desc.SetType("fc_xpu");
fc_xpu_op_desc.SetInput("x", {mul_x->Name()});
fc_xpu_op_desc.SetInput("w", {mul_w->Name()});
fc_xpu_op_desc.SetInput("w_max", {mul_w_max_name});
if (bias) {
fc_xpu_op_desc.SetInput("bias", {bias->Name()});
}
fc_xpu_op_desc.SetAttr(
"in_num_col_dims",
static_cast<int>(mul_x->Var()->GetShape().size() - 1));
if (mul_type == "mul") {
fc_xpu_op_desc.SetAttr(
"in_num_col_dims",
PADDLE_GET_CONST(int, mul->Op()->GetAttr("in_num_col_dims")));
}
fc_xpu_op_desc.SetAttr("transpose_x", false);
fc_xpu_op_desc.SetAttr("alpha", 1.f);
fc_xpu_op_desc.SetAttr("beta", 0.f);
if (mul_type == "matmul") {
fc_xpu_op_desc.SetAttr(
"alpha", PADDLE_GET_CONST(float, mul->Op()->GetAttr("alpha")));
fc_xpu_op_desc.SetAttr(
"beta", PADDLE_GET_CONST(float, mul->Op()->GetAttr("beta")));
}
fc_xpu_op_desc.SetAttr("act_type", 0);
fc_xpu_op_desc.SetAttr("act_alpha", 0.f);
if (act) {
fc_xpu_op_desc.SetAttr("act_type", act_map_.at(act_type));
if (act_type == "leaky_relu") {
fc_xpu_op_desc.SetAttr(
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("alpha")));
} else if (act_type == "hard_sigmoid") {
fc_xpu_op_desc.SetAttr(
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope")));
}
}
if (act_out) {
fc_xpu_op_desc.SetOutput("out", {act_out->Name()});
} else if (add_out) {
fc_xpu_op_desc.SetOutput("out", {add_out->Name()});
} else {
fc_xpu_op_desc.SetOutput("out", {mul_out->Name()});
}
auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc);
SAFE_IR_NODE_LINK_TO(mul_x, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w_max, fc_xpu);
SAFE_IR_NODE_LINK_TO(bias, fc_xpu);
if (act_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, act_out);
} else if (add_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, add_out);
} else {
SAFE_IR_NODE_LINK_TO(fc_xpu, mul_out);
}
// delete useless node
std::unordered_set<const Node*> delete_nodes;
if (act != nullptr && add != nullptr) {
delete_nodes = {mul, mul_out, add, add_out, act};
} else if (act) {
delete_nodes = {mul, mul_out, act};
} else if (add) {
delete_nodes = {mul, mul_out, add};
}
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_xpu_fuse_pass, paddle::framework::ir::FcXPUFusePass);
REGISTER_PASS_CAPABILITY(fc_xpu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"fc_xpu", 0));
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node_) SAFE_GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern)
// Get an ir::Node* from the matched subgraph.
// var: variable.
// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
// pat: the pattern object.
#define SAFE_GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat) \
Node* var = nullptr; \
if (pat.arg##_n()) { \
PADDLE_ENFORCE_NE(subgraph.count(pat.arg##_n()), \
0UL, \
platform::errors::NotFound( \
"Node not found for PDNode %s", pat.arg##_repr())); \
var = subgraph.at(pat.arg##_n()); \
PADDLE_ENFORCE_NOT_NULL(var, \
platform::errors::NotFound( \
"node %s not exists in the sub-graph", #arg)); \
}
#define SAFE_IR_NODE_LINK_TO(a, b) \
if (a != nullptr && b != nullptr) { \
IR_NODE_LINK_TO(a, b) \
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
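// Transpose a row-major (h x w) matrix `in` into `out`, which becomes (w x h).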
template <typename T>
static void Transpose(const T* in, T* out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
static float FindMaxAbs(const float* data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
static float IEEECompliance0(float f) {
uint32_t* ptr = reinterpret_cast<uint32_t*>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float* ptr = reinterpret_cast<float*>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float* ptr = reinterpret_cast<float*>(&uf);
return *ptr;
}
}
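// Round to the nearest integer, breaking ties to the even value
// (banker's rounding): e.g. 2.5 -> 2, 3.5 -> 4, -2.5 -> -2.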
static inline long RoundHalfToEven(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
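// Quantize one fp32 value: q = RoundHalfToEven(f * RMAX / max), clamped to
// [-RMAX, RMAX]; NaN/Inf/denormal inputs are first normalized by
// IEEECompliance0.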
template <typename T, int RMAX>
static T Fp32ToIntx(const float f, float max) {
max = IEEECompliance0(max);
float input = IEEECompliance0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t* ptr = reinterpret_cast<uint32_t*>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = IEEECompliance0(tmp);
// Early check to avoid INF or big values getting into the converter function.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)RoundHalfToEven(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
template <typename T>
static void QuantFP32ToIntX(const float* src_ptr,
T* dst_ptr,
float max_val,
int numel) {
LOG(FATAL) << "Not supported.";
}
template <>
void QuantFP32ToIntX<int16_t>(const float* src_ptr,
int16_t* dst_ptr,
float max_val,
int numel) {
for (int i = 0; i < numel; i++) {
dst_ptr[i] = Fp32ToIntx<int16_t, 32767>(src_ptr[i], max_val);
}
}
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size) {
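// Per-tensor quantization: optionally transpose the 2-D weight, take the
// max absolute value as the single scale, replicate it max_ptr_size times
// into weight_max, then overwrite the weight in place with intX data.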
// Transpose
auto* weight_data = weight->data<float>();
auto dims = weight->dims();
auto size = weight->numel();
std::vector<float> transpose_data(weight_data, weight_data + size);
if (transpose) {
PADDLE_ENFORCE_EQ(
dims.size(),
2,
platform::errors::InvalidArgument(
"Only support 2D weight, but received weight rank is [%d].",
dims.size()));
Transpose(weight_data, transpose_data.data(), dims[0], dims[1]);
weight->Resize({dims[1], dims[0]});
}
weight_data = transpose_data.data();
// Find max
float max_val = FindMaxAbs(weight_data, size);
std::vector<float> max_vec(max_ptr_size, max_val);
weight_max->set_type(paddle::experimental::CppTypeToDataType<float>::Type());
weight_max->Resize({max_ptr_size});
auto* dev_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
memcpy(dev_ctx->Alloc<float>(weight_max),
max_vec.data(),
max_ptr_size * sizeof(float));
// Quant
std::vector<T> quant_data(size);
QuantFP32ToIntX(weight_data, quant_data.data(), max_val, size);
weight->set_type(paddle::experimental::CppTypeToDataType<T>::Type());
memcpy(dev_ctx->Alloc<T>(weight), quant_data.data(), size * sizeof(T));
}
template void QuantWeight<int16_t>(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
// 1. Quantize the weight from fp32 to int16/int31.
// 2. The weight data is updated in place.
// 3. Generate the weight max tensor.
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -513,6 +513,19 @@ void CpuPassStrategy::EraseFcMkldnnPasses() {
}
}
XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"delete_dropout_op_pass",
// "multi_encoder_xpu_fuse_pass",
// "embedding_with_eltwise_add_xpu_fuse_pass",
"fc_xpu_fuse_pass",
// "multi_encoder_slice_link_xpu_fuse_pass",
// "generate_sequence_xpu_fuse_pass",
// "link_previous_out_max_xpu_pass",
});
use_xpu_ = true;
}
IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) {
passes_.assign({"inference_process_pass"});
}
......
...@@ -290,7 +290,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// mode.
class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
public:
XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
XpuPassStrategy();
};
/// \class NpuPassStrategy
......
...@@ -11,6 +11,7 @@
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/fusion.h"
#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/nullary.h"
#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/infermeta/ternary.h"
......
- op : fc_xpu
args : (Tensor x, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
output : Tensor
infer_meta :
func : FcXPUInferMeta
kernel :
func : fc_xpu
optional : bias
- op : share_buffer
args : (Tensor[] x, bool[] share_dims_and_dtype={})
output : Tensor[](out){x.size()}, Tensor[](xout){x.size()}
......
...@@ -93,6 +93,7 @@ XPUOpMap& get_kl1_ops() {
phi::DataType::BOOL,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"fill_any_like", XPUKernelSet({phi::DataType::INT64})}, {"fill_any_like", XPUKernelSet({phi::DataType::INT64})},
{"fill_constant", {"fill_constant",
XPUKernelSet({phi::DataType::INT32, XPUKernelSet({phi::DataType::INT32,
......
...@@ -224,6 +224,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::BOOL,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"fill", {"fill",
XPUKernelSet({phi::DataType::INT64, XPUKernelSet({phi::DataType::INT64,
phi::DataType::INT32, phi::DataType::INT32,
......
cc_library(
infermeta
SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc
SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc fusion.cc
DEPS convert_utils meta_tensor infermeta_utils)
cc_library(
backward_infermeta
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/fusion.h"
#include <vector>
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
void FcXPUInferMeta(const MetaTensor& x,
const MetaTensor& w,
const MetaTensor& w_max,
const MetaTensor& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
MetaTensor* out) {
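// The output keeps the first in_num_col_dims dims of x and appends
// w.dims()[0]; the fused weight is stored as [n, k], so dim 0 is the output
// width. E.g. x [batch, seq, k] with in_num_col_dims = 2 and w [n, k]
// gives out [batch, seq, n].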
std::vector<int> out_shape(in_num_col_dims + 1);
for (int i = 0; i < in_num_col_dims; i++) {
out_shape[i] = x.dims()[i];
}
out_shape[in_num_col_dims] = w.dims()[0];
out->set_dims(DDim(out_shape.data(), out_shape.size()));
out->set_dtype(x.dtype());
out->set_layout(x.layout());
}
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
// Common InferMeta Functions for fusion operators.
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void FcXPUInferMeta(const MetaTensor& x,
const MetaTensor& w,
const MetaTensor& w_max,
const MetaTensor& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void FcXPUKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& w,
const DenseTensor& w_max,
const paddle::optional<DenseTensor>& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
DenseTensor* out) {
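// x is flattened to [m, k] using in_num_col_dims; fc_xpu_fuse_pass has
// already quantized the weight to int16 and laid it out as [n, k], hence
// w_trans = true and ldw = k in the fc_fusion call below.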
auto in_mat_dims = flatten_to_2d(x.dims(), in_num_col_dims);
int m = in_mat_dims[0];
int k = in_mat_dims[1];
int n = w.dims()[0];
const float* bias_data =
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<T>();
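// act_type / act_alpha are produced by act_map_ in fc_xpu_fuse_pass:
// 5 carries the leaky_relu alpha, 15 the hard_sigmoid slope.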
xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
if (act_type == 5) {
act.leaky_alpha = act_alpha;
} else if (act_type == 15) {
act.hard_sigmoid_slope = act_alpha;
}
ctx.template Alloc<T>(out);
int r = xpu::fc_fusion<T, int16_t, T, int16_t>( // TX, TW, TY, TGEMM
ctx.x_context(), // ctx
x.data<T>(), // x
w.data<int16_t>(), // w
out->data<T>(), // y
m, // m
n, // n
k, // k
transpose_x, // x_trans
true, // w_trans
nullptr, // x_maxptr
w_max.data<float>(), // w_maxptr
nullptr, // y_maxptr
transpose_x ? m : k, // ldx
k, // ldw
n, // ldy
alpha, // alpha
beta, // beta
bias_data, // bias
act);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(fc_xpu, XPU, ALL_LAYOUT, phi::fusion::FcXPUKernel, float) {}
...@@ -93,6 +93,22 @@ if(WITH_MKLDNN)
endforeach()
endif()
file(
GLOB TEST_XPU_IR_PASSES
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"test_xpu_*.py")
string(REPLACE ".py" "" TEST_XPU_IR_PASSES "${TEST_XPU_IR_PASSES}")
foreach(TEST_XPU_IR_PASS ${TEST_XPU_IR_PASSES})
list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_XPU_IR_PASS})
endforeach()
if(WITH_XPU)
foreach(target ${TEST_XPU_IR_PASSES})
py_test_modules(${target} MODULES ${target})
set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER")
endforeach()
endif()
# below are cutlass unitests
file(
GLOB TEST_CUTLASS
......
...@@ -223,6 +223,7 @@ class AutoScanTest(unittest.TestCase):
passes: Optional[List[str]] = None,
use_gpu: bool = False,
use_mkldnn: bool = False,
use_xpu: bool = False,
ir_optim: Optional[bool] = None,
):
config = paddle_infer.Config()
...@@ -235,6 +236,8 @@ class AutoScanTest(unittest.TestCase):
config.enable_use_gpu(100, 0)
if use_mkldnn:
config.enable_mkldnn()
if use_xpu:
config.enable_xpu()
if passes is not None:
config.pass_builder().set_passes(passes)
self.passes = passes
...@@ -571,6 +574,8 @@ class PassAutoScanTest(AutoScanTest):
dic['use_mkldnn'] = enable_mkldnn
enable_gpu = config.use_gpu()
dic['use_gpu'] = enable_gpu
enable_xpu = config.use_xpu()
dic['use_xpu'] = enable_xpu
if not self.passes:
dic['passes'] = self.passes
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import hypothesis.strategies as st
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestFcXPUFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["fc_xpu"], (1e-3, 1e-3)
def sample_program_config(self, draw):
# 1. matmul_v2
# Generate shape of input:X of matmul_v2
x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=4), min_size=2, max_size=4
)
)
# Generate attr trans_x, trans_y
trans_x = False
trans_y = draw(st.booleans())
# Generate legal shape of input:Y of matmul_v2
y_shape = draw(
st.lists(
st.integers(min_value=1, max_value=8), min_size=2, max_size=2
)
)
if trans_y:
y_shape[1] = x_shape[-1]
else:
y_shape[0] = x_shape[-1]
# 2. elementwise_add
# Generate legal attr:axis of elementwise_add
axis = -1
# Generate legal shape of input:Y of elementwise_add
bias_shape = [y_shape[0]] if trans_y else [y_shape[1]]
# 3. activation
# Random choose if add a relu operator
has_relu = draw(st.booleans())
# Here we will compose a program
# There is still some risk that the program is invalid or causes a bug while running
# Use function `is_program_valid` to filter out invalid programs before running
# Use function `add_skip_pass_case` to ignore programs even if they cause a bug while running
matmul_v2_op = OpConfig(
"matmul_v2",
inputs={"X": ["matmul_v2_x"], "Y": ["matmul_v2_y"]},
outputs={"Out": ["matmul_v2_out"]},
trans_x=trans_x,
trans_y=trans_y,
)
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["matmul_v2_out"], "Y": ["bias"]},
outputs={"Out": ["add_out"]},
axis=axis,
)
ops = [matmul_v2_op, add_op]
if has_relu:
relu_op = OpConfig(
"relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
)
ops.append(relu_op)
program_config = ProgramConfig(
ops=ops,
weights={
"matmul_v2_y": TensorConfig(shape=y_shape),
"bias": TensorConfig(shape=bias_shape),
},
inputs={
"matmul_v2_x": TensorConfig(shape=x_shape),
},
outputs=ops[-1].outputs["Out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False, max_examples=25, passes=["fc_xpu_fuse_pass"]
)
if __name__ == "__main__":
unittest.main()
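A rough usage sketch (not part of this commit; the model path is a placeholder) showing how the new pass can be driven through the inference config APIs touched above:

import paddle.inference as paddle_infer

config = paddle_infer.Config("./ernie_model")  # hypothetical model directory
config.enable_xpu()
config.pass_builder().set_passes(["fc_xpu_fuse_pass"])
predictor = paddle_infer.create_predictor(config)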