Unverified commit 0a2f9036, authored by H hong, committed by GitHub

Merge branch 'develop' into add_some_yaml_config

......@@ -213,7 +213,8 @@ def ParseYamlArgs(string):
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
assert arg_type in yaml_types_mapping.keys(), arg_type
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
......@@ -248,7 +249,8 @@ def ParseYamlReturns(string):
else:
ret_type = ret.strip()
assert ret_type in yaml_types_mapping.keys(), ret_type
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type
......
......@@ -25,7 +25,7 @@ atype_to_parsing_function = {
"std::string": "CastPyArg2String",
"int64_t": "CastPyArg2Long",
"float": "CastPyArg2Float",
"string": "CastPyArg2String",
"std::string": "CastPyArg2String",
"std::vector<bool>": "CastPyArg2Booleans",
"std::vector<int>": "CastPyArg2Ints",
"std::vector<long>": "CastPyArg2Longs",
......
......@@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() {
return output_var;
}
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add");
x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
auto out_var = pattern->NewNode(elementwise_add_out_repr())
PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var,
const std::string elementwise_type) {
auto elementwise_op =
pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type);
x_var->AsInput()->assert_is_op_input(elementwise_type, "X");
y_var->AsInput()->assert_is_op_input(elementwise_type, "Y");
auto out_var = pattern->NewNode(elementwise_out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add", "Out");
->assert_is_op_output(elementwise_type, "Out");
elementwise_add_op->LinksFrom({x_var, y_var});
elementwise_add_op->LinksTo({out_var});
elementwise_op->LinksFrom({x_var, y_var});
elementwise_op->LinksTo({out_var});
return out_var;
}
......
......@@ -1016,20 +1016,20 @@ struct Pool : public PatternBase {
PATTERN_DECL_NODE(pool_output);
};
// ElementwiseAdd used in residual connections.
// y_var is used and convolution output.
// The operator is removed, when residual
// connection fusion is on.
struct ElementwiseAdd : public PatternBase {
ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add") {}
PDNode* operator()(PDNode* x_var, PDNode* y_var);
PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_x);
PATTERN_DECL_NODE(elementwise_add_y);
PATTERN_DECL_NODE(elementwise_add_out);
// Elementwise ops
// Forward pass for element-wise operators (add, mul)
// elementwise_out is the result of the operator
struct Elementwise : public PatternBase {
Elementwise(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise") {}
PDNode* operator()(PDNode* x_var, PDNode* y_var,
const std::string elementwise_type);
PATTERN_DECL_NODE(elementwise_op);
PATTERN_DECL_NODE(elementwise_x);
PATTERN_DECL_NODE(elementwise_y);
PATTERN_DECL_NODE(elementwise_out);
};
// Transpose op
......
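The two hunks above generalize the oneDNN ElementwiseAdd pattern into an Elementwise pattern that receives the operator type at call time. A minimal, self-contained C++ sketch of the same parameterization idea follows (toy ElementwiseMatcherSketch class and names are assumptions for illustration only; this is not the Paddle PDPattern API):

#include <iostream>
#include <string>

// Toy stand-in for a graph pattern: the element-wise op type is a runtime
// parameter instead of a hard-coded "elementwise_add" string.
class ElementwiseMatcherSketch {
 public:
  void operator()(const std::string& x_var, const std::string& y_var,
                  const std::string& elementwise_type) const {
    std::cout << x_var << " -> " << elementwise_type << "(X)\n";
    std::cout << y_var << " -> " << elementwise_type << "(Y)\n";
    std::cout << elementwise_type << " -> Out\n";
  }
};

int main() {
  ElementwiseMatcherSketch matcher;
  matcher("conv_output", "residual_data", "elementwise_add");
  matcher("conv_output", "residual_data", "elementwise_mul");
}

With the op type passed as an argument, one pattern class can back both the elementwise_add fuse/quantize passes shown later in the diff and the newly supported elementwise_mul case.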
......@@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
conv_output,
pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
patterns::Elementwise elementwise_pattern{pattern, name_scope};
elementwise_pattern(
conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
"elementwise_add");
conv_output->AsIntermediate();
int found_conv_as_x_count = 0;
......@@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
elementwise_pattern);
if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return;
if (!IsReachable(g, elementwise_add_identity, conv_output)) return;
if (!IsReachable(g, elementwise_identity, conv_output)) return;
if (HasFusedActivation(conv_op)) return;
......@@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
return;
}
conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()});
conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
conv_op->Op()->SetAttr("fuse_residual_connection", true);
GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
GraphSafeRemoveNodes(g, {conv_output, elementwise_op});
IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
IR_NODE_LINK_TO(conv_op, elementwise_add_out);
IR_NODE_LINK_TO(elementwise_identity, conv_op);
IR_NODE_LINK_TO(conv_op, elementwise_out);
found_conv_as_x_count++;
};
......@@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
conv_output);
patterns::Elementwise elementwise_pattern{pattern, name_scope};
elementwise_pattern(
pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output,
"elementwise_add");
conv_output->AsIntermediate();
int found_conv_as_y_count = 0;
......@@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
elementwise_pattern);
if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return;
if (!IsReachable(g, elementwise_add_x, conv_output)) return;
if (!IsReachable(g, elementwise_x, conv_output)) return;
if (HasFusedActivation(conv_op)) return;
......@@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
return;
}
conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()});
conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()});
conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
conv_op->Op()->SetAttr("fuse_residual_connection", true);
GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
GraphSafeRemoveNodes(g, {conv_output, elementwise_op});
IR_NODE_LINK_TO(elementwise_add_x, conv_op);
IR_NODE_LINK_TO(conv_op, elementwise_add_out);
IR_NODE_LINK_TO(elementwise_x, conv_op);
IR_NODE_LINK_TO(conv_op, elementwise_out);
found_conv_as_y_count++;
};
......@@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
patterns::Conv conv_y_pattern{pattern, name_scope};
auto conv_y_output = conv_y_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(conv_x_output, conv_y_output);
patterns::Elementwise elementwise_pattern{pattern, name_scope};
elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add");
conv_x_output->AsIntermediate();
conv_y_output->AsIntermediate();
......@@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
elementwise_pattern);
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
......@@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
return;
}
if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return;
if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return;
if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return;
if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return;
Node* projection_node;
Node* residual_conv_op;
......@@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
if (HasFusedActivation(residual_conv_op)) return;
residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op});
GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op});
IR_NODE_LINK_TO(projection_node, residual_conv_op);
IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
IR_NODE_LINK_TO(residual_conv_op, elementwise_out);
found_projection_conv_count++;
};
......
......@@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count);
}
void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
void CPUQuantizePass::QuantizeElementwise(
Graph* graph, const std::string elementwise_type) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
patterns::Elementwise elementwise_pattern{pattern, name_scope_};
elementwise_add_pattern(
pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
elementwise_pattern(
pattern->NewNode(elementwise_pattern.elementwise_x_repr()),
pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
elementwise_type);
int quantize_elementwise_add_count = 0;
int quantize_elementwise_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize elementwise_add op";
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
VLOG(4) << "Quantize " + elementwise_type + " op";
GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
elementwise_pattern);
// skip if should not be quantized
if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) {
LogQuantizationDisabled(elementwise_add_op);
if (!platform::HasOpINT8DataType(elementwise_op->Op())) {
LogQuantizationDisabled(elementwise_op);
return;
}
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y,
elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
elementwise_pattern);
if (!AreScalesPresentForNodes(
{elementwise_add_x, elementwise_add_y, elementwise_add_out})) {
LogCannotQuantizeOp(elementwise_add_op,
{elementwise_x, elementwise_y, elementwise_out})) {
LogCannotQuantizeOp(elementwise_op,
"No scale available for the operator");
return;
}
bool is_x_unsigned{false}, is_y_unsigned{false};
auto input_x_scale =
GetScaleValueForNode(elementwise_add_x, &is_x_unsigned);
auto input_y_scale =
GetScaleValueForNode(elementwise_add_y, &is_y_unsigned);
auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned);
auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned);
// TODO(sfraczek): add support for different signedness
if (is_x_unsigned != is_y_unsigned) {
LogCannotQuantizeOp(elementwise_add_op,
"ElementwiseAdd inputs must be of the same type.");
LogCannotQuantizeOp(elementwise_op,
"Elementwise inputs must be of the same type.");
return;
}
QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale,
QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale,
is_x_unsigned, "Scale_x");
QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale,
QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale,
is_y_unsigned, "Scale_y");
bool is_output_unsigned{false};
auto output_scale =
GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
GetScaleValueForNode(elementwise_out, &is_output_unsigned);
DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out",
output_scale, is_output_unsigned, "Scale_out");
DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale,
is_output_unsigned, "Scale_out");
++quantize_elementwise_add_count;
++quantize_elementwise_count;
};
gpd(graph, handler);
AddStatis(quantize_elementwise_add_count);
AddStatis(quantize_elementwise_count);
PrettyLogDetail("--- quantized %d elementwise_add ops",
quantize_elementwise_add_count);
PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count,
elementwise_type);
}
void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
......@@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
QuantizeFc(graph);
QuantizeReshape(graph);
QuantizeMatmul(graph);
QuantizeElementwiseAdd(graph);
QuantizeElementwise(graph, "elementwise_add");
QuantizeElementwise(graph, "elementwise_mul");
QuantizeFusionGru(graph);
QuantizeMultiGru(graph);
QuantizeFusionLSTM(graph);
......
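The hunk above folds QuantizeElementwiseAdd into a QuantizeElementwise routine parameterized by the op type, and ApplyImpl now calls it once per supported operator. A hedged, standalone sketch of that dispatch pattern (hypothetical QuantizeElementwiseSketch function, not the real pass method):

#include <iostream>
#include <string>
#include <vector>

// Stand-in for the parameterized pass method: the op type string decides
// which element-wise operators get quantized in this invocation.
void QuantizeElementwiseSketch(const std::string& elementwise_type) {
  std::cout << "--- quantizing " << elementwise_type << " ops\n";
}

int main() {
  // Mirrors the ApplyImpl change: one call per supported element-wise op.
  const std::vector<std::string> types = {"elementwise_add",
                                          "elementwise_mul"};
  for (const auto& type : types) QuantizeElementwiseSketch(type);
}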
......@@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase {
void QuantizeTranspose(Graph* graph) const;
void QuantizeReshape(Graph* graph) const;
void QuantizeMatmul(Graph* graph) const;
void QuantizeElementwiseAdd(Graph* graph) const;
void QuantizeElementwise(Graph* graph,
const std::string elementwise_type) const;
void QuantizeFusionGru(Graph* graph) const;
void QuantizeMultiGru(Graph* graph) const;
void QuantizeFusionLSTM(Graph* graph) const;
......
......@@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetAttr("Scale_x", 1.0f);
op->SetAttr("Scale_y", 1.0f);
op->SetAttr("Scale_out", 1.0f);
} else if (type == "elementwise_add") {
} else if (type == "elementwise_add" || type == "elementwise_mul") {
op->SetInput("X", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", {outputs[0]});
......@@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) {
scale);
scale_names.push_back("Scale_in");
scale_names.push_back("Scale_out");
} else if (type == "matmul" || type == "elementwise_add") {
} else if (type == "matmul" || type == "elementwise_add" ||
type == "elementwise_mul") {
scale_names.push_back("Scale_x");
scale_names.push_back("Scale_y");
scale_names.push_back("Scale_out");
......@@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) {
expected_operators, added_nodes, 1.0f);
}
static const std::initializer_list<std::string> variable_names_elementwise_add =
{"a", "b", "c", "d", "e", "f"};
static const std::initializer_list<std::string> variable_names_elementwise = {
"a", "b", "c", "d", "e", "f"};
ProgramDesc BuildProgramDescElementwiseAdd() {
ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type,
const std::string elementwise_name) {
ProgramDesc prog;
for (auto& v : variable_names_elementwise_add) {
for (auto& v : variable_names_elementwise) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true,
"int8");
SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
return prog;
}
TEST(CpuQuantizePass, elementwise_add) {
void TestElementwise(const std::string elementwise_type,
const std::string elementwise_name) {
// 2 Quant + 2 IN + 1 DeQuant + 1 OUT
int added_nodes = 6;
std::unordered_map<std::string, int> expected_operators = {
{"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}};
MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
expected_operators, added_nodes, SCALE * S8_MAX);
{elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}};
MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
variable_names_elementwise, expected_operators, added_nodes,
SCALE * S8_MAX);
}
TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
void TestElementwiseOutputScaleMissing(const std::string elementwise_type,
const std::string elementwise_name) {
int added_nodes = 0;
std::unordered_map<std::string, int> expected_operators = {
{"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}};
MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
expected_operators, added_nodes, 1.f, 1.f, "e");
{elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}};
MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
variable_names_elementwise, expected_operators, added_nodes, 1.f,
1.f, "e");
}
TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type,
const std::string elementwise_name) {
int added_nodes = 0;
std::unordered_map<std::string, int> expected_operators = {
{"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}};
MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
expected_operators, added_nodes, 1.f, 1.f, "", "b");
{elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}};
MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
variable_names_elementwise, expected_operators, added_nodes, 1.f,
1.f, "", "b");
}
TEST(CpuQuantizePass, elementwise_add) {
TestElementwise("elementwise_add", "ElementwiseAdd");
}
TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd");
}
TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd");
}
TEST(CpuQuantizePass, elementwise_mul) {
TestElementwise("elementwise_mul", "ElementwiseMul");
}
TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) {
TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul");
}
TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) {
TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul");
}
const std::vector<std::string> churn_out_vars(ProgramDesc* prog,
......
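The test changes above replace duplicated per-op TEST bodies with helpers (TestElementwise, TestElementwiseOutputScaleMissing, TestElementwiseUnsignedAndSignedInput) that take the op type and name, so elementwise_mul coverage is added with a few one-line TEST cases. A small GoogleTest-style sketch of that structure (hypothetical RunElementwiseCase helper; the real helpers build a ProgramDesc and run CPUQuantizePass):

#include <string>
#include "gtest/gtest.h"

// Parameterized helper shared by the add and mul cases; here it only checks
// the parameter, while the real helper runs the pass and counts operators.
void RunElementwiseCase(const std::string& elementwise_type) {
  EXPECT_TRUE(elementwise_type == "elementwise_add" ||
              elementwise_type == "elementwise_mul");
}

TEST(CpuQuantizePassSketch, ElementwiseAdd) {
  RunElementwiseCase("elementwise_add");
}

TEST(CpuQuantizePassSketch, ElementwiseMul) {
  RunElementwiseCase("elementwise_mul");
}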
......@@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Marks operators which are to be quantized.";
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>(
{"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc",
"matmul", "nearest_interp", "nearest_interp_v2", "pool2d",
"prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm",
"multi_gru", "slice"});
{"concat", "conv2d", "depthwise_conv2d", "elementwise_add",
"elementwise_mul", "fc", "matmul", "nearest_interp",
"nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2",
"fusion_gru", "fusion_lstm", "multi_gru", "slice"});
const auto& excluded_ids_list =
Get<std::unordered_set<int>>("quantize_excluded_op_ids");
const auto& op_types_list =
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
namespace paddle {
namespace operators {
......@@ -1148,63 +1150,221 @@ REGISTER_OP_CUDA_KERNEL(
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
#ifdef PADDLE_WITH_XPU_KP
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \
grad_functor) \
REGISTER_OP_KERNEL( \
act_type, KP, plat::XPUPlace, \
ops::ActivationCudaKernel<plat::XPUDeviceContext, ops::functor<float>>); \
REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \
ops::ActivationGradCudaKernel<plat::XPUDeviceContext, \
ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
CudaLeakyReluGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
CudaReciprocalGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
CudaSoftplusGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor,
CudaHardSwishGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor,
CudaCELUGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
CudaSqrtGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor,
CudaSquareGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor,
CudaSiluGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
CudaLogSigmoidGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor,
CudaSoftShrinkGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor,
CudaZeroGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor,
CudaZeroGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor,
CudaLog1pGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor,
CudaBReluGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
CudaSoftReluGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
CudaSoftsignGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor,
CudaRelu6GradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
CudaHardShrinkGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid,
CudaHardSigmoidFunctor,
CudaHardSigmoidGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor,
CudaSwishGradFunctor);
REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu,
CudaThresholdedReluFunctor,
CudaThresholdedReluGradFunctor);
REGISTER_OP_KERNEL(
brelu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaBReluFunctor<float>>);
REGISTER_OP_KERNEL(
brelu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaBReluGradFunctor<float>>);
REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaCeilFunctor<float>>);
REGISTER_OP_KERNEL(
ceil_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaZeroGradFunctor<float>>);
REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaCELUFunctor<float>>);
REGISTER_OP_KERNEL(
celu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaCELUGradFunctor<float>>);
REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaELUFunctor<float>>);
REGISTER_OP_KERNEL(
elu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaELUGradFunctor<float>>);
REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaExpFunctor<float>>);
REGISTER_OP_KERNEL(
exp_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaExpGradFunctor<float>>);
REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaFloorFunctor<float>>);
REGISTER_OP_KERNEL(
floor_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaZeroGradFunctor<float>>);
REGISTER_OP_KERNEL(
hard_shrink, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
hard_shrink_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardShrinkGradFunctor<float>>);
REGISTER_OP_KERNEL(
hard_sigmoid, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
hard_sigmoid_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSwishFunctor<float>>);
REGISTER_OP_KERNEL(
hard_swish_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSwishGradFunctor<float>>);
REGISTER_OP_KERNEL(
leaky_relu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaLeakyReluFunctor<float>>);
REGISTER_OP_KERNEL(
leaky_relu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaLeakyReluGradFunctor<float>>);
REGISTER_OP_KERNEL(log, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogFunctor<float>>);
REGISTER_OP_KERNEL(
log_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogGradFunctor<float>>);
REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLog1pFunctor<float>>);
REGISTER_OP_KERNEL(
log1p_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLog1pGradFunctor<float>>);
REGISTER_OP_KERNEL(
logsigmoid, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
logsigmoid_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(
reciprocal, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaReciprocalFunctor<float>>);
REGISTER_OP_KERNEL(
reciprocal_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaReciprocalGradFunctor<float>>);
REGISTER_OP_KERNEL(
relu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaReluFunctor<float>>);
REGISTER_OP_KERNEL(
relu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaReluGradFunctor<float>>);
REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaRelu6Functor<float>>);
REGISTER_OP_KERNEL(
relu6_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaRelu6GradFunctor<float>>);
REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
sigmoid_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSiluFunctor<float>>);
REGISTER_OP_KERNEL(
silu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSiluGradFunctor<float>>);
REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftReluFunctor<float>>);
REGISTER_OP_KERNEL(
soft_relu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftReluGradFunctor<float>>);
REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftplusFunctor<float>>);
REGISTER_OP_KERNEL(
softplus_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftplusGradFunctor<float>>);
REGISTER_OP_KERNEL(
softshrink, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
softshrink_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftShrinkGradFunctor<float>>);
REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftsignFunctor<float>>);
REGISTER_OP_KERNEL(
softsign_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftsignGradFunctor<float>>);
REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSqrtFunctor<float>>);
REGISTER_OP_KERNEL(
sqrt_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSqrtGradFunctor<float>>);
REGISTER_OP_KERNEL(square, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSquareFunctor<float>>);
REGISTER_OP_KERNEL(
square_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSquareGradFunctor<float>>);
REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSwishFunctor<float>>);
REGISTER_OP_KERNEL(
swish_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSwishGradFunctor<float>>);
REGISTER_OP_KERNEL(
thresholded_relu, KP, plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaThresholdedReluFunctor<float>>);
REGISTER_OP_KERNEL(
thresholded_relu_grad, KP, plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaThresholdedReluGradFunctor<float>>);
#endif // PADDLE_WITH_XPU_KP
......@@ -13,6 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/determinant_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -20,11 +24,6 @@ namespace operators {
class DeterminantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant");
}
};
class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input",
"DeterminantGradOp");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
framework::GradVarName("Out"), "DeterminantGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output",
framework::GradVarName("Input"), "DeterminantGradOp");
ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("Input"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -162,11 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker,
ops::DeterminantGradOpMaker<paddle::framework::OpDesc>,
ops::DeterminantGradOpMaker<paddle::imperative::OpBase>);
ops::DeterminantGradOpMaker<paddle::imperative::OpBase>,
DeterminantInferShapeFunctor);
REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp)
DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp,
DeterminantGradInferShapeFunctor);
REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp,
ops::SlogDeterminantOpMaker,
......
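The determinant hunks above delete the hand-written InferShape methods and instead register shape inference through DECLARE_INFER_SHAPE_FUNCTOR with phi::UnchangedInferMeta and phi::GeneralUnaryGradInferMeta. A toy sketch of what such an "unchanged" infer-meta rule expresses (ToyMetaTensor and UnchangedInferMetaSketch are assumed names, not the phi API):

#include <cstdint>
#include <iostream>
#include <vector>

struct ToyMetaTensor {
  std::vector<int64_t> dims;
};

// Analogue of an "unchanged" infer-meta rule: shape inference is a reusable
// free function that copies the input's dims to the output, rather than a
// hand-written InferShape method inside each operator class.
void UnchangedInferMetaSketch(const ToyMetaTensor& x, ToyMetaTensor* out) {
  out->dims = x.dims;
}

int main() {
  ToyMetaTensor input{{4, 4}};
  ToyMetaTensor output;
  UnchangedInferMetaSketch(input, &output);
  for (int64_t d : output.dims) std::cout << d << ' ';
  std::cout << '\n';
}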
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,100 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace paddle {
namespace framework {
class ExecutionContext;
} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
template <typename T>
class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& onednn_engine = dev_ctx.GetEngine();
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto tz = phi::vectorize<int64_t>(dout->dims());
memory::data_type dout_type = framework::ToMKLDNNDataType(
framework::TransToProtoVarType(dout->dtype()));
platform::ReorderMKLDNNHandler handler(
tz, framework::TransToProtoVarType(dout->dtype()), dout_type,
onednn_engine);
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
auto reorder_src_memory_p = handler.AcquireSrcMemory(
dout->format(), platform::to_void_cast(dout->data<T>()));
if (dx) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
astream.wait();
dx->set_layout(DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
}
if (dy) {
// Direct copy
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p,
*reorder_dst_memory_p);
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);
dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
} else {
// Broadcasting
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
{DNNL_ARG_DST, *dy_memory_p}});
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);
dy->set_format(
platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(
......@@ -116,6 +24,8 @@ REGISTER_OP_KERNEL(
ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_add>,
ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_add>)
REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseAddMKLDNNGradKernel<paddle::platform::bfloat16>,
ops::EltwiseAddMKLDNNGradKernel<float>)
REGISTER_OP_KERNEL(
elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_add>,
ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_add>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace paddle {
namespace framework {
class ExecutionContext;
} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
template <typename T>
class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* y = ctx.Input<framework::Tensor>("Y");
auto* out = ctx.Input<framework::Tensor>("Out");
auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
if (dx) {
// dx = dout / y
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(),
dout, y, dx, 1.0f, 1.0f, 1.0f);
const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
const auto dst_dx_memory = handler.AcquireDstMemory(dx);
const auto binary_prim = handler.AcquireForwardPrimitive();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_dx_memory}};
binary_prim->execute(astream, args);
astream.wait();
dx->set_layout(framework::DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
}
if (dy) {
// dy = -dout * out / y
platform::BinaryMKLDNNHandler<T> y_handler(
dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y,
y, nullptr, 1.0f, 1.0f, 1.0f);
const auto y_memory = y_handler.AcquireSrcMemory(y);
dnnl::post_ops po;
po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc());
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, out, nullptr, -1.0f, 1.0f, 1.0f, po);
const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_out_memory = handler.AcquireSecondSrcMemory(out);
// If broadcasting is in use then let's write to temporary
// buffer allocated by oneDNN
const auto dst_dy_memory = (dout->dims() == dy->dims())
? handler.AcquireDstMemory(dy)
: handler.AcquireDstMemory();
const auto binary_prim = handler.AcquireForwardPrimitive();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_out_memory},
{DNNL_ARG_DST, *dst_dy_memory},
{DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}};
binary_prim->execute(astream, args);
astream.wait();
dy->set_layout(framework::DataLayout::kMKLDNN);
// Reduction is needed for broadcasting scenario
if (dout->dims() != dy->dims()) {
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
// As source we use mem object with results from binary operation
reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
{DNNL_ARG_DST, *dy_memory_p}});
astream.wait();
dy->set_format(
platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
} else {
dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// TODO(piotrekobi) add int8, uint8 support
REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_div>)
REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
ops::EltwiseDivMKLDNNGradKernel<paddle::platform::bfloat16>,
ops::EltwiseDivMKLDNNGradKernel<float>)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_div>)
REGISTER_OP_KERNEL(
elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_div>,
ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_div>)
......@@ -15,20 +15,35 @@
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using framework::Tensor;
using dnnl::memory;
using dnnl::primitive;
using dnnl::stream;
using framework::DataLayout;
using framework::Tensor;
inline std::vector<int64_t> CalculateBroadcastedDims(const Tensor* x,
const Tensor* y) {
const auto src_tz = phi::vectorize(x->dims());
const auto dst_tz = phi::vectorize(y->dims());
size_t j = 0;
std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
for (size_t i = 0; i < src_tz.size(); ++i) {
dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
if (j == dst_tz.size()) break;
}
return dst_tz_ex;
}
template <typename T, dnnl::algorithm BINARY_OP>
class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
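CalculateBroadcastedDims, moved above the kernels in the hunk starting at @@ -15,20 +15,35 @@, expands the second tensor's shape to the first tensor's rank, inserting 1 wherever the dimensions do not line up; the result is used as the reduction target shape in the broadcast grad paths further down. A standalone copy of that logic with a worked example (plain std::vector in place of Tensor dims):

#include <cstdint>
#include <iostream>
#include <vector>

// Same algorithm as CalculateBroadcastedDims, operating directly on dim
// vectors: walk the larger shape and keep a dim only where it matches the
// next unconsumed dim of the smaller shape, otherwise emit 1.
std::vector<int64_t> BroadcastedDims(const std::vector<int64_t>& src_tz,
                                     const std::vector<int64_t>& dst_tz) {
  size_t j = 0;
  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
  for (size_t i = 0; i < src_tz.size(); ++i) {
    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
    if (j == dst_tz.size()) break;
  }
  return dst_tz_ex;
}

int main() {
  // dout of shape [2, 3, 4, 5] and dy of shape [3, 5] give [1, 3, 1, 5]:
  // dy's gradient is obtained by reducing dout over dims 0 and 2.
  for (int64_t d : BroadcastedDims({2, 3, 4, 5}, {3, 5})) std::cout << d << ' ';
  std::cout << '\n';
}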
......@@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
// operation.
const bool reuse_x_memopry =
x->numel() == z->numel() && x->IsSharedBufferWith(*z);
std::shared_ptr<dnnl::memory> dst_memory = nullptr;
std::shared_ptr<dnnl::memory> dst_memory;
if (reuse_x_memopry) {
dst_memory = src_x_memory;
// NOTE(chenfeiyu): when the output reuses memory from other tensor rather
......@@ -135,19 +150,193 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
}
};
inline std::vector<int64_t> CalculateBroadcastedDims(const Tensor* x,
const Tensor* y) {
const auto src_tz = phi::vectorize(x->dims());
const auto dst_tz = phi::vectorize(y->dims());
template <typename T, dnnl::algorithm BINARY_OP>
class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
size_t j = 0;
std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
for (size_t i = 0; i < src_tz.size(); ++i) {
dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
if (j == dst_tz.size()) break;
}
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& onednn_engine = dev_ctx.GetEngine();
return dst_tz_ex;
}
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
auto tz = phi::vectorize<int64_t>(dout->dims());
auto proto_type_dout = framework::TransToProtoVarType(dout->dtype());
platform::ReorderMKLDNNHandler reorder_handler(
tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout),
onednn_engine);
auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
dout->format(), platform::to_void_cast(dout->data<T>()));
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
if (dx) {
std::shared_ptr<dnnl::memory> dst_memory;
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
BINARY_OP == dnnl::algorithm::binary_sub) {
dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(),
ctx.GetPlace());
auto reorder_p =
reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory);
}
// elementwise_mul & elementwise_div
else {
platform::BinaryMKLDNNHandler<T> binary_handler(
BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f,
1.0f, 1.0f);
const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout);
const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y);
dst_memory = binary_handler.AcquireDstMemory(dx);
const auto binary_prim = binary_handler.AcquireForwardPrimitive();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
binary_prim->execute(astream, args);
}
astream.wait();
dx->set_layout(framework::DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
if (dy) {
dnnl::primitive_attr broadcast_reduction_attr;
std::shared_ptr<dnnl::memory> broadcast_src_memory;
std::shared_ptr<dnnl::memory> dst_memory;
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
BINARY_OP == dnnl::algorithm::binary_sub) {
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
dy, dout->format(), ctx.GetPlace());
dnnl::primitive_attr reorder_attr;
std::vector<float> scales(1);
scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1;
reorder_attr.set_output_scales(0, scales);
auto reorder_p = std::make_shared<dnnl::reorder>(
*(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p,
*reorder_dst_memory_p);
dst_memory = reorder_dst_memory_p;
} else {
broadcast_src_memory = reorder_src_memory_p;
}
}
// elementwise_mul & elementwise_div
else {
std::unordered_map<int, dnnl::memory> args;
std::shared_ptr<dnnl::binary> binary_prim;
std::shared_ptr<dnnl::memory> post_op_memory;
std::shared_ptr<dnnl::memory> src_0_memory;
std::shared_ptr<dnnl::memory> src_1_memory;
platform::BinaryMKLDNNHandler<T> binary_handler(
dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(),
dout, x, nullptr, 1.0f, 1.0f, 1.0f);
src_1_memory = binary_handler.AcquireSecondSrcMemory(x);
if (BINARY_OP == dnnl::algorithm::binary_div) {
platform::BinaryMKLDNNHandler<T> post_op_binary_handler(
dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(),
y, y, nullptr, 1.0f, 1.0f, 1.0f);
post_op_memory = post_op_binary_handler.AcquireSrcMemory(y);
dnnl::post_ops po;
po.append_binary(dnnl::algorithm::binary_div,
post_op_memory->get_desc());
binary_handler = platform::BinaryMKLDNNHandler<T>(
dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(),
dout, out, nullptr, -1.0f, 1.0f, 1.0f, po);
src_1_memory = binary_handler.AcquireSecondSrcMemory(out);
}
src_0_memory = binary_handler.AcquireSrcMemory(dout);
const auto dst_dy_memory = (dout->dims() == dy->dims())
? binary_handler.AcquireDstMemory(dy)
: binary_handler.AcquireDstMemory();
binary_prim = binary_handler.AcquireForwardPrimitive();
args = {{DNNL_ARG_SRC_0, *src_0_memory},
{DNNL_ARG_SRC_1, *src_1_memory},
{DNNL_ARG_DST, *dst_dy_memory}};
if (BINARY_OP == dnnl::algorithm::binary_div)
args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
*post_op_memory});
binary_prim->execute(astream, args);
broadcast_src_memory = dst_dy_memory;
dst_memory = dst_dy_memory;
}
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);
if (dout->dims() != dy->dims()) {
// Broadcasting
if (BINARY_OP == dnnl::algorithm::binary_sub) {
dnnl::post_ops po;
po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0);
broadcast_reduction_attr.set_post_ops(po);
}
platform::ReductionMKLDNNHandler<T> reduction_handler(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy),
broadcast_reduction_attr);
dst_memory = reduction_handler.AcquireDstMemory(dy);
auto reduction_p = reduction_handler.AcquireForwardPrimitive();
reduction_p->execute(astream, {
{DNNL_ARG_SRC, *broadcast_src_memory},
{DNNL_ARG_DST, *dst_memory},
});
astream.wait();
dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
} else {
dy->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
}
}
};
} // namespace operators
} // namespace paddle
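The header above also introduces EltwiseMKLDNNGradKernel<T, BINARY_OP>, a single grad kernel templated on the dnnl algorithm that replaces the separate add/sub/mul/div grad kernels removed elsewhere in this commit: add and sub take a reorder (copy or negated copy) path for dX and dY, mul and div build binary primitives, and both paths fall back to a reduction when dY needs broadcasting. A compilable toy sketch of that compile-time dispatch (illustrative enum and class names, not the dnnl types):

#include <iostream>

enum class BinaryAlgo { kAdd, kSub, kMul, kDiv };

// One grad kernel template instead of one class per element-wise op; the
// branch on the template parameter is fixed per instantiation.
template <typename T, BinaryAlgo kAlgo>
struct EltwiseGradKernelSketch {
  void Compute() const {
    if (kAlgo == BinaryAlgo::kAdd || kAlgo == BinaryAlgo::kSub) {
      std::cout << "reorder path: dX = dOut, dY = (+/-)dOut, reduced if "
                   "broadcast\n";
    } else {
      std::cout << "binary path: dX, dY via mul/div primitives, reduced if "
                   "broadcast\n";
    }
  }
};

int main() {
  EltwiseGradKernelSketch<float, BinaryAlgo::kAdd>{}.Compute();
  EltwiseGradKernelSketch<float, BinaryAlgo::kDiv>{}.Compute();
}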
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace paddle {
namespace framework {
class ExecutionContext;
} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
template <typename T>
class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* x = ctx.Input<framework::Tensor>("X");
auto* y = ctx.Input<framework::Tensor>("Y");
auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
if (dx) {
// dx = dout*y
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, y, dx, 1.0f, 1.0f, 1.0f);
const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
const auto dst_dx_memory = handler.AcquireDstMemory(dx);
const auto binary_prim = handler.AcquireForwardPrimitive();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_dx_memory}};
binary_prim->execute(astream, args);
astream.wait();
dx->set_layout(framework::DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
}
if (dy) {
// dy = dout*x
// Handler is having nullptr passed instead of output tensor as
// we want Dst buffer to be allocated by oneDNN not to use Tensor
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, x, nullptr, 1.0f, 1.0f, 1.0f);
const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
// If broadcasting is in use then let's write to temporary
// buffer allocated by oneDNN
const auto dst_dy_memory = (dout->dims() == dy->dims())
? handler.AcquireDstMemory(dy)
: handler.AcquireDstMemory();
const auto binary_prim = handler.AcquireForwardPrimitive();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_x_memory},
{DNNL_ARG_DST, *dst_dy_memory}};
binary_prim->execute(astream, args);
astream.wait();
dy->set_layout(framework::DataLayout::kMKLDNN);
// Reduction is needed for broadcasting scenario
if (dout->dims() != dy->dims()) {
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
// As the source we use the memory object holding the binary operation results
reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
{DNNL_ARG_DST, *dy_memory_p}});
astream.wait();
dy->set_format(
platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
} else {
dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
}
}
}
};
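// A minimal worked example of the gradient formulas used above, assuming
// y was broadcast over the rows of x in the forward pass (z = x * y):
//   x    = [[1, 2, 3], [4, 5, 6]]        shape [2, 3]
//   y    = [10, 20, 30]                  shape [3]
//   dout = [[1, 1, 1], [1, 1, 1]]
//   dx   = dout * y = [[10, 20, 30], [10, 20, 30]]
//   dy   = reduce_sum(dout * x, axis=0) = [5, 7, 9]
// The reduction step corresponds to the ReductionMKLDNNHandler branch that
// is taken when dout->dims() != dy->dims().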
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(
......@@ -132,6 +24,8 @@ REGISTER_OP_KERNEL(
ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_mul>,
ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_mul>)
REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseMulMKLDNNGradKernel<paddle::platform::bfloat16>,
ops::EltwiseMulMKLDNNGradKernel<float>)
REGISTER_OP_KERNEL(
elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_mul>,
ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_mul>)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,113 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace paddle {
namespace framework {
class ExecutionContext;
} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
template <typename T>
class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& onednn_engine = dev_ctx.GetEngine();
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto tz = phi::vectorize<int64_t>(dout->dims());
memory::data_type dout_type = framework::ToMKLDNNDataType(
framework::TransToProtoVarType(dout->dtype()));
platform::ReorderMKLDNNHandler handler(
tz, framework::TransToProtoVarType(dout->dtype()), dout_type,
onednn_engine);
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
auto reorder_src_memory_p = handler.AcquireSrcMemory(
dout->format(), platform::to_void_cast(dout->data<T>()));
if (dx) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
astream.wait();
dx->set_layout(DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
}
if (dy) {
// Direct copy
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
dnnl::primitive_attr reorder_attr;
std::vector<float> scales = {-1};
reorder_attr.set_output_scales(0, scales);
auto reorder_p = std::make_shared<dnnl::reorder>(
*(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr);
platform::RecordEvent record_reorder(
"int_reorder", platform::TracerEventType::UserDefined, 2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p,
*reorder_dst_memory_p);
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);
dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
} else {
// Broadcasting
dnnl::post_ops po;
po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0);
dnnl::primitive_attr attr;
attr.set_post_ops(po);
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr);
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
reduction_p->execute(astream, {
{DNNL_ARG_SRC, *reorder_src_memory_p},
{DNNL_ARG_DST, *dy_memory_p},
});
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);
dy->set_format(
platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
}
}
}
};
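// A minimal worked example of the reorders above, assuming y was broadcast
// in the forward pass (z = x - y):
//   dout = [[1, 2], [3, 4]]
//   dx   = dout = [[1, 2], [3, 4]]                 (plain reorder)
//   dy   = -reduce_sum(dout, axis=0) = [-4, -6]    (reduction_sum with an
//                                                   eltwise_linear(-1.0)
//                                                   post-op)
// When dout and dy have the same dims, dy is instead produced by a reorder
// with output scale -1.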
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
......@@ -131,6 +24,8 @@ REGISTER_OP_KERNEL(
ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_sub>,
ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_sub>)
REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseSubMKLDNNGradKernel<paddle::platform::bfloat16>,
ops::EltwiseSubMKLDNNGradKernel<float>)
REGISTER_OP_KERNEL(
elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
dnnl::algorithm::binary_sub>,
ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_sub>)
......@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mode_op.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode");
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ(
(axis < dim_size) && (axis >= (-1 * dim_size)), true,
paddle::platform::errors::InvalidArgument(
"the axis of ModeOp must be [-%d, %d), but you set axis is %d",
dim_size, dim_size, axis));
PADDLE_ENFORCE_GE(input_dims.size(), 1,
paddle::platform::errors::InvalidArgument(
"input of ModeOp must have >= 1d shape"));
if (axis < 0) axis += dim_size;
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
framework::DDim dims = phi::make_ddim(dimvec);
PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
"input shape should >= 1d"));
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor,
PD_INFER_META(phi::ModeInferMeta));
REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker,
ops::ModeGradOpMaker<paddle::framework::OpDesc>,
ops::ModeGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(mode,
ops::ModeCPUKernel<paddle::platform::CPUPlace, float>,
ops::ModeCPUKernel<paddle::platform::CPUPlace, double>,
ops::ModeCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::ModeCPUKernel<paddle::platform::CPUPlace, int64_t>);
ops::ModeGradOpMaker<paddle::imperative::OpBase>,
ModeInferShapeFunctor);
REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad);
REGISTER_OP_CPU_KERNEL(
mode_grad, ops::ModeGradCPUKernel<paddle::platform::CPUPlace, float>,
ops::ModeGradCPUKernel<paddle::platform::CPUPlace, double>,
ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mode_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
namespace paddle {
namespace operators {
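// Chooses the CUDA block size from the number of columns along the reduced
// axis, e.g. col = 96 -> 128 threads, col = 300 -> 512, col > 512 -> 1024.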
int ComputeBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
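// Computes the mode of every row by sorting. A sketch of the per-row steps,
// assuming the row occupies [begin, end):
//   1. sort the values together with their original indices (sort_by_key);
//   2. count distinct adjacent values (inner_product with not_equal_to) to
//      obtain the number of unique keys;
//   3. reduce_by_key with a constant iterator of 1 yields the frequency of
//      each unique value;
//   4. max_element over the frequencies selects the mode, and find() plus the
//      mode's count locates the original index of its last sorted occurrence.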
template <typename T>
void getModebySort(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor,
const int64_t num_cols, const int64_t num_rows,
T* out_tensor, int64_t* indices_tensor) {
framework::Tensor input_tmp;
framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp);
T* input_tmp_data = input_tmp.mutable_data<T>(ctx.GetPlace());
input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
thrust::device_ptr<T> out_tensor_ptr(out_tensor);
thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
for (int64_t i = 0; i < num_rows; ++i) {
T* begin = input_tmp_data + num_cols * i;
T* end = input_tmp_data + num_cols * (i + 1);
thrust::device_vector<int64_t> indices_data(num_cols);
thrust::sequence(thrust::device, indices_data.begin(),
indices_data.begin() + num_cols);
thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
begin + 1, 0, thrust::plus<int>(),
thrust::not_equal_to<T>());
thrust::device_vector<T> keys_data(unique);
thrust::device_vector<int64_t> cnts_data(unique);
thrust::reduce_by_key(thrust::device, begin, end,
thrust::constant_iterator<int>(1), keys_data.begin(),
cnts_data.begin());
auto it = thrust::max_element(thrust::device, cnts_data.begin(),
cnts_data.begin() + unique);
T mode = keys_data[it - cnts_data.begin()];
int64_t counts = cnts_data[it - cnts_data.begin()];
auto pos = thrust::find(thrust::device, begin, end, mode);
int64_t index = indices_data[pos - begin + counts - 1];
out_tensor_ptr[i] = static_cast<T>(mode);
indices_tensor_ptr[i] = static_cast<int64_t>(index);
}
}
template <typename DeviceContext, typename T>
class ModeOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* input = ctx.Input<framework::Tensor>("X");
auto* output = ctx.Output<framework::Tensor>("Out");
auto* indices = ctx.Output<framework::Tensor>("Indices");
int axis = static_cast<int>(ctx.Attr<int>("axis"));
bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
// get the input dims
const auto& in_dims = input->dims();
// calculate the real axis
if (axis < 0) axis += in_dims.size();
auto out_dims = output->dims();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
getModebySort<T>(dev_ctx, input, input_width, input_height, output_data,
indices_data);
} else {
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dim);
indices->Resize(tmp_out_dim);
}
framework::DDim trans_shape(in_dims);
framework::DDim trans_out_shape(in_dims);
for (int i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = in_dims[trans_axis[i]];
trans_out_shape[i] = in_dims[trans_axis[i]];
}
trans_out_shape[in_dims.size() - 1] = 1;
// second step, transpose the input
framework::Tensor trans_input;
trans_input.mutable_data<T>(trans_shape, ctx.GetPlace());
int ndims = trans_axis.size();
const auto& dev_ctx = ctx.cuda_device_context();
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
&trans_input, trans_axis);
framework::Tensor trans_ind;
int64_t* trans_ind_data =
trans_ind.mutable_data<int64_t>(trans_out_shape, ctx.GetPlace());
framework::Tensor trans_out;
T* trans_out_data =
trans_out.mutable_data<T>(trans_out_shape, ctx.GetPlace());
const int64_t input_height =
phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
const int64_t input_width = trans_shape[trans_shape.size() - 1];
getModebySort<T>(dev_ctx, &trans_input, input_width, input_height,
trans_out_data, trans_ind_data);
// last step, transpose back the indices and output
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans_axis);
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
output, trans_axis);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class ModeOpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument(
"It must use CUDAPlace, you must check your device set."));
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = context.Attr<int>("axis");
const auto& in_dims = x->dims();
auto out_dims = indices->dims();
if (axis < 0) axis += in_dims.size();
// allocate the cuda memory for the x_grad
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
int pre, n, post;
GetDims(in_dims, axis, &pre, &n, &post);
// calculate the block and grid num
auto& dev_ctx = context.cuda_device_context();
int block_size = ComputeBlockSize(post);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
mode, ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
mode_grad,
ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
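// CPU reference path for the row-wise mode: each row is expanded into
// (value, index) pairs, sorted with NaNs ordered last, then scanned once
// while tracking the longest run of equal values. A sketch for one row,
// assuming the input [2, 2, 1, 3]:
//   sorted pairs: (1,2) (2,0) (2,1) (3,3)   run lengths: 1, 2, 1
//   mode = 2, index = the original position of the run's last element after
//   sorting (which of the two 2s that is depends on the sort order of equal
//   keys, since std::sort is not stable).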
template <typename T, typename Type>
static void getMode(Type input_height, Type input_width, int input_dim,
const framework::Tensor* input, T* t_out, Type* t_indices) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
std::sort(col_vec.begin(), col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
T mode = 0;
int64_t indice = 0;
int64_t cur_freq = 0;
int64_t max_freq = 0;
for (int64_t i = 0; i < input_width; ++i) {
++cur_freq;
if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
if (cur_freq > max_freq) {
max_freq = cur_freq;
mode = col_vec[i].first;
indice = col_vec[i].second;
}
cur_freq = 0;
}
}
t_out[i] = mode;
t_indices[i] = indice;
}
}
template <typename T, typename Type>
static void ModeAssign(const Type& input_height, const Type& input_width,
const int& input_dim, const framework::Tensor* input,
const framework::Tensor* indices, T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = framework::EigenVector<T>::Flatten(*input);
auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices =
framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
template <typename DeviceContext, typename T>
class ModeCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<framework::Tensor>("X");
auto* output = context.Output<framework::Tensor>("Out");
auto* indices = context.Output<framework::Tensor>("Indices");
const auto& in_dims = input->dims();
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
// axis < 0, cacluate the real axis
int axis = static_cast<int>(context.Attr<int>("axis"));
if (axis < 0) axis += in_dims.size();
T* output_data = output->mutable_data<T>(context.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
auto out_dims = output->dims();
// if axis is not the last dim, transpose it to the last dim, do the
// calculation,
// then transpose it back to the original axis.
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
getMode<T, int64_t>(input_height, input_width, in_dims.size(), input,
output_data, indices_data);
} else {
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.push_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
output->Resize(tmp_out_dim);
indices->Resize(tmp_out_dim);
}
// get the trans input_dims, out_dims
framework::DDim trans_shape(in_dims);
framework::DDim trans_out_shape(in_dims);
for (size_t i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = in_dims[trans_axis[i]];
trans_out_shape[i] = in_dims[trans_axis[i]];
}
trans_out_shape[in_dims.size() - 1] = 1;
framework::Tensor trans_input;
trans_input.mutable_data<T>(trans_shape, context.GetPlace());
int ndims = trans_axis.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
// transpose the input value
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
&trans_input, trans_axis);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
const int64_t input_width = trans_shape[trans_shape.size() - 1];
framework::Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_out_shape, context.GetPlace());
framework::Tensor tmp_indices;
auto* t_ind = tmp_indices.mutable_data<int64_t>(trans_out_shape,
context.GetPlace());
getMode<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_input, t_out, t_ind);
// transpose back
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, tmp_indices, indices, trans_axis);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
output, trans_axis);
if (!keepdim) {
output->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
};
template <typename DeviceContext, typename T>
class ModeGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::Tensor>("X");
auto* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<framework::Tensor>("Indices");
auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
int axis = static_cast<int>(context.Attr<int>("axis"));
bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
auto in_dims = x->dims();
auto out_dims = indices->dims();
// axis < 0, get the real axis
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
if (axis == in_dims.size() - 1) {
// allocate the memory for the input_grad
// assign the out_grad to input_grad directly
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
// init the output grad with 0, because some input elements have no grad
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
// Assign the output_grad to input_grad
if (keepdim) {
ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices,
x_grad_data);
} else {
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
framework::Tensor out_grad_tmp;
framework::Tensor indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
&indices_tmp, x_grad_data);
}
} else {
// cannot assign grad to input_grad, must do the transpose
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
framework::DDim trans_shape(out_dims);
framework::DDim trans_in_shape(in_dims);
for (size_t i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = out_dims[trans_axis[i]];
trans_in_shape[i] = in_dims[trans_axis[i]];
}
// transpose the out_grad, indices
framework::Tensor trans_dO;
trans_dO.mutable_data<T>(trans_shape, context.GetPlace());
framework::Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_shape, context.GetPlace());
int ndims = trans_axis.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
if (keepdim) {
// Do transpose
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, *out_grad, &trans_dO, trans_axis);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, *indices, &trans_ind, trans_axis);
} else {
framework::Tensor out_grad_tmp;
framework::Tensor indices_tmp;
out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
indices_tmp.mutable_data<int64_t>(indices->dims(),
dev_context.GetPlace());
framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
&out_grad_tmp);
framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
&indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
// Do transpose
TransCompute<platform::CPUDeviceContext, T>(
ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, indices_tmp, &trans_ind, trans_axis);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
// Assign the out_grad to the transposed input_grad
framework::Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_in_shape, context.GetPlace());
memset(t_out, 0, x_grad->numel() * sizeof(T));
ModeAssign<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_dO, &trans_ind, t_out);
// Transpose back
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
x_grad, trans_axis);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
return LoDTensorType::get(
parser.getContext(), shape, elementType, lod_level);
}
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
if (keyword == "dense_tensor") {
// parse DenseTensor, for example: !i=Infrt.tensor<X86, CUDA, F32>
llvm::StringRef target;
......@@ -158,6 +161,10 @@ void InfrtDialect::printType(::mlir::Type type,
<< lod_tensor_type.getLod_level() << ">";
return;
}
if (type.isa<infrt::DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<DenseTensorType>()) {
......
......@@ -13,15 +13,17 @@
// limitations under the License.
#include "paddle/infrt/host_context/paddle_mlir.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd_ops_info.h"
MLIRModelGenImpl::MLIRModelGenImpl()
: context_(infrt::Global::getMLIRContext()), builder_(context_) {
context_->allowUnregisteredDialects();
context_->getOrLoadDialect<mlir::StandardOpsDialect>();
context_->getOrLoadDialect<infrt::ts::TensorShapeDialect>();
context_->getOrLoadDialect<infrt::dt::DTDialect>();
context_->getOrLoadDialect<mlir::pd::PaddleDialect>();
context_->getOrLoadDialect<::infrt::InfrtDialect>();
module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_));
}
......@@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel(
UpdateModelParams(program, &mainFunc);
UpdateModelOps(program);
UpdateModelOutputs(program);
return module_;
}
......@@ -171,7 +172,11 @@ void MLIRModelGenImpl::UpdateModelParams(
ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(),
builder_,
&precision_);
mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_);
mlir::Type type_ =
infrt::DenseTensorType::get(context_,
infrt::TargetType::CPU,
infrt::PrecisionType::FLOAT32,
infrt::LayoutType::NCHW);
auto op = builder_.create<infrt::dt::TensorMapGetTensorOp>(
mlir::UnknownLoc::get(context_), type_, map, name);
params_map_.insert(std::pair<std::string, mlir::Value>(
......@@ -197,8 +202,9 @@ void MLIRModelGenImpl::UpdateModelOutputs(
llvm::SmallVector<mlir::Type, 4> resultTypes;
llvm::SmallVector<mlir::NamedAttribute, 4> attrs;
mlir::OperationState state(loc,
mlir::ReturnOp::getOperationName(),
::infrt::ReturnOp::getOperationName(),
operands,
resultTypes,
attrs);
......@@ -321,7 +327,7 @@ llvm::SmallVector<mlir::NamedAttribute, 4> MLIRModelGenImpl::GetOpAttributes(
switch (type) {
ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr);
ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr);
ATTR_IMPL_CASE(INT, i, getI32IntegerAttr);
ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr);
ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr);
ATTR_IMPL_CASE(STRING, s, getStringAttr);
......
cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS})
configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py")
add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\""
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.nn import Layer
from paddle.static import InputSpec
from paddle.jit import to_static
import sys
class AbsNet(paddle.nn.Layer):
def __init__(self):
super(AbsNet, self).__init__()
def forward(self, x):
x = paddle.abs(x)
return x
if __name__ == '__main__':
# build network
model = AbsNet()
# save inferencing format model
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 1, 28, 28], name='x')])
paddle.jit.save(net, sys.argv[1])
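# A usage sketch (output prefix chosen here for illustration): running this
# script with the argument `abs` makes paddle.jit.save emit abs.pdmodel and
# abs.pdiparams, the file names the test_abs model test loads as
# ./abs.pdmodel and ./abs.pdiparams.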
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <llvm/Support/CommandLine.h>
#include <mlir/Pass/PassManager.h>
#include <iostream>
#include <string>
#include "llvm/Support/DynamicLibrary.h"
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/mlir_loader.h"
#include "paddle/infrt/host_context/core_runtime.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
#include "paddle/infrt/kernel/basic_kernels.h"
#include "paddle/infrt/kernel/control_flow_kernels.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#include "paddle/infrt/kernel/tensor_kernels.h"
#include "paddle/infrt/kernel/tensor_shape_kernels.h"
#include "paddle/infrt/kernel/test_kernels.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/host_context/paddle_mlir.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
static llvm::cl::list<std::string> cl_shared_libs( // NOLINT
"shared_libs",
llvm::cl::desc("Specify shared library with kernels."),
llvm::cl::ZeroOrMore,
llvm::cl::MiscFlags::CommaSeparated);
TEST(ABS_MODEL, convert_and_execute) {
std::string model_file_name = "./abs.pdmodel";
std::string params_file_name = "./abs.pdiparams";
// convert model
MLIRModelGenImpl myGen;
auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name);
module_.dump();
// pick kernel
mlir::MLIRContext* context = infrt::Global::getMLIRContext();
context->allowUnregisteredDialects();
context->getOrLoadDialect<mlir::StandardOpsDialect>();
context->getOrLoadDialect<infrt::InfrtDialect>();
context->getOrLoadDialect<infrt::ts::TensorShapeDialect>();
context->getOrLoadDialect<infrt::InfrtDialect>();
context->getOrLoadDialect<infrt::dt::DTDialect>();
context->getOrLoadDialect<mlir::pd::PaddleDialect>();
context->getOrLoadDialect<infrt::phi::PHIDenseTensorDialect>();
context->getOrLoadDialect<infrt::phi::PHICPUKernelDialect>();
context->getOrLoadDialect<infrt::phi::PHIGPUKernelDialect>();
context->getOrLoadDialect<infrt::phi::PHIDialect>();
context->loadAllAvailableDialects();
mlir::PassManager pm(context);
mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>();
std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
infrt::PrecisionType::FLOAT32,
infrt::LayoutType::NCHW}};
phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places));
phi_pass_manager.addPass(infrt::createInfrtOpFusePass());
if (mlir::failed(pm.run(module_))) {
std::cout << "\npass failed!\n" << std::endl;
}
module_.dump();
// execute
infrt::host_context::KernelRegistry registry;
infrt::kernel::RegisterBasicKernels(&registry);
infrt::kernel::RegisterTestKernels(&registry);
infrt::kernel::RegisterTensorShapeKernels(&registry);
infrt::kernel::RegisterTensorKernels(&registry);
infrt::kernel::RegisterControlFlowKernels(&registry);
infrt::kernel::RegisterPhiKernels(&registry);
infrt::kernel::RegisterInferShapeLaunchers(&registry);
// load extra shared library
for (const auto& lib_path : cl_shared_libs) {
std::string err;
llvm::sys::DynamicLibrary dynLib =
llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err);
if (!dynLib.isValid()) {
llvm::errs() << "Load shared library failed. Error: " << err << "\n";
break;
}
if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) {
auto reg_func =
reinterpret_cast<void (*)(infrt::host_context::KernelRegistry*)>(
reg_sym);
reg_func(&registry);
} else {
llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path
<< "\". Skip.\n";
}
}
infrt::host_context::TestMlir(module_, &registry);
}
......@@ -648,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
mask->set_dtype(paddle::experimental::CppTypeToDataType<int>::Type());
}
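// Infers the shapes of Out/Indices for the mode op: the reduced axis is
// either dropped or kept as size 1. For example, x with dims [2, 3, 4] and
// axis = 1 gives dims [2, 4] when keepdim is false and [2, 1, 4] when
// keepdim is true.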
void ModeInferMeta(const MetaTensor& x,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices) {
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
PADDLE_ENFORCE_EQ(
(axis < dim_size) && (axis >= (-1 * dim_size)),
true,
errors::InvalidArgument(
"the axis of ModeOp must be [-%d, %d), but you set axis is %d",
dim_size,
dim_size,
axis));
PADDLE_ENFORCE_GE(
input_dims.size(),
1,
errors::InvalidArgument("input of ModeOp must have >= 1d shape"));
if (axis < 0) axis += dim_size;
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
if (keepdim) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
DDim dims = phi::make_ddim(dimvec);
PADDLE_ENFORCE_GE(input_dims.size(),
1,
errors::InvalidArgument("input shape should >= 1d"));
out->set_dims(dims);
out->share_lod(x);
out->set_dtype(x.dtype());
indices->set_dims(dims);
indices->share_lod(x);
indices->set_dtype(x.dtype());
}
void MultinomialInferMeta(const MetaTensor& x,
int num_samples,
bool replacement,
......
......@@ -112,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
MetaTensor* mask,
MetaConfig config = MetaConfig());
void ModeInferMeta(const MetaTensor& x,
int axis,
bool keepdim,
MetaTensor* out,
MetaTensor* indices);
void MultinomialInferMeta(const MetaTensor& x,
int num_samples,
bool replacement,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/mode_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
template <typename T, typename Context>
void ModeGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& indices,
const DenseTensor& out_grad,
int axis,
bool keepdim,
DenseTensor* x_grad) {
auto in_dims = x.dims();
auto out_dims = indices.dims();
// axis < 0, get the real axis
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(out_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(out_dims[i - 1]);
}
out_dims = phi::make_ddim(tmp_out_shape);
}
T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
if (axis == in_dims.size() - 1) {
// allocate the memory for the input_grad
// assign the out_grad to input_grad directly
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
// init the output grad with 0, because some input elements have no grad
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
// Assign the output_grad to input_grad
if (keepdim) {
funcs::ModeAssign(input_height,
input_width,
in_dims.size(),
&out_grad,
&indices,
x_grad_data);
} else {
DenseTensor out_grad_tmp;
dev_ctx.template Alloc<T>(&out_grad_tmp);
DenseTensor indices_tmp;
dev_ctx.template Alloc<int64_t>(&indices_tmp);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
funcs::ModeAssign(input_height,
input_width,
in_dims.size(),
&out_grad_tmp,
&indices_tmp,
x_grad_data);
}
} else {
// cannot assign grad to input_grad, must do the transpose
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
DDim trans_shape(out_dims);
DDim trans_in_shape(in_dims);
for (size_t i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = out_dims[trans_axis[i]];
trans_in_shape[i] = in_dims[trans_axis[i]];
}
// transpose the out_grad, indices
DenseTensor trans_dO;
trans_dO.Resize(trans_shape);
dev_ctx.template Alloc<T>(&trans_dO);
DenseTensor trans_ind;
trans_ind.Resize(trans_shape);
dev_ctx.template Alloc<int64_t>(&trans_ind);
int ndims = trans_axis.size();
if (keepdim) {
// Do transpose
funcs::TransCompute<CPUContext, T>(
ndims, dev_ctx, out_grad, &trans_dO, trans_axis);
funcs::TransCompute<CPUContext, int64_t>(
ndims, dev_ctx, indices, &trans_ind, trans_axis);
} else {
DenseTensor out_grad_tmp;
dev_ctx.template Alloc<T>(&out_grad_tmp);
DenseTensor indices_tmp;
dev_ctx.template Alloc<int64_t>(&indices_tmp);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
out_grad_tmp.Resize(out_dims);
indices_tmp.Resize(out_dims);
// Do transpose
funcs::TransCompute<CPUContext, T>(
ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis);
funcs::TransCompute<CPUContext, int64_t>(
ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis);
}
const int64_t input_height = phi::product(
phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
// Assign the out_grad to the transposed input_grad
DenseTensor tmp_out;
tmp_out.Resize(trans_in_shape);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
memset(t_out, 0, x_grad->numel() * sizeof(T));
funcs::ModeAssign<T, int64_t>(input_height,
input_width,
in_dims.size(),
&trans_dO,
&trans_ind,
t_out);
// Transpose back
funcs::TransCompute<CPUContext, T>(
ndims, dev_ctx, tmp_out, x_grad, trans_axis);
}
}
} // namespace phi
PD_REGISTER_KERNEL(mode_grad,
CPU,
ALL_LAYOUT,
phi::ModeGradKernel,
float,
double,
int32_t,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/mode_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
template <typename T, typename Context>
void ModeKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
bool keepdim,
DenseTensor* out,
DenseTensor* indices) {
const auto& in_dims = x.dims();
auto out_dims = out->dims();
// axis < 0, cacluate the real axis
if (axis < 0) axis += in_dims.size();
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
// if axis is not the last dim, transpose it to the last dim, do the
// calculation, then transpose it back to the original axis.
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
funcs::GetMode<T, int64_t>(input_height,
input_width,
in_dims.size(),
&x,
output_data,
indices_data);
} else {
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.push_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
out->Resize(tmp_out_dim);
indices->Resize(tmp_out_dim);
}
// get the trans input_dims, out_dims
DDim trans_shape(in_dims);
DDim trans_out_shape(in_dims);
for (size_t i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = in_dims[trans_axis[i]];
trans_out_shape[i] = in_dims[trans_axis[i]];
}
trans_out_shape[in_dims.size() - 1] = 1;
DenseTensor trans_input;
trans_input.Resize(trans_shape);
dev_ctx.template Alloc<T>(&trans_input);
int ndims = trans_axis.size();
// transpose the input value
funcs::TransCompute<CPUContext, T>(
ndims, dev_ctx, x, &trans_input, trans_axis);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
const int64_t input_width = trans_shape[trans_shape.size() - 1];
DenseTensor tmp_out;
tmp_out.Resize(trans_out_shape);
T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
DenseTensor tmp_indices;
tmp_indices.Resize(trans_out_shape);
int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
funcs::GetMode<T, int64_t>(
input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind);
// transpose back
funcs::TransCompute<CPUContext, int64_t>(
ndims, dev_ctx, tmp_indices, indices, trans_axis);
funcs::TransCompute<CPUContext, T>(
ndims, dev_ctx, tmp_out, out, trans_axis);
if (!keepdim) {
out->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
......@@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
ALL_LAYOUT,
phi::SegmentPoolGradKernel,
float,
double) {}
double,
int,
int64_t) {}
......@@ -18,5 +18,11 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
PD_REGISTER_KERNEL(segment_pool,
CPU,
ALL_LAYOUT,
phi::SegmentPoolKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#endif
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_MKLML
#include <omp.h>
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace funcs {
static int ComputeBlockSize(int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
}
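// Splits a shape around the reduced axis, e.g. dim = [2, 3, 4] with axis = 1
// gives pre = 2, n = 3, post = 4, i.e. the tensor is viewed as pre x n x post.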
static inline void GetDims(
const phi::DDim& dim, int axis, int* pre, int* n, int* post) {
*pre = 1;
*post = 1;
*n = dim[axis];
for (int i = 0; i < axis; ++i) {
(*pre) *= dim[i];
}
for (int i = axis + 1; i < dim.size(); ++i) {
(*post) *= dim[i];
}
}
template <typename T, typename Type>
static void GetMode(Type input_height,
Type input_width,
int input_dim,
const DenseTensor* input,
T* t_out,
Type* t_indices) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
std::sort(col_vec.begin(),
col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
T mode = 0;
int64_t indice = 0;
int64_t cur_freq = 0;
int64_t max_freq = 0;
for (int64_t i = 0; i < input_width; ++i) {
++cur_freq;
if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
if (cur_freq > max_freq) {
max_freq = cur_freq;
mode = col_vec[i].first;
indice = col_vec[i].second;
}
cur_freq = 0;
}
}
t_out[i] = mode;
t_indices[i] = indice;
}
}
template <typename T, typename Type>
static void ModeAssign(const Type& input_height,
const Type& input_width,
const int& input_dim,
const DenseTensor* input,
const DenseTensor* indices,
T* output_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
auto e_indices = EigenVector<Type>::Flatten(*indices);
output_data[i * input_width + e_indices(0)] = e_input(0);
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
}
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
static void GetModebySort(const phi::GPUContext& dev_ctx,
const DenseTensor* input_tensor,
const int64_t num_cols,
const int64_t num_rows,
T* out_tensor,
int64_t* indices_tensor) {
DenseTensor input_tmp;
input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
T* input_tmp_data = dev_ctx.Alloc<T>(&input_tmp);
phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp);
thrust::device_ptr<T> out_tensor_ptr(out_tensor);
thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
for (int64_t i = 0; i < num_rows; ++i) {
T* begin = input_tmp_data + num_cols * i;
T* end = input_tmp_data + num_cols * (i + 1);
thrust::device_vector<int64_t> indices_data(num_cols);
thrust::sequence(
thrust::device, indices_data.begin(), indices_data.begin() + num_cols);
thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
int unique = 1 + thrust::inner_product(thrust::device,
begin,
end - 1,
begin + 1,
0,
thrust::plus<int>(),
thrust::not_equal_to<T>());
thrust::device_vector<T> keys_data(unique);
thrust::device_vector<int64_t> cnts_data(unique);
thrust::reduce_by_key(thrust::device,
begin,
end,
thrust::constant_iterator<int>(1),
keys_data.begin(),
cnts_data.begin());
auto it = thrust::max_element(
thrust::device, cnts_data.begin(), cnts_data.begin() + unique);
T mode = keys_data[it - cnts_data.begin()];
int64_t counts = cnts_data[it - cnts_data.begin()];
auto pos = thrust::find(thrust::device, begin, end, mode);
int64_t index = indices_data[pos - begin + counts - 1];
out_tensor_ptr[i] = static_cast<T>(mode);
indices_tensor_ptr[i] = static_cast<int64_t>(index);
}
}
#endif
} // namespace funcs
} // namespace phi
......@@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
int nthreads = batch_size * output_channels * output_height * output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
// paddle::platform::ChangeThreadNum(context, &thread_num);
// backends::gpu::ChangeThreadNum(context, &thread_num);
thread_num = 512;
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
......@@ -460,7 +460,7 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
int nthreads = batch_size * output_channels * output_height * output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
dim3 threads(thread_num, 1);
......@@ -527,7 +527,7 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
int nthreads = batch_size * output_channels * output_height * output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
dim3 threads(thread_num, 1);
......@@ -1293,7 +1293,7 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
dim3 threads(thread_num, 1);
......@@ -1369,7 +1369,7 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
dim3 threads(thread_num, 1);
......@@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor<phi::GPUContext, T1, T2> {
int nthreads = batch_size * output_channels * output_height * output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
......@@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor<phi::GPUContext, T1, T2> {
output_width;
int thread_num = 1024;
#ifdef WITH_NV_JETSON
paddle::platform::ChangeThreadNum(context, &thread_num);
backends::gpu::ChangeThreadNum(context, &thread_num);
#endif
int blocks = (nthreads + thread_num - 1) / thread_num;
......
......@@ -149,10 +149,19 @@ template class SegmentPoolFunctor<CPU, float, int>;
template class SegmentPoolFunctor<CPU, float, int64_t>;
template class SegmentPoolFunctor<CPU, double, int>;
template class SegmentPoolFunctor<CPU, double, int64_t>;
template class SegmentPoolFunctor<CPU, int, int>;
template class SegmentPoolFunctor<CPU, int, int64_t>;
template class SegmentPoolFunctor<CPU, int64_t, int>;
template class SegmentPoolFunctor<CPU, int64_t, int64_t>;
template class SegmentPoolGradFunctor<CPU, float, int>;
template class SegmentPoolGradFunctor<CPU, float, int64_t>;
template class SegmentPoolGradFunctor<CPU, double, int>;
template class SegmentPoolGradFunctor<CPU, double, int64_t>;
template class SegmentPoolGradFunctor<CPU, int, int>;
template class SegmentPoolGradFunctor<CPU, int, int64_t>;
template class SegmentPoolGradFunctor<CPU, int64_t, int>;
template class SegmentPoolGradFunctor<CPU, int64_t, int64_t>;
} // namespace funcs
} // namespace phi
......@@ -453,10 +453,19 @@ template class SegmentPoolFunctor<GPU, float, int>;
template class SegmentPoolFunctor<GPU, float, int64_t>;
template class SegmentPoolFunctor<GPU, double, int>;
template class SegmentPoolFunctor<GPU, double, int64_t>;
template class SegmentPoolFunctor<GPU, int, int>;
template class SegmentPoolFunctor<GPU, int, int64_t>;
template class SegmentPoolFunctor<GPU, int64_t, int>;
template class SegmentPoolFunctor<GPU, int64_t, int64_t>;
template class SegmentPoolGradFunctor<GPU, float, int>;
template class SegmentPoolGradFunctor<GPU, float, int64_t>;
template class SegmentPoolGradFunctor<GPU, double, int>;
template class SegmentPoolGradFunctor<GPU, double, int64_t>;
template class SegmentPoolGradFunctor<GPU, int, int>;
template class SegmentPoolGradFunctor<GPU, int, int64_t>;
template class SegmentPoolGradFunctor<GPU, int64_t, int>;
template class SegmentPoolGradFunctor<GPU, int64_t, int64_t>;
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/mode_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
template <typename T>
__global__ void AssignGradWithAxis(const T* grad_out,
const int64_t* indices,
T* grad_in,
int pre,
int post,
int raw_height,
int k) {
// raw_height is the length of topk axis
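// Each block walks the `pre` slices in a grid-stride loop: the corresponding
// slice of grad_in is zero-filled first, then the incoming gradients are
// scattered back to the positions recorded in `indices`.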
for (int i = blockIdx.x; i < pre; i += gridDim.x) {
int base_index = i * post * k;
int base_grad = i * post * raw_height;
for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
grad_in[base_grad + j] = static_cast<T>(0);
}
__syncthreads();
for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
int64_t idx_ij = indices[base_index + j];
int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
grad_in[in_ij] = grad_out[base_index + j];
}
}
}
template <typename T, typename Context>
void ModeGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& indices,
const DenseTensor& out_grad,
int axis,
bool keepdim,
DenseTensor* x_grad) {
const auto& in_dims = x.dims();
auto out_dims = indices.dims();
if (axis < 0) axis += in_dims.size();
// allocate the CUDA memory for the x_grad
T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
const T* out_grad_data = out_grad.data<T>();
const int64_t* indices_data = indices.data<int64_t>();
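// Factorize the input shape around axis: pre is the product of the leading
// dims, n the length of the reduced axis, post the product of the trailing dims.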
int pre, n, post;
funcs::GetDims(in_dims, axis, &pre, &n, &post);
// calculate the block and grid num
int block_size = funcs::ComputeBlockSize(post);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
}
} // namespace phi
PD_REGISTER_KERNEL(mode_grad,
GPU,
ALL_LAYOUT,
phi::ModeGradKernel,
float,
double,
int32_t,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/mode_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/mode.h"
namespace phi {
template <typename T, typename Context>
void ModeKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
bool keepdim,
DenseTensor* out,
DenseTensor* indices) {
// get the input dims
const auto& in_dims = x.dims();
// calculate the real axis
if (axis < 0) axis += in_dims.size();
auto out_dims = out->dims();
const T* input_data = x.data<T>();
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
funcs::GetModebySort<T>(
dev_ctx, &x, input_width, input_height, output_data, indices_data);
} else {
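// The mode axis is not the last one: move it to the end with a transpose,
// compute the mode along the last dimension, then transpose the results back.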
std::vector<int> trans_axis;
for (int i = 0; i < axis; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans_axis.emplace_back(i);
}
trans_axis.emplace_back(axis);
if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
tmp_out_shape.emplace_back(1);
for (int i = axis + 1; i < in_dims.size(); i++) {
tmp_out_shape.emplace_back(in_dims[i]);
}
DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
out->Resize(tmp_out_dim);
indices->Resize(tmp_out_dim);
}
DDim trans_shape(in_dims);
DDim trans_out_shape(in_dims);
for (int i = 0; i < trans_axis.size(); i++) {
trans_shape[i] = in_dims[trans_axis[i]];
trans_out_shape[i] = in_dims[trans_axis[i]];
}
trans_out_shape[in_dims.size() - 1] = 1;
// second step, transpose the input
DenseTensor trans_input;
trans_input.Resize(trans_shape);
dev_ctx.template Alloc<T>(&trans_input);
int ndims = trans_axis.size();
funcs::TransCompute<Context, T>(
ndims, dev_ctx, x, &trans_input, trans_axis);
DenseTensor trans_ind;
trans_ind.Resize(trans_out_shape);
int64_t* trans_ind_data = dev_ctx.template Alloc<int64_t>(&trans_ind);
DenseTensor trans_out;
trans_out.Resize(trans_out_shape);
T* trans_out_data = dev_ctx.template Alloc<T>(&trans_out);
const int64_t input_height =
phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
const int64_t input_width = trans_shape[trans_shape.size() - 1];
funcs::GetModebySort<T>(dev_ctx,
&trans_input,
input_width,
input_height,
trans_out_data,
trans_ind_data);
// last step, transpose the indices and output back
funcs::TransCompute<Context, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans_axis);
funcs::TransCompute<Context, T>(ndims, dev_ctx, trans_out, out, trans_axis);
if (!keepdim) {
out->Resize(out_dims);
indices->Resize(out_dims);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
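For reference, a minimal dynamic-graph sketch of the behaviour the mode kernels above implement, assuming the public paddle.mode API routes to them (the sample values are illustrative only):

import paddle

x = paddle.to_tensor([[2., 2., 3.],
                      [1., 5., 5.]])
# mode along the last axis: the most frequent value per row and one index of it
values, indices = paddle.mode(x, axis=-1, keepdim=False)
# values  -> [2., 5.]
# indices -> a position of the modal value within each row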
......@@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
ALL_LAYOUT,
phi::SegmentPoolGradKernel,
float,
double) {}
double,
int,
int64_t) {}
......@@ -19,5 +19,11 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
PD_REGISTER_KERNEL(segment_pool,
GPU,
ALL_LAYOUT,
phi::SegmentPoolKernel,
float,
double,
int,
int64_t) {}
......@@ -121,17 +121,10 @@ struct ReduceMaxFunctor {
};
template <typename Tx, typename Ty = Tx>
struct ExpSubFunctor {
HOSTDEVICE inline ExpSubFunctor() { y = static_cast<Tx>(0.0f); }
HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {}
struct ExpFunctor {
HOSTDEVICE inline Ty operator()(const Tx& x) const {
return static_cast<Ty>(std::exp(x - y));
return static_cast<Ty>(std::exp(x));
}
private:
Tx y;
};
template <typename Tx, typename Ty = Tx>
......@@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
}
// data src
AccT srcdata[kBatchSize][kLoopsV][kVSize];
T src_tmp[kBatchSize][kLoopsV][kVSize];
kps::Init<AccT, kStep>(&srcdata[0][0][0], kLowInf);
kps::Init<T, kStep>(&src_tmp[0][0][0], -std::numeric_limits<T>::infinity());
// src_data: the raw data from global memory
// sub_data: store the data obtained by (src_data - max), used by log_softmax
// exp_data: store the data obtained by (exp(sub_data)), used by softmax
T src_data[kBatchSize][kLoopsV][kVSize];
AccT sub_data[kBatchSize][kLoopsV][kVSize];
AccT exp_data[kBatchSize][kLoopsV][kVSize];
kps::Init<AccT, kStep>(&sub_data[0][0][0], kLowInf);
kps::Init<T, kStep>(&src_data[0][0][0], -std::numeric_limits<T>::infinity());
// data dst
T out_tmp[kBatchSize][kLoopsV][kVSize];
......@@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax,
for (int i = 0; i < kBatchSize; ++i) {
const VecT* src_v =
reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
VecT* reg_v = reinterpret_cast<VecT*>(&src_tmp[i][0][0]);
VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
&reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
&srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor<T, AccT>());
&sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
}
// compute max
......@@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax,
1,
ReduceMaxFunctor<AccT>,
kMode::kLocalMode>(
&max[0], &srcdata[0][0][0], ReduceMaxFunctor<AccT>(), true);
&max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
WarpReduceMax<AccT, kBatchSize, kWarpSize>(max);
// compute sum
#pragma unroll
for (int i = 0; i < kBatchSize; ++i) {
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpSubFunctor<AccT>>(
&srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor<AccT>(max[i]));
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
&sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
&exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
}
kps::Reduce<AccT,
kVItem,
......@@ -342,7 +341,7 @@ __global__ void WarpSoftmaxForward(T* softmax,
1,
kps::AddFunctor<AccT>,
kMode::kLocalMode>(
&sum[0], &srcdata[0][0][0], kps::AddFunctor<AccT>(), true);
&sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
// write data to global memory
......@@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax,
reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
if (LogMode) {
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnaryLogFunctor<AccT>>(
&srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor<AccT>());
kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
&out_tmp[i][0][0],
&srcdata[i][0][0],
&sub_data[i][0][0],
UnarySubFunctor<AccT>(std::log(sum[i])));
} else {
kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
&out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
&out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
}
kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
&softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ModeGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& indices,
const DenseTensor& out_grad,
int axis,
bool keepdim,
DenseTensor* x_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ModeKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
bool keepdim,
DenseTensor* out,
DenseTensor* indices);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature(
"mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"});
}
KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("mode_grad",
{"X", "Indices", GradVarName("Out")},
{"axis", "keepdim"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping);
......@@ -44,6 +44,8 @@ function update_pd_ops() {
cd ${PADDLE_ROOT}/tools/infrt/
python3 generate_pd_op_dialect_from_paddle_op_maker.py
python3 generate_phi_kernel_dialect.py
# generate test model
python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs
}
function init() {
......
......@@ -52,6 +52,30 @@ def parse_args():
'--debug',
action='store_true',
help='If used, the graph of Quant model is drawn.')
parser.add_argument(
'--quant_model_filename',
type=str,
default="",
help='The file name of the input model. If empty, search for the default `__model__` file with separate parameter files and use them; if not found, attempt to load the `model` and `params` files.'
)
parser.add_argument(
'--quant_params_filename',
type=str,
default="",
help='The file name of all parameters of the input model. If quant_model_filename is empty, this field is ignored. If empty, parameters are loaded from separate files.'
)
parser.add_argument(
'--save_model_filename',
type=str,
default="__model__",
help='The name of the file used to save the inference program itself. If set to None, the default file name __model__ is used.'
)
parser.add_argument(
'--save_params_filename',
type=str,
default=None,
help='The name of the file used to save all related parameters. If set to None, parameters are saved in separate files.'
)
test_args, args = parser.parse_known_args(namespace=unittest)
return test_args, sys.argv[:1] + args
......@@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path,
save_path,
ops_to_quantize='',
op_ids_to_skip='',
debug=False):
debug=False,
quant_model_filename='',
quant_params_filename='',
save_model_filename='',
save_params_filename=''):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.executor.global_scope()
with fluid.scope_guard(inference_scope):
if os.path.exists(os.path.join(original_path, '__model__')):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(original_path, exe)
if not quant_model_filename:
if os.path.exists(os.path.join(original_path, '__model__')):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(original_path,
exe)
else:
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(
original_path, exe, 'model', 'params')
else:
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(original_path, exe,
'model', 'params')
fetch_targets] = fluid.io.load_inference_model(
original_path, exe, quant_model_filename,
quant_params_filename)
ops_to_quantize_set = set()
print(ops_to_quantize)
......@@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path,
graph = transform_to_mkldnn_int8_pass.apply(graph)
inference_program = graph.to_program()
with fluid.scope_guard(inference_scope):
fluid.io.save_inference_model(save_path, feed_target_names,
fetch_targets, exe, inference_program)
fluid.io.save_inference_model(
save_path,
feed_target_names,
fetch_targets,
exe,
inference_program,
model_filename=save_model_filename,
params_filename=save_params_filename)
print(
"Success! INT8 model obtained from the Quant model can be found at {}\n"
.format(save_path))
......@@ -109,4 +150,6 @@ if __name__ == '__main__':
test_args, remaining_args = parse_args()
transform_and_save_int8_model(
test_args.quant_model_path, test_args.int8_model_save_path,
test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug)
test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug,
test_args.quant_model_filename, test_args.quant_params_filename,
test_args.save_model_filename, test_args.save_params_filename)
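A minimal sketch of calling the updated helper directly with the new file-name arguments; the module name, paths and op list below are hypothetical example values:

# assumed module name for the script above
from save_quant_model import transform_and_save_int8_model

transform_and_save_int8_model(
    original_path='quant_model_dir',            # hypothetical input directory
    save_path='int8_model_dir',                 # hypothetical output directory
    ops_to_quantize='conv2d,fc',                # example op list
    quant_model_filename='model.pdmodel',       # hypothetical file names
    quant_params_filename='model.pdiparams',
    save_model_filename='__model__',
    save_params_filename=None)                  # None -> separate parameter files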
......@@ -35,6 +35,12 @@ final_state_name_mapping = {
"x": "X",
"out": "Out",
},
"pool2d": {
"final_op_name": "final_state_pool2d",
"x": "X",
"kernel_size": "ksize",
"out": "Out",
},
"abs": {
"final_op_name": "final_state_abs",
"x": "X",
......
......@@ -17,7 +17,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
......
......@@ -23,7 +23,7 @@ import paddle.fluid.core as core
from paddle.fluid import Program, compiler, program_guard
from paddle.fluid.op import Operator
from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
class ElementwiseMulOp(OpTest):
......
......@@ -849,6 +849,38 @@ def ref_softsign(x):
return out
class XPUTestSoftshrinkOP(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'softshrink'
self.use_dynamic_create_class = False
class XPUTestSoftshrink(TestActivationOPBase):
def set_case(self):
self.op_type = "softshrink"
self.dtype = self.in_type
threshold = 0.5
np.random.seed(1023)
x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
out = ref_softshrink(x, threshold)
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {'use_xpu': True}
support_types = get_xpu_op_support_types('softshrink')
for stype in support_types:
create_test_class(globals(), XPUTestSoftshrinkOP, stype)
def ref_softshrink(x, threshold=0.5):
out = np.copy(x)
out = (out < -threshold) * (out + threshold) + (out > threshold) * (
out - threshold)
return out
class XPUTestSwishOP(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'swish'
......@@ -879,5 +911,36 @@ def ref_swish(x):
return out
class XPUTestThresholdedReluOP(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'thresholded_relu'
self.use_dynamic_create_class = False
class XPUTestThresholdedRelu(TestActivationOPBase):
def set_case(self):
self.op_type = "thresholded_relu"
self.dtype = self.in_type
threshold = 1.0
np.random.seed(1024)
x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype)
x[np.abs(x) < 0.005] = 0.02
out = ref_thresholded_relu(x, threshold)
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {'use_xpu': True}
support_types = get_xpu_op_support_types('thresholded_relu')
for stype in support_types:
create_test_class(globals(), XPUTestThresholdedReluOP, stype)
def ref_thresholded_relu(x, threshold=1.0):
out = (x > threshold) * x
return out
if __name__ == "__main__":
unittest.main()
......@@ -30,7 +30,7 @@ def segment_sum(data, segment_ids, name=None):
where sum is over j such that `segment_ids[j] == i`.
Args:
data (Tensor): A tensor, available data type float32, float64.
data (Tensor): A tensor, available data type float32, float64, int32, int64.
segment_ids (Tensor): A 1-D tensor, which has the same size
as the first dimension of the input data.
Available data types are int32, int64.
......@@ -57,7 +57,8 @@ def segment_sum(data, segment_ids, name=None):
out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM")
return out
check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
"int64"), "segment_pool")
check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
"segment_pool")
......@@ -85,7 +86,7 @@ def segment_mean(data, segment_ids, name=None):
of all index 'segment_ids[j] == i'.
Args:
data (tensor): a tensor, available data type float32, float64.
data (tensor): a tensor, available data type float32, float64, int32, int64.
segment_ids (tensor): a 1-d tensor, which has the same size
as the first dimension of the input data.
available data types are int32, int64.
......@@ -113,7 +114,8 @@ def segment_mean(data, segment_ids, name=None):
out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN")
return out
check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
"int64"), "segment_pool")
check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
"segment_pool")
......@@ -140,7 +142,7 @@ def segment_min(data, segment_ids, name=None):
where min is over j such that `segment_ids[j] == i`.
Args:
data (tensor): a tensor, available data type float32, float64.
data (tensor): a tensor, available data type float32, float64, int32, int64.
segment_ids (tensor): a 1-d tensor, which has the same size
as the first dimension of the input data.
available data types are int32, int64.
......@@ -167,7 +169,8 @@ def segment_min(data, segment_ids, name=None):
out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN")
return out
check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
"int64"), "segment_pool")
check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
"segment_pool")
......@@ -194,7 +197,7 @@ def segment_max(data, segment_ids, name=None):
where max is over j such that `segment_ids[j] == i`.
Args:
data (tensor): a tensor, available data type float32, float64.
data (tensor): a tensor, available data type float32, float64, int32, int64.
segment_ids (tensor): a 1-d tensor, which has the same size
as the first dimension of the input data.
available data types are int32, int64.
......@@ -221,7 +224,8 @@ def segment_max(data, segment_ids, name=None):
out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX")
return out
check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
"int64"), "segment_pool")
check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
"segment_pool")
......
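A minimal usage sketch for the newly supported integer dtypes, assuming the segment ops are exposed under paddle.incubate:

import paddle

# int32/int64 data is now accepted in addition to float32/float64
data = paddle.to_tensor([[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype='int32')
segment_ids = paddle.to_tensor([0, 0, 1], dtype='int32')
out = paddle.incubate.segment_sum(data, segment_ids)
# out -> [[4, 4, 4],
#         [4, 5, 6]]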
......@@ -141,6 +141,14 @@
output : Tensor
invoke : full_like(x, 1, dtype, place)
- api : pool2d
args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
output : Tensor(out)
infer_meta :
func : PoolInferMeta
kernel:
func : pool2d
- api : reshape
args : (Tensor x, ScalarArray shape)
output : Tensor(out)
......
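A minimal dynamic-graph sketch that exercises the pool2d kernel declared above, assuming the standard paddle.nn.functional pooling API dispatches to it:

import paddle
import paddle.nn.functional as F

x = paddle.rand([1, 3, 32, 32])                # NCHW input
y = F.avg_pool2d(x, kernel_size=2, stride=2)   # lowers to the pool2d kernel
print(y.shape)                                 # [1, 3, 16, 16]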